[llvm] [AMDGPU][CodeGen][True16] Track waitcnt of vgpr32 instead of vgpr16 for 16bit reg in GFX11 (PR #157795)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 12 09:02:41 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/157795
>From 962bd62d200f222a6e446c19728255525458c426 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 9 Sep 2025 23:26:18 -0400
Subject: [PATCH] waitcnt patch
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 16 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 4 +
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 +
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 278 ++----------------
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 40 +--
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 150 ++--------
.../branch-relaxation-inst-size-gfx11.ll | 4 +-
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 1 +
llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 2 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 12 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 3 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 3 +-
.../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 4 +-
llvm/test/CodeGen/AMDGPU/spillv16.ll | 2 +-
15 files changed, 97 insertions(+), 433 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a366db1c580ba..590d2da1cb5ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
+ "Enable16bitD16HWBug",
+ "true",
+ "D16 for 16 bit data type interfere the other half in true16 mode"
+>;
+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
@@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureMemoryAtomicFAddF32DenormalSupport,
- FeatureRealTrue16Insts]>;
+ FeatureRealTrue16Insts,
+ Feature16bitD16HWBug,
+]>;
// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
@@ -2570,6 +2578,12 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
+def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->has16bitD16HWBug()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;
+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..521cd208f5326 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
+bool AMDGPUSubtarget::has16bitD16HWBug() const {
+ return hasTrue16BitInsts() && useRealTrue16Insts() && Enable16bitD16HWBug;
+}
+
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..e5203486436e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool Enable16bitD16HWBug = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool has16bitD16HWBug() const;
+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b163a274396ff..db977cacbaebd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -845,6 +845,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
assert(Size % 16 == 0);
Result.second = Result.first + (Size / 16);
+
+ if (Size == 16 && Context->ST->has16bitD16HWBug()) {
+ // Also cover the other half, since lo16/hi16 interfere with each other.
+ if (AMDGPU::isHi16Reg(MCReg, *TRI))
+ Result.first -= 1;
+ else
+ Result.second += 1;
+ }
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
// sources like SRC_PRIVATE_BASE.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 46b82d3a3d651..1ce7179774349 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
@@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
@@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
@@ -147648,7 +147528,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_4
; GFX11-TRUE16-NEXT: .LBB88_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
@@ -147667,7 +147546,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
@@ -147988,10 +147866,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -148008,10 +147884,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
@@ -148019,7 +147893,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
@@ -148031,10 +147904,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -148051,10 +147922,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
@@ -148068,17 +147937,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -148096,10 +147962,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
@@ -173957,6 +173821,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
@@ -173988,7 +173853,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
@@ -174005,69 +173869,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
@@ -174081,7 +173913,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4
; GFX11-TRUE16-NEXT: .LBB92_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
@@ -174100,7 +173931,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
@@ -174421,10 +174251,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -174441,10 +174269,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
@@ -174452,7 +174278,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
@@ -174464,10 +174289,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -174484,10 +174307,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
@@ -174501,17 +174322,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -174529,10 +174347,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
@@ -196529,6 +196345,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
@@ -196560,7 +196377,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
@@ -196577,69 +196393,37 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
@@ -196653,7 +196437,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4
; GFX11-TRUE16-NEXT: .LBB96_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
@@ -196672,7 +196455,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
@@ -196993,10 +196775,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -197013,10 +196793,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
@@ -197024,7 +196802,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
@@ -197036,10 +196813,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -197056,10 +196831,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
@@ -197073,17 +196846,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
@@ -197101,10 +196871,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 436b1a038b274..2abb2f3b9de52 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -5033,6 +5033,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -5059,15 +5060,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -11993,6 +11989,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
@@ -12019,15 +12016,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -18559,6 +18551,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -18596,13 +18589,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -18701,10 +18690,9 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -24640,6 +24628,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
@@ -24677,13 +24666,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
@@ -24782,10 +24767,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
@@ -28760,6 +28744,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -28792,15 +28777,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -32871,6 +32851,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
@@ -32903,15 +32884,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index ede44e738fe00..352b2cb7123b1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -12492,6 +12492,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
@@ -12523,39 +12524,22 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -27377,6 +27361,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
@@ -27408,39 +27393,22 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -41534,6 +41502,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
@@ -41565,39 +41534,22 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -54837,6 +54789,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
@@ -54868,39 +54821,22 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -68501,6 +68437,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
@@ -68533,37 +68470,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
@@ -68710,6 +68634,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
@@ -68717,7 +68642,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
@@ -68732,11 +68656,10 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -68756,7 +68679,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
@@ -80726,6 +80648,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
@@ -80758,37 +80681,24 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
@@ -80935,6 +80845,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
@@ -80942,7 +80853,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
@@ -80957,11 +80867,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -80981,7 +80890,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
@@ -91233,6 +91141,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
@@ -91265,37 +91174,24 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
@@ -91442,6 +91338,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
@@ -91449,7 +91346,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
@@ -91464,11 +91360,10 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
@@ -91488,7 +91383,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
index dd389375b0d77..6bebc8f5d0d18 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
@@ -23,9 +23,9 @@ define amdgpu_kernel void @long_forward_branch_gfx11plus(ptr addrspace(1) %in, p
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_d16_b16 v0, v1, s[0:1]
; GFX11-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] offset:2
; GFX11-NEXT: .LBB0_2: ; %bb3
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index d374ed072cdc6..d9ac2d80af920 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1109,6 +1109,7 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b
; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2
; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index ccdc0b1bf43c4..a84872d8eac0f 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -1561,8 +1561,8 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 305461ed6b208..049663a1e1bb4 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1685,19 +1685,18 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
-; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
@@ -1977,13 +1976,12 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
-; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
@@ -2726,10 +2724,10 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
-; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 25020673bce22..0a1d15bf945f9 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -376,9 +376,8 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, v2.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.h, v0.l
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 74ac181c120b5..448585afd2405 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -333,9 +333,8 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v2.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 91c88ec5e718c..b538d6066d551 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1528,8 +1528,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
@@ -1559,8 +1559,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 3d21860e2af40..0e45df223465d 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -61,8 +61,8 @@ define void @spill_i16_alu_two_vals() {
; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
More information about the llvm-commits
mailing list