[llvm] [AMDGPU][CodeGen][True16] Track waitcnt of vgpr32 instead of vgpr16 for 16bit reg in GFX11 (PR #157795)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 12 09:00:18 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

<details>
<summary>Changes</summary>

It seems the VMEM access on hi/lo half could interfere the other half. Track waitcnt of vgpr32 instead of vgpr16 for 16bit reg in GFX11.

---

Patch is 90.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157795.diff


15 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+16-1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (+4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (+3) 
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+8) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+23-255) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+8-32) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+22-128) 
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+5-7) 
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/spillv16.ll (+1-1) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a366db1c580ba..74fd7d543d42c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
   "Use true 16-bit registers"
 >;
 
+def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
+  "Enable16bitD16HWBug",
+  "true",
+  "D16 for 16 bit data type interfere the other half in true16 mode"
+>;
+
 def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
   "HasBF16TransInsts",
   "true",
@@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeaturePackedTID,
    FeatureVcmpxPermlaneHazard,
    FeatureMemoryAtomicFAddF32DenormalSupport,
-   FeatureRealTrue16Insts]>;
+   FeatureRealTrue16Insts,
+   Feature16bitD16HWBug,
+]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -2570,6 +2578,13 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
   // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
   // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
 
+// Do not use D16 inst for 16bit data type
+def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
+  AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
+def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
+                                               "!Subtarget->has16bitD16HWBug()">,
+  AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;
+
 def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
   AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..521cd208f5326 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
   return hasTrue16BitInsts() && EnableRealTrue16Insts;
 }
 
+bool AMDGPUSubtarget::has16bitD16HWBug() const {
+  return hasTrue16BitInsts() && useRealTrue16Insts() && Enable16bitD16HWBug;
+}
+
 // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
 // allows the given function to achieve an occupancy of NWaves waves per
 // SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..e5203486436e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
   bool HasCvtPkF16F32Inst = false;
   bool HasF32ToF16BF16ConversionSRInsts = false;
   bool EnableRealTrue16Insts = false;
+  bool Enable16bitD16HWBug = false;
   bool HasBF16TransInsts = false;
   bool HasBF16ConversionInsts = false;
   bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
   // supported and the support for fake True16 instructions is removed.
   bool useRealTrue16Insts() const;
 
+  bool has16bitD16HWBug() const;
+
   bool hasBF16TransInsts() const { return HasBF16TransInsts; }
 
   bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b163a274396ff..db977cacbaebd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -845,6 +845,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
     assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
     assert(Size % 16 == 0);
     Result.second = Result.first + (Size / 16);
+
+    if (Size == 16 && Context->ST->has16bitD16HWBug()) {
+      // also update the other half since lo16/hi16 interfere with each other
+      if (AMDGPU::isHi16Reg(MCReg, *TRI))
+        Result.first -= 1;
+      else
+        Result.second += 1;
+    }
   } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
     // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
     // sources like SRC_PRIVATE_BASE.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 46b82d3a3d651..1ce7179774349 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
@@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
@@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
@@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
@@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
@@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
@@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/157795


More information about the llvm-commits mailing list