[llvm] [AMDGPU][CodeGen][True16] Track waitcnt of vgpr32 instead of vgpr16 for 16bit reg in GFX11 (PR #157795)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 12 09:00:18 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
It seems that a VMEM access to the hi or lo half of a VGPR can interfere with the other half. Track waitcnt on the full vgpr32 instead of the vgpr16 for 16-bit registers on GFX11.
---
Patch is 90.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157795.diff
15 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+16-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+8)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+23-255)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+8-32)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+22-128)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+5-7)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/spillv16.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a366db1c580ba..74fd7d543d42c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
+ "Enable16bitD16HWBug",
+ "true",
+ "D16 for 16 bit data type interfere the other half in true16 mode"
+>;
+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
@@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureMemoryAtomicFAddF32DenormalSupport,
- FeatureRealTrue16Insts]>;
+ FeatureRealTrue16Insts,
+ Feature16bitD16HWBug,
+]>;
// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
@@ -2570,6 +2578,13 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+// Do not use D16 inst for 16bit data type
+def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
+def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->has16bitD16HWBug()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;
+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..521cd208f5326 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
+bool AMDGPUSubtarget::has16bitD16HWBug() const {
+ return hasTrue16BitInsts() && useRealTrue16Insts() && Enable16bitD16HWBug;
+}
+
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..e5203486436e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool Enable16bitD16HWBug = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool has16bitD16HWBug() const;
+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b163a274396ff..db977cacbaebd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -845,6 +845,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
assert(Size % 16 == 0);
Result.second = Result.first + (Size / 16);
+
+ if (Size == 16 && Context->ST->has16bitD16HWBug()) {
+ // also update the other half since lo16/hi16 interfere with each other
+ if (AMDGPU::isHi16Reg(MCReg, *TRI))
+ Result.first -= 1;
+ else
+ Result.second += 1;
+ }
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
// sources like SRC_PRIVATE_BASE.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 46b82d3a3d651..1ce7179774349 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
@@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
@@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/157795
More information about the llvm-commits
mailing list