[llvm] a3d05e8 - Remove an incorrect assert in MFMASmallGemmSingleWaveOpt. (#130131)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 24 01:22:29 PDT 2025
Author: anjenner
Date: 2025-04-24T09:22:24+01:00
New Revision: a3d05e89873654dd1b27979b2bfd82ddd4859ba7
URL: https://github.com/llvm/llvm-project/commit/a3d05e89873654dd1b27979b2bfd82ddd4859ba7
DIFF: https://github.com/llvm/llvm-project/commit/a3d05e89873654dd1b27979b2bfd82ddd4859ba7.diff
LOG: Remove an incorrect assert in MFMASmallGemmSingleWaveOpt. (#130131)
This assert was failing in a fuzzing test. I consulted with @jrbyrnes
who said:
The MFMASmallGemmSingleWaveOpt::apply() method is invoked if and only if
the user has inserted an intrinsic llvm.amdgcn.iglp.opt(i32 1) into
their source code. This intrinsic applies a highly specialized DAG
mutation to result in specific scheduling for a specific set of kernels.
These assertions are really just confirming that the characteristics of
the kernel match what is expected (i.e. the kernels are similar to the
ones this DAG mutation strategy was designed against).
However, if we apply this DAG mutation to kernels for which it was not
designed, then we may not find the types of instructions we are looking
for, and may end up with empty caches.
I think it should be fine to just return false if the cache is empty
instead of the assert.
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 87c1d2586cce5..4ff49c1e46dfe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1885,7 +1885,6 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
}
- assert(Cache->size());
auto *DAG = SyncPipe[0].DAG;
for (auto &Elt : *Cache) {
if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
@@ -1922,8 +1921,6 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
return FitsInGroup;
}
- assert(Cache->size());
-
// Does the VALU have a DS_WRITE successor that is the same as other
// VALU already in the group. The V_PERMs will all share 1 DS_W succ
return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
new file mode 100644
index 0000000000000..a319f1260d870
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -O1 < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
+; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112
+; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96
+; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80
+; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64
+; GCN-NEXT: ds_read_b128 v[0:3], v32
+; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16
+; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32
+; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48
+; GCN-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NEXT: v_mov_b32_e32 v35, v34
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN-NEXT: ; iglp_opt mask(0x00000001)
+; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112
+; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96
+; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80
+; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64
+; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48
+; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32
+; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16
+; GCN-NEXT: ds_write_b128 v32, v[0:3]
+; GCN-NEXT: ds_write_b64 v32, v[34:35]
+; GCN-NEXT: s_endpgm
+entry:
+ call void @llvm.amdgcn.iglp.opt(i32 1)
+ %load.4 = load <32 x float>, ptr addrspace(3) null, align 128
+ %B = urem <1 x i64> zeroinitializer, %L1
+ store <32 x float> %load.4, ptr addrspace(3) null, align 128
+ store <1 x i64> %B, ptr addrspace(3) null, align 8
+ ret void
+}
+
+declare void @llvm.amdgcn.iglp.opt(i32 immarg) #0
+
+attributes #0 = { convergent nocallback nofree nounwind willreturn }
More information about the llvm-commits
mailing list