[llvm] Remove an incorrect assert in MFMASmallGemmSingleWaveOpt. (PR #130131)

via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 13 07:27:01 PDT 2025


https://github.com/anjenner updated https://github.com/llvm/llvm-project/pull/130131

>From 87f1d606c02843017a06e47b57b7256665b37284 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 6 Mar 2025 10:47:08 -0500
Subject: [PATCH 1/2] Remove an incorrect assert in MFMASmallGemmSingleWaveOpt.

This assert was failing in a fuzzing test. I consulted with @jrbyrnes who said:

The MFMASmallGemmSingleWaveOpt::apply() method is invoked if and only if the
user has inserted an intrinsic llvm.amdgcn.iglp.opt(i32 1) into their source
code. This intrinsic applies a highly specialized DAG mutation to result in
specific scheduling for a specific set of kernels. These assertions are really
just confirming that the characteristics of the kernel match what is expected
(i.e. The kernels are similar to the ones this DAG mutation strategy was
designed against).

However, if we apply this DAG mutation to kernels for which it was not
designed, then we may not find the types of instructions we are looking for,
and may end up with empty caches.

I think it should be fine to just return false if the cache is empty instead
of the assert.
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index bbd262748d680..a284cc0e5af51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1891,7 +1891,6 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
         }
       }
 
-      assert(Cache->size());
       auto *DAG = SyncPipe[0].DAG;
       for (auto &Elt : *Cache) {
         if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))

>From 0a47cb57245f726a4e86a6789edcf60328f24cc9 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 13 Mar 2025 10:32:51 -0400
Subject: [PATCH 2/2] Address review feedback.

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |  2 -
 ...llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll | 93 +++++++++++++++++++
 2 files changed, 93 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index a284cc0e5af51..03ab0a8f7b28d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -1927,8 +1927,6 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
         return FitsInGroup;
       }
 
-      assert(Cache->size());
-
       // Does the VALU have a DS_WRITE successor that is the same as other
       // VALU already in the group. The V_PERMs will all share 1 DS_W succ
       return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
new file mode 100644
index 0000000000000..bf0cef3ee28e8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O1 -global-isel=true < %s
+
+; Function Attrs: nounwind
+define amdgpu_kernel void @test_iglp_opt() #0 {
+entry:
+  call void @llvm.amdgcn.iglp.opt(i32 0) #4
+  ret void
+}
+
+; Function Attrs: nounwind
+define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+entry:
+  call void @llvm.amdgcn.iglp.opt(i32 0)
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr, align 128
+  %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
+  %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr, align 128
+  %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
+  %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr, align 128
+  %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
+  %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr, align 128
+  %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
+  %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr, align 128
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.0, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.2, i32 0, i32 0, i32 0)
+  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.3, i32 0, i32 0, i32 0)
+  %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.4, i32 0, i32 0, i32 0)
+  %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+  store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr, align 128
+  %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
+  store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr, align 128
+  %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
+  store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr, align 128
+  %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
+  store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr, align 128
+  %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
+  store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr, align 128
+  ret void
+}
+
+; Function Attrs: nounwind
+define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+entry:
+  call void @llvm.amdgcn.iglp.opt(i32 1)
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr, align 128
+  %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
+  %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr, align 128
+  %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
+  %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr, align 128
+  %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
+  %L = load <1 x i64>, ptr addrspace(3) %load.3.addr, align 8
+  %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr, align 128
+  %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
+  %L1 = load <1 x i64>, ptr addrspace(3) %load.4.addr, align 8
+  %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr, align 128
+  %B = urem <1 x i64> %L, %L1
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.0, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.2, i32 0, i32 0, i32 0)
+  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.3, i32 0, i32 0, i32 0)
+  %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.4, i32 0, i32 0, i32 0)
+  %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
+  store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr, align 128
+  %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
+  store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr, align 128
+  %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
+  store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr, align 128
+  %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
+  store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr, align 128
+  %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
+  store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr, align 128
+  store <1 x i64> %B, ptr addrspace(3) %store.3.addr, align 8
+  ret void
+}
+
+; Function Attrs: convergent nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.iglp.opt(i32 immarg) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #3
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { convergent nocallback nofree nounwind willreturn }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #3 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { convergent nounwind }



More information about the llvm-commits mailing list