[llvm] [AMDGPU] Fix image_msaa_load waitcnt insertion for pre-gfx12 (PR #90710)
David Stuttard via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 02:20:19 PDT 2024
https://github.com/dstutt updated https://github.com/llvm/llvm-project/pull/90710
>From a05142592b0e9236310d5905def4ec6325d9b0d4 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Wed, 1 May 2024 10:17:21 +0100
Subject: [PATCH 1/2] [AMDGPU] Pre-commit test
---
.../CodeGen/AMDGPU/waitcnt-sample-waw.mir | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
new file mode 100644
index 00000000000000..90ba9802dc8bed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
@@ -0,0 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name: sample_load_msaa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ ; GFX11-LABEL: name: sample_load_msaa
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: S_WAITCNT 0
+ ; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GFX11-NEXT: S_WAITCNT 1015
+ ; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+
+...
>From 480d5653edfa9bc28a317a29883d5a80c790dd24 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard at amd.com>
Date: Tue, 30 Apr 2024 14:52:00 +0100
Subject: [PATCH 2/2] [AMDGPU] Fix image_msaa_load waitcnt insertion for
pre-gfx12
https://github.com/llvm/llvm-project/pull/90201 made some fixes for gfx12
image_msaa_load waitcnt insertion.
That fix might break in some situations for pre-gfx12 - this fixes that by
explicitly checking for VSAMPLE, which always requires an s_wait_samplecnt, and
leaves the previous logic intact for non-gfx12.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 12 ++++++------
llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir | 1 +
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 15a1db51c6d78b..ebb5a1af7f2d17 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -187,12 +187,12 @@ VmemType getVmemType(const MachineInstr &Inst) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
- // The test for MSAA here is because gfx12+ image_msaa_load is actually
- // encoded as VSAMPLE and requires the appropriate s_waitcnt variant for that.
- // Pre-gfx12 doesn't care since all vmem types result in the same s_waitcnt.
- return BaseInfo->BVH ? VMEM_BVH
- : BaseInfo->Sampler || BaseInfo->MSAA ? VMEM_SAMPLER
- : VMEM_NOSAMPLER;
+ // We have to make an additional check for isVSAMPLE here since some
+ // instructions don't have a sampler, but are still classified as sampler
+ // instructions for the purposes of e.g. waitcnt.
+ return BaseInfo->BVH ? VMEM_BVH
+ : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
}
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
index 90ba9802dc8bed..8eb4be266dd3bd 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
@@ -13,6 +13,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: S_WAITCNT 0
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GFX11-NEXT: S_WAITCNT 1015
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX11-NEXT: S_WAITCNT 1015
; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
More information about the llvm-commits
mailing list