[llvm] [AMDGPU] Treat image_msaa_load as a sampler operation (PR #141726)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 28 01:08:57 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Carl Ritson (perlfu)
<details>
<summary>Changes</summary>
While image_msaa_load does not take a sampler, it can behave as if it does on some hardware. This has implications for wait counting and clausing.
---
Full diff: https://github.com/llvm/llvm-project/pull/141726.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll (-4)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir (+22-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 67c4cac7b4c88..eb0977b92d5ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -116,7 +116,7 @@ class SIInsertHardClauses {
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
if (BaseInfo->BVH)
return HARDCLAUSE_BVH;
- if (BaseInfo->Sampler)
+ if (BaseInfo->Sampler || BaseInfo->MSAA)
return HARDCLAUSE_MIMG_SAMPLE;
return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_MIMG_ATOMIC
: HARDCLAUSE_MIMG_LOAD
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 99c00e149c140..abaeb24ac0dd5 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -189,9 +189,10 @@ VmemType getVmemType(const MachineInstr &Inst) {
// We have to make an additional check for isVSAMPLE here since some
// instructions don't have a sampler, but are still classified as sampler
// instructions for the purposes of e.g. waitcnt.
- return BaseInfo->BVH ? VMEM_BVH
- : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
- : VMEM_NOSAMPLER;
+ bool HasSampler = BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst);
+ return BaseInfo->BVH ? VMEM_BVH
+ : HasSampler ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
}
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
index 947c838740d43..ca6bccda124c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
@@ -90,7 +90,6 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: ; return to shader part epilog
@@ -100,7 +99,6 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: ; return to shader part epilog
@@ -166,7 +164,6 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: ; return to shader part epilog
@@ -176,7 +173,6 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
index 4bd35138815ed..eea99e7203537 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-out-order.mir
@@ -57,9 +57,9 @@ body: |
; GFX11: S_WAITCNT 1015
; GFX1150-NEXT: S_WAITCNT 1015
; GFX12-NEXT: S_WAIT_SAMPLECNT 0
- ; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GCN-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
$vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
- renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
S_ENDPGM 0
...
---
@@ -69,12 +69,12 @@ machineFunctionInfo:
body: |
bb.0:
; GCN-LABEL: name: waitcnt-load-sample
- ; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GCN: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX11: S_WAITCNT 1015
; GFX1150-NEXT: S_WAITCNT 1015
; GFX12-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
- renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ renamable $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
$vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
S_ENDPGM 0
...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
index 8eb4be266dd3b..64d553cb24e00 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-sample-waw.mir
@@ -13,7 +13,6 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: S_WAITCNT 0
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V1_gfx11 killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
- ; GFX11-NEXT: S_WAITCNT 1015
; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX11-NEXT: S_WAITCNT 1015
; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
@@ -22,3 +21,25 @@ body: |
SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
...
+
+---
+name: atomic_msaa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ ; GFX11-LABEL: name: atomic_msaa
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: S_WAITCNT 0
+ ; GFX11-NEXT: dead $vgpr0 = IMAGE_ATOMIC_ADD_V1_V2_gfx11 killed $vgpr0, renamable $vgpr4_vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 6, -1, 1, 0, -1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX11-NEXT: S_WAITCNT 1015
+ ; GFX11-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ ; GFX11-NEXT: S_WAITCNT 1015
+ ; GFX11-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+ dead $vgpr0 = IMAGE_ATOMIC_ADD_V1_V2_gfx11 killed $vgpr0, renamable $vgpr4_vgpr5, renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 6, -1, 1, 0, -1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_MSAA_LOAD_V4_V2_gfx11 killed renamable $vgpr4_vgpr5, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 4, 7, -1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
+ SI_RETURN_TO_EPILOG killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3
+
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/141726
More information about the llvm-commits
mailing list