[llvm] [AMDGPU] Add no return image_sample intrinsics and instructions (PR #97542)

Wed Jul 3 06:51:05 PDT 2024

================
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -global-isel=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX10,GFX10-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX10,GFX10-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+
+define amdgpu_ps void @sample_1d_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+; GFX10PLUS-LABEL: sample_1d_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_1d_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.1d.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_2d_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+; GFX10PLUS-LABEL: sample_2d_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_2d_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.2d.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_3d_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) {
+; GFX10PLUS-LABEL: sample_3d_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_3d_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.3d.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_cube_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
+; GFX10PLUS-LABEL: sample_cube_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_cube_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.cube.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_1darray_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) {
+; GFX10PLUS-LABEL: sample_1darray_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_1darray_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.1darray.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_2darray_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
+; GFX10PLUS-LABEL: sample_2darray_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_2darray_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.2darray.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @sample_b_1d_noret(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+; GFX10PLUS-LABEL: sample_b_1d_noret:
+; GFX10PLUS:       ; %bb.0: ; %main_body
+; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10PLUS-NEXT:    image_sample_b off, v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10PLUS-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sample_b_1d_noret:
+; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-NEXT:    image_sample_b off, [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.image.sample.noret.b.1d.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
----------------
arsenm wrote:

I mean intrinsic names should match the underlying instruction name as closely as possible. Here, you've split the opcode name with the compiler invented infix of noret. It would be better if this were llvm.amdgcn.image.sample.b.1d.nortn.f32. That way it always sorts with llvm.amdgcn.image.sample.b.1d.f32

https://github.com/llvm/llvm-project/pull/97542