[llvm] [AMDGPU] Add some more GFX12 test coverage (PR #120581)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 19 19:51:18 PST 2024


================
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 {
+; GFX11-LABEL: mixed_vmem_types:
+; GFX11:       ; %bb.0: ; %.entry
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_mov_b32 s0, s3
+; GFX11-NEXT:    s_mov_b32 s3, s5
+; GFX11-NEXT:    s_mov_b32 s1, s5
+; GFX11-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
+; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
+; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
+; GFX11-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-NEXT:    buffer_load_b32 v4, off, s[40:43], 0
+; GFX11-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_and_b32 s0, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[24:27], 0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: mixed_vmem_types:
+; GFX12:       ; %bb.0: ; %.entry
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_mov_b32 s0, s3
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
+; GFX12-NEXT:    s_mov_b32 s3, s5
+; GFX12-NEXT:    s_mov_b32 s1, s5
+; GFX12-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
+; GFX12-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
+; GFX12-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[20:23], null
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[16:19], null
+; GFX12-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    buffer_load_b32 v4, off, s[40:43], null
+; GFX12-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-NEXT:    s_wait_loadcnt 0x2
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
+; GFX12-NEXT:    s_wait_loadcnt 0x1
+; GFX12-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
+; GFX12-NEXT:    s_wait_samplecnt 0x1
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX12-NEXT:    s_and_b32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, s0, s2
+; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[24:27], null
+; GFX12-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: mixed_vmem_types:
+; GFX12-GISEL:       ; %bb.0: ; %.entry
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[20:21]
+; GFX12-GISEL-NEXT:    s_mov_b32 s0, s3
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s21, s21
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, s21
+; GFX12-GISEL-NEXT:    s_mov_b32 s3, s21
+; GFX12-GISEL-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_clause 0x1
+; GFX12-GISEL-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
+; GFX12-GISEL-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-GISEL-NEXT:    buffer_load_b32 v2, off, s[16:19], null
+; GFX12-GISEL-NEXT:    buffer_load_b32 v3, off, s[20:23], null
+; GFX12-GISEL-NEXT:    buffer_load_b32 v4, off, s[40:43], null
+; GFX12-GISEL-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x2
+; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x1
+; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x1
+; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s1, 0xac0, v3
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s2
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-NEXT:    buffer_store_b32 v0, off, s[24:27], null
+; GFX12-GISEL-NEXT:    s_endpgm
+.entry:
+  %0 = call i64 @llvm.amdgcn.s.getpc()
+  %extelt.offset = lshr i64 %0, 32
+  %.i1 = trunc i64 %extelt.offset to i32
+  %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0
+  %1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1
+  %2 = bitcast <2 x i32> %1 to i64
+  %3 = inttoptr i64 %2 to ptr addrspace(4)
+  %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0
+  %4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1
+  %5 = bitcast <2 x i32> %4 to i64
+  %6 = inttoptr i64 %5 to ptr addrspace(4)
+  %7 = getelementptr i8, ptr addrspace(4) %6, i64 80
+  %8 = load <4 x i32>, ptr addrspace(4) %7, align 16
+  %9 = getelementptr i8, ptr addrspace(4) %3, i64 48
+  %10 = load <4 x i32>, ptr addrspace(4) %9, align 16
+  %11 = getelementptr i8, ptr addrspace(4) %6, i64 64
+  %12 = load <4 x i32>, ptr addrspace(4) %11, align 16
+  %13 = getelementptr i8, ptr addrspace(4) %6, i64 16
+  %14 = load <4 x i32>, ptr addrspace(4) %13, align 16
+  %15 = getelementptr i8, ptr addrspace(4) %6, i64 32
+  %16 = load <8 x i32>, ptr addrspace(4) %15, align 32
+  %17 = load <4 x i32>, ptr addrspace(4) %6, align 16
+  %18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %16, <4 x i32> %17, i1 false, i32 0, i32 0)
+  %19 = fcmp oeq float %18, 0.000000e+00
+  %20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %14, i32 0, i32 0, i32 0)
+  %.not = icmp eq i32 %20, 2752
+  %21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %12, i32 0, i32 0, i32 0)
+  %.not1 = icmp eq i32 %21, 2752
+  %22 = getelementptr i8, ptr addrspace(4) %3, i64 16
+  %23 = load <8 x i32>, ptr addrspace(4) %22, align 32
+  %24 = load <4 x i32>, ptr addrspace(4) %3, align 16
+  %25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %23, <4 x i32> %24, i1 false, i32 0, i32 0)
+  %26 = fcmp oeq float %25, 1.000000e+00
+  %27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %10, i32 0, i32 0, i32 0)
+  %.not2 = icmp eq i32 %27, 2752
+  %28 = select i1 %.not2, i1 %26, i1 false
----------------
arsenm wrote:

Use named values instead of numbered values (%0, %1, ...) in the test IR.

https://github.com/llvm/llvm-project/pull/120581


More information about the llvm-commits mailing list