[llvm] 9b0b912 - [amdgpu][nfc] Add test case showing false aliasing in LDS lowering

Tue Aug 30 07:34:37 PDT 2022

Author: Jon Chesterfield
Date: 2022-08-30T15:33:57+01:00
New Revision: 9b0b912e15d091f8d581db37bd2ee474171c047f

URL: https://github.com/llvm/llvm-project/commit/9b0b912e15d091f8d581db37bd2ee474171c047f
DIFF: https://github.com/llvm/llvm-project/commit/9b0b912e15d091f8d581db37bd2ee474171c047f.diff

LOG: [amdgpu][nfc] Add test case showing false aliasing in LDS lowering

Added: 
    llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
new file mode 100644
index 000000000000..28facb38cff5

--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s
+
+; Test case looks at the allocated offset of @used_by_both. It's at zero when
+; allocated by itself, but at 8 when allocated in combination with the double.
+; Redundantly also checks LDSByteSize.
+@used_by_both = addrspace(3) global i32 undef
+@used_by_kernel = addrspace(3) global i32 undef
+@used_by_function = addrspace(3) global double undef
+
+; A kernel that calls no functions and uses one LDS variable allocates only
+; that variable, so the access is at offset 0 and LDSByteSize is 4.
+define amdgpu_kernel void @nocall_ideal() {
+; CHECK-LABEL: nocall_ideal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    ds_write_b32 v0, v0
+; CHECK-NEXT:    s_endpgm
+store i32 0, i32 addrspace(3)* @used_by_kernel
+  ret void
+}
+; CHECK: ; LDSByteSize: 4 bytes
+
+; Needs to allocate both variables; store to used_by_both is at sizeof(double)
+define amdgpu_kernel void @withcall() {
+; GFX9-LABEL: withcall:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9-NEXT:    s_add_u32 s8, s8, s3
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: withcall:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-NEXT:    s_mov_b32 s10, -1
+; GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
+; GFX10-NEXT:    s_add_u32 s8, s8, s3
+; GFX10-NEXT:    s_addc_u32 s9, s9, 0
+; GFX10-NEXT:    s_getpc_b64 s[2:3]
+; GFX10-NEXT:    s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX10-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX10-NEXT:    s_mov_b32 s32, 0
+; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_endpgm
+;
+; G_GFX9-LABEL: withcall:
+; G_GFX9:       ; %bb.0:
+; G_GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX9-NEXT:    s_mov_b32 s10, -1
+; G_GFX9-NEXT:    s_mov_b32 s11, 0xe00000
+; G_GFX9-NEXT:    s_add_u32 s8, s8, s3
+; G_GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; G_GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; G_GFX9-NEXT:    s_getpc_b64 s[0:1]
+; G_GFX9-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
+; G_GFX9-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
+; G_GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; G_GFX9-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; G_GFX9-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; G_GFX9-NEXT:    s_mov_b32 s32, 0
+; G_GFX9-NEXT:    ds_write_b32 v1, v0
+; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; G_GFX9-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: withcall:
+; G_GFX10:       ; %bb.0:
+; G_GFX10-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT:    s_mov_b32 s10, -1
+; G_GFX10-NEXT:    s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT:    s_add_u32 s8, s8, s3
+; G_GFX10-NEXT:    s_addc_u32 s9, s9, 0
+; G_GFX10-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; G_GFX10-NEXT:    s_getpc_b64 s[0:1]
+; G_GFX10-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
+; G_GFX10-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 8
+; G_GFX10-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; G_GFX10-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; G_GFX10-NEXT:    s_mov_b32 s32, 0
+; G_GFX10-NEXT:    ds_write_b32 v1, v0
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; G_GFX10-NEXT:    s_endpgm
+  store i32 0, i32 addrspace(3)* @used_by_both
+  call void @nonkernel()
+  ret void
+}
+; CHECK: ; LDSByteSize: 16 bytes
+
+; Kernel only needs to allocate the i32 it uses. However, because that i32 is
+; also used by a non-kernel function, it is block allocated along with the
+; double used by that function, so this kernel allocates 16 bytes and the
+; accesses to the integer are at offset 8.
+define amdgpu_kernel void @nocall_false_sharing() {
+; GFX9-LABEL: nocall_false_sharing:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: nocall_false_sharing:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX10-NEXT:    s_endpgm
+;
+; G_GFX9-LABEL: nocall_false_sharing:
+; G_GFX9:       ; %bb.0:
+; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX9-NEXT:    v_mov_b32_e32 v1, 8
+; G_GFX9-NEXT:    ds_write_b32 v1, v0
+; G_GFX9-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: nocall_false_sharing:
+; G_GFX10:       ; %bb.0:
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 8
+; G_GFX10-NEXT:    ds_write_b32 v1, v0
+; G_GFX10-NEXT:    s_endpgm
+  store i32 0, i32 addrspace(3)* @used_by_both
+  ret void
+}
+; CHECK: ; LDSByteSize: 16 bytes
+
+; Non-kernel function called by @withcall. Stores to both @used_by_both
+; (at offset 8) and @used_by_function (at offset 0).
+define void @nonkernel() {
+; GFX9-LABEL: nonkernel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: nonkernel:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; G_GFX9-LABEL: nonkernel:
+; G_GFX9:       ; %bb.0:
+; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; G_GFX9-NEXT:    ds_write_b32 v3, v2
+; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; G_GFX10-LABEL: nonkernel:
+; G_GFX10:       ; %bb.0:
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; G_GFX10-NEXT:    ds_write_b32 v3, v2
+; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store i32 0, i32 addrspace(3)* @used_by_both
+  store double 0.0, double addrspace(3)* @used_by_function
+  ret void
+}