[llvm] [AMDGPU] Improve detection of non-null addrspacecast operands (PR #82311)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 1 04:59:24 PST 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/82311
>From 0074e6b7e2259a8d5e9c6c02dd2d274ed446dc89 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 20 Feb 2024 08:02:46 +0100
Subject: [PATCH 1/7] [AMDGPU] Add codegen-prepare-addrspacecast-non-null
---
.../codegen-prepare-addrspacecast-non-null.ll | 349 ++++++++++++++++++
1 file changed, 349 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
new file mode 100644
index 00000000000000..a7c48955b043f1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -0,0 +1,349 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-- -amdgpu-codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM
+
+; Tests that we can avoid nullptr checks for addrspacecasts from/to priv/local.
+;
+; Whenever a testcase is successful, we should see the addrspacecast replaced with the intrinsic
+; and the resulting code should have no select/cndmask null check for the pointer.
+
+define void @local_to_flat_nonnull_arg(ptr addrspace(3) nonnull %ptr) {
+; OPT-LABEL: define void @local_to_flat_nonnull_arg(
+; OPT-SAME: ptr addrspace(3) nonnull [[PTR:%.*]]) {
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
+; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: ret void
+;
+; DAGISEL-ASM-LABEL: local_to_flat_nonnull_arg:
+; DAGISEL-ASM: ; %bb.0:
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_shared_base
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; GISEL-ASM-LABEL: local_to_flat_nonnull_arg:
+; GISEL-ASM: ; %bb.0:
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+ %x = addrspacecast ptr addrspace(3) %ptr to ptr
+ store volatile i32 7, ptr %x
+ ret void
+}
+
+define void @private_to_flat_nonnull_arg(ptr addrspace(5) nonnull %ptr) {
+; OPT-LABEL: define void @private_to_flat_nonnull_arg(
+; OPT-SAME: ptr addrspace(5) nonnull [[PTR:%.*]]) {
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
+; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: ret void
+;
+; DAGISEL-ASM-LABEL: private_to_flat_nonnull_arg:
+; DAGISEL-ASM: ; %bb.0:
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; GISEL-ASM-LABEL: private_to_flat_nonnull_arg:
+; GISEL-ASM: ; %bb.0:
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+ %x = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 7, ptr %x
+ ret void
+}
+
+define void @flat_to_local_nonnull_arg(ptr nonnull %ptr) {
+; OPT-LABEL: define void @flat_to_local_nonnull_arg(
+; OPT-SAME: ptr nonnull [[PTR:%.*]]) {
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
+; OPT-NEXT: store volatile i32 7, ptr addrspace(3) [[X]], align 4
+; OPT-NEXT: ret void
+;
+; ASM-LABEL: flat_to_local_nonnull_arg:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; ASM-NEXT: v_mov_b32_e32 v1, 7
+; ASM-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; ASM-NEXT: ds_write_b32 v0, v1
+; ASM-NEXT: s_waitcnt lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
+ %x = addrspacecast ptr %ptr to ptr addrspace(3)
+ store volatile i32 7, ptr addrspace(3) %x
+ ret void
+}
+
+define void @flat_to_private_nonnull_arg(ptr nonnull %ptr) {
+; OPT-LABEL: define void @flat_to_private_nonnull_arg(
+; OPT-SAME: ptr nonnull [[PTR:%.*]]) {
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
+; OPT-NEXT: store volatile i32 7, ptr addrspace(5) [[X]], align 4
+; OPT-NEXT: ret void
+;
+; ASM-LABEL: flat_to_private_nonnull_arg:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; ASM-NEXT: v_mov_b32_e32 v1, 7
+; ASM-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; ASM-NEXT: s_waitcnt vmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
+ %x = addrspacecast ptr %ptr to ptr addrspace(5)
+ store volatile i32 7, ptr addrspace(5) %x
+ ret void
+}
+
+define void @private_alloca_to_flat(ptr %ptr) {
+; OPT-LABEL: define void @private_alloca_to_flat(
+; OPT-SAME: ptr [[PTR:%.*]]) {
+; OPT-NEXT: [[ALLOCA:%.*]] = alloca i8, align 1, addrspace(5)
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: ret void
+;
+; DAGISEL-ASM-LABEL: private_alloca_to_flat:
+; DAGISEL-ASM: ; %bb.0:
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; GISEL-ASM-LABEL: private_alloca_to_flat:
+; GISEL-ASM: ; %bb.0:
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+ %alloca = alloca i8, addrspace(5)
+ %x = addrspacecast ptr addrspace(5) %alloca to ptr
+ store volatile i32 7, ptr %x
+ ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i8 undef, align 4
+
+define void @knownbits_on_flat_to_priv(ptr %ptr) {
+; OPT-LABEL: define void @knownbits_on_flat_to_priv(
+; OPT-SAME: ptr [[PTR:%.*]]) {
+; OPT-NEXT: [[PTR_INT:%.*]] = ptrtoint ptr [[PTR]] to i64
+; OPT-NEXT: [[PTR_OR:%.*]] = or i64 [[PTR_INT]], 15
+; OPT-NEXT: [[KB_PTR:%.*]] = inttoptr i64 [[PTR_OR]] to ptr
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr [[KB_PTR]] to ptr addrspace(5)
+; OPT-NEXT: store volatile i32 7, ptr addrspace(5) [[X]], align 4
+; OPT-NEXT: ret void
+;
+; DAGISEL-ASM-LABEL: knownbits_on_flat_to_priv:
+; DAGISEL-ASM: ; %bb.0:
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: v_or_b32_e32 v0, 15, v0
+; DAGISEL-ASM-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, 7
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; DAGISEL-ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; GISEL-ASM-LABEL: knownbits_on_flat_to_priv:
+; GISEL-ASM: ; %bb.0:
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: v_or_b32_e32 v0, 15, v0
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, 7
+; GISEL-ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+ %ptr.int = ptrtoint ptr %ptr to i64
+ %ptr.or = or i64 %ptr.int, 15 ; set some low bits
+ %kb.ptr = inttoptr i64 %ptr.or to ptr
+ %x = addrspacecast ptr %kb.ptr to ptr addrspace(5)
+ store volatile i32 7, ptr addrspace(5) %x
+ ret void
+}
+
+define void @knownbits_on_priv_to_flat(ptr addrspace(5) %ptr) {
+; OPT-LABEL: define void @knownbits_on_priv_to_flat(
+; OPT-SAME: ptr addrspace(5) [[PTR:%.*]]) {
+; OPT-NEXT: [[PTR_INT:%.*]] = ptrtoint ptr addrspace(5) [[PTR]] to i32
+; OPT-NEXT: [[PTR_OR:%.*]] = and i32 [[PTR_INT]], 65535
+; OPT-NEXT: [[KB_PTR:%.*]] = inttoptr i32 [[PTR_OR]] to ptr addrspace(5)
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[KB_PTR]] to ptr
+; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: ret void
+;
+; DAGISEL-ASM-LABEL: knownbits_on_priv_to_flat:
+; DAGISEL-ASM: ; %bb.0:
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; GISEL-ASM-LABEL: knownbits_on_priv_to_flat:
+; GISEL-ASM: ; %bb.0:
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+ %ptr.int = ptrtoint ptr addrspace(5) %ptr to i32
+ %ptr.or = and i32 %ptr.int, 65535 ; ensure high bits are zeroes
+ %kb.ptr = inttoptr i32 %ptr.or to ptr addrspace(5)
+ %x = addrspacecast ptr addrspace(5) %kb.ptr to ptr
+ store volatile i32 7, ptr %x
+ ret void
+}
+
+; this would recurse infinitely and we'll give up once we notice it.
+define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
+; OPT-LABEL: define void @recursive_phis(
+; OPT-SAME: i1 [[COND:%.*]], ptr addrspace(5) [[PTR:%.*]]) {
+; OPT-NEXT: entry:
+; OPT-NEXT: [[ALLOCA:%.*]] = alloca i8, align 1, addrspace(5)
+; OPT-NEXT: br i1 [[COND]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; OPT: then:
+; OPT-NEXT: [[PTR_INT:%.*]] = ptrtoint ptr addrspace(5) [[PTR]] to i32
+; OPT-NEXT: [[PTR_OR:%.*]] = and i32 [[PTR_INT]], 65535
+; OPT-NEXT: [[KB_PTR:%.*]] = inttoptr i32 [[PTR_OR]] to ptr addrspace(5)
+; OPT-NEXT: br label [[FINALLY:%.*]]
+; OPT: else:
+; OPT-NEXT: [[OTHER_PHI:%.*]] = phi ptr addrspace(5) [ [[ALLOCA]], [[ENTRY:%.*]] ], [ [[PHI_PTR:%.*]], [[FINALLY]] ]
+; OPT-NEXT: br label [[FINALLY]]
+; OPT: finally:
+; OPT-NEXT: [[PHI_PTR]] = phi ptr addrspace(5) [ [[KB_PTR]], [[THEN]] ], [ [[OTHER_PHI]], [[ELSE]] ]
+; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[PHI_PTR]] to ptr
+; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: br i1 [[COND]], label [[ELSE]], label [[END:%.*]]
+; OPT: end:
+; OPT-NEXT: ret void
+;
+; DAGISEL-ASM-LABEL: recursive_phis:
+; DAGISEL-ASM: ; %bb.0: ; %entry
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
+; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; DAGISEL-ASM-NEXT: ; %bb.1: ; %then
+; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
+; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
+; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
+; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1
+; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally
+; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
+; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[6:7]
+; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; DAGISEL-ASM-NEXT: flat_store_dword v[3:4], v2
+; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
+; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3
+; DAGISEL-ASM-NEXT: ; %bb.4: ; %end
+; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
+; DAGISEL-ASM-NEXT: s_waitcnt lgkmcnt(0)
+; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-ASM-LABEL: recursive_phis:
+; GISEL-ASM: ; %bb.0: ; %entry
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-ASM-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-ASM-NEXT: ; %bb.1: ; %then
+; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
+; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
+; GISEL-ASM-NEXT: s_mov_b64 s[6:7], 0
+; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT: .LBB7_3: ; %finally
+; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5]
+; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
+; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
+; GISEL-ASM-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GISEL-ASM-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; GISEL-ASM-NEXT: flat_store_dword v[3:4], v2
+; GISEL-ASM-NEXT: s_waitcnt vmcnt(0)
+; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3
+; GISEL-ASM-NEXT: ; %bb.4: ; %end
+; GISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-ASM-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i8, addrspace(5)
+ br i1 %cond, label %then, label %else
+
+then:
+ %ptr.int = ptrtoint ptr addrspace(5) %ptr to i32
+ %ptr.or = and i32 %ptr.int, 65535 ; ensure high bits are zeroes
+ %kb.ptr = inttoptr i32 %ptr.or to ptr addrspace(5)
+ br label %finally
+
+else:
+ %other.phi = phi ptr addrspace(5) [%alloca, %entry], [%phi.ptr, %finally]
+ br label %finally
+
+finally:
+ %phi.ptr = phi ptr addrspace(5) [%kb.ptr, %then], [%other.phi, %else]
+ %x = addrspacecast ptr addrspace(5) %phi.ptr to ptr
+ store volatile i32 7, ptr %x
+ br i1 %cond, label %else, label %end
+
+end:
+ ret void
+}
>From 772d36d75c7713cc5932716849bb1f9322290088 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 20 Feb 2024 08:12:00 +0100
Subject: [PATCH 2/7] [AMDGPU] Improve detection of non-null addrspacecast
operands
Use IR analysis to infer when an addrspacecast operand is nonnull, then
lower it to an intrinsic that the DAG can use to infer nonnull.
Solves SWDEV-316445
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 +
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 82 ++++++++
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 21 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 59 ++++--
llvm/lib/Target/AMDGPU/SIISelLowering.h | 4 +
.../codegen-prepare-addrspacecast-non-null.ll | 194 ++++++------------
6 files changed, 218 insertions(+), 149 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0f29653f1f5bec..051e603c0819d2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3196,4 +3196,11 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
[IntrNoMem, IntrSpeculatable]
>;
+
+/// Emit an addrspacecast without null pointer checking.
+/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
+def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
+ [llvm_anyptr_ty], [llvm_anyptr_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 1c75c5a47c9d27..e5beae49f38503 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -51,6 +51,12 @@ static cl::opt<bool> Widen16BitOps(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool> LowerAddrSpaceCast(
+ "amdgpu-codegenprepare-addrspacecast",
+ cl::desc("Detect non-null addrspacecast source and lower them early to "
+ "avoid the null pointer check"),
+ cl::ReallyHidden, cl::init(true));
+
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
cl::desc("Break large PHI nodes for DAGISel"),
@@ -99,6 +105,7 @@ class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
const GCNSubtarget *ST = nullptr;
+ const AMDGPUTargetMachine *TM = nullptr;
const TargetLibraryInfo *TLInfo = nullptr;
AssumptionCache *AC = nullptr;
DominatorTree *DT = nullptr;
@@ -310,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
+ bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -2013,6 +2021,78 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
return true;
}
+bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+ if (!LowerAddrSpaceCast)
+ return false;
+
+ // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
+ // This is only worthwhile for casts between priv/local and flat.
+ const unsigned SrcAS = I.getSrcAddressSpace();
+ const unsigned DstAS = I.getDestAddressSpace();
+
+ bool CanLower = false;
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
+ DstAS == AMDGPUAS::PRIVATE_ADDRESS);
+ else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
+ CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+ if (!CanLower)
+ return false;
+
+ // Check the Src operand, and look through Phis.
+ SmallVector<Value *, 4> WorkList;
+ DenseSet<const PHINode *> SeenPHIs;
+ WorkList.push_back(I.getOperand(0));
+ while (!WorkList.empty()) {
+ Value *Cur = getUnderlyingObject(WorkList.pop_back_val());
+
+ // Pointer cannot be null if it's a block address, GV or alloca.
+ // NOTE: We don't support extern_weak, but if we did, we'd need to check for
+ // it as the symbol could be null in such cases.
+ if (isa<BlockAddress>(Cur) || isa<GlobalValue>(Cur) || isa<AllocaInst>(Cur))
+ continue;
+
+ // Check nonnull arguments.
+ if (const auto *Arg = dyn_cast<Argument>(Cur); Arg && Arg->hasNonNullAttr())
+ continue;
+
+ // TODO: Calls that return nonnull?
+
+ // Look through PHIs - add all incoming values to the queue.
+ if (const auto *Phi = dyn_cast<PHINode>(Cur)) {
+ auto [It, Inserted] = SeenPHIs.insert(Phi);
+ if (!Inserted)
+ return false; // infinite recursion
+
+ for (auto &Inc : Phi->incoming_values())
+ WorkList.push_back(Inc.get());
+ continue;
+ }
+
+ // For all other things, use KnownBits.
+ // We either use 0 or all bits set to indicate null, so check whether the
+ // value can be zero or all ones.
+ auto SrcPtrKB =
+ computeKnownBits(Cur, *DL).trunc(DL->getPointerSizeInBits(SrcAS));
+ const auto NullVal = TM->getNullPointerValue(SrcAS);
+ assert((NullVal == 0 || NullVal == -1) &&
+ "don't know how to check for this null value!");
+ if (NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero())
+ continue;
+
+ // Value is unknown so we can't lower.
+ return false;
+ }
+
+ IRBuilder<> B(&I);
+ auto *Intrin = B.CreateIntrinsic(
+ I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
+ I.replaceAllUsesWith(Intrin);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::bitreverse:
@@ -2196,6 +2276,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
return false;
const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
+ Impl.TM = &TM;
Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -2214,6 +2295,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
AMDGPUCodeGenPrepareImpl Impl;
Impl.Mod = F.getParent();
Impl.DL = &Impl.Mod->getDataLayout();
+ Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 13d7510729139b..fee4b8b46847f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2247,10 +2247,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
+ // MI can either be a G_ADDRSPACE_CAST or a
+ // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
+ assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
+ (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
+ Intrinsic::amdgcn_addrspacecast_nonnull));
+
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
-
+ Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+ : MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
unsigned DestAS = DstTy.getAddressSpace();
@@ -2263,6 +2269,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const AMDGPUTargetMachine &TM
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
+ // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
+ // G_ADDRSPACE_CAST we need to guess.
+ const bool IsKnownNonNull =
+ isa<GIntrinsic>(MI) ? true : isKnownNonNull(Src, MRI, TM, SrcAS);
+
if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
return true;
@@ -2271,7 +2282,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
- if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ if (IsKnownNonNull) {
// Extract low 32-bits of the pointer.
B.buildExtract(Dst, Src, 0);
MI.eraseFromParent();
@@ -2308,7 +2319,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// avoid the ptrtoint?
auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
- if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ if (IsKnownNonNull) {
B.buildCopy(Dst, BuildPtr);
MI.eraseFromParent();
return true;
@@ -7020,6 +7031,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return false;
}
+ case Intrinsic::amdgcn_addrspacecast_nonnull:
+ return legalizeAddrSpaceCast(MI, MRI, B);
case Intrinsic::amdgcn_make_buffer_rsrc:
return legalizePointerAsRsrcIntrin(MI, MRI, B);
case Intrinsic::amdgcn_kernarg_segment_ptr:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..df24aa5245cfe2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1415,6 +1415,24 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
}
+void SITargetLowering::CollectTargetIntrinsicOperands(
+ const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
+ switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
+ case Intrinsic::amdgcn_addrspacecast_nonnull: {
+ // The DAG's ValueType loses the addrspaces.
+ // Add them as 2 extra Constant operands "from" and "to".
+ unsigned SrcAS =
+ I.getOperand(0)->getType()->getScalarType()->getPointerAddressSpace();
+ unsigned DstAS = I.getType()->getScalarType()->getPointerAddressSpace();
+ Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
+ Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
+ break;
+ }
+ default:
+ break;
+ }
+}
+
bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
SmallVectorImpl<Value*> &Ops,
Type *&AccessTy) const {
@@ -6635,24 +6653,37 @@ static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
- const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
-
- SDValue Src = ASC->getOperand(0);
- SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
- unsigned SrcAS = ASC->getSrcAddressSpace();
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+ unsigned DestAS, SrcAS;
+ SDValue Src;
+ bool KnownNonNull;
+ if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
+ SrcAS = ASC->getSrcAddressSpace();
+ Src = ASC->getOperand(0);
+ DestAS = ASC->getDestAddressSpace();
+ KnownNonNull = isKnownNonNull(Op, DAG, TM, SrcAS);
+ } else {
+ assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ Op.getConstantOperandVal(0) ==
+ Intrinsic::amdgcn_addrspacecast_nonnull);
+ Src = Op->getOperand(1);
+ SrcAS = Op->getConstantOperandVal(2);
+ DestAS = Op->getConstantOperandVal(3);
+ KnownNonNull = true;
+ }
+
+ SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
// flat -> local/private
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
- unsigned DestAS = ASC->getDestAddressSpace();
-
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
- if (isKnownNonNull(Src, DAG, TM, SrcAS))
+ if (KnownNonNull)
return Ptr;
unsigned NullVal = TM.getNullPointerValue(DestAS);
@@ -6665,16 +6696,16 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
// local/private -> flat
- if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
- SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
+ SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
SDValue CvtPtr =
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
- if (isKnownNonNull(Src, DAG, TM, SrcAS))
+ if (KnownNonNull)
return CvtPtr;
unsigned NullVal = TM.getNullPointerValue(SrcAS);
@@ -6697,7 +6728,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}
- if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Src.getValueType() == MVT::i64)
return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
@@ -6708,7 +6739,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
DAG.getContext()->diagnose(InvalidAddrSpaceCast);
- return DAG.getUNDEF(ASC->getValueType(0));
+ return DAG.getUNDEF(Op->getValueType(0));
}
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
@@ -8325,6 +8356,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
IndexKeyi32, Op.getOperand(7)});
}
+ case Intrinsic::amdgcn_addrspacecast_nonnull:
+ return lowerADDRSPACECAST(Op, DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f6e1d198f40aec..fc90a208fa0b3a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -305,6 +305,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineFunction &MF,
unsigned IntrinsicID) const override;
+ void CollectTargetIntrinsicOperands(const CallInst &I,
+ SmallVectorImpl<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
bool getAddrModeArguments(IntrinsicInst * /*I*/,
SmallVectorImpl<Value*> &/*Ops*/,
Type *&/*AccessTy*/) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index a7c48955b043f1..b5bf369c7dd0c4 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: opt -mtriple=amdgcn-- -amdgpu-codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM
; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM
@@ -11,34 +11,19 @@
define void @local_to_flat_nonnull_arg(ptr addrspace(3) nonnull %ptr) {
; OPT-LABEL: define void @local_to_flat_nonnull_arg(
; OPT-SAME: ptr addrspace(3) nonnull [[PTR:%.*]]) {
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) [[PTR]])
+; OPT-NEXT: store volatile i32 7, ptr [[TMP1]], align 4
; OPT-NEXT: ret void
;
-; DAGISEL-ASM-LABEL: local_to_flat_nonnull_arg:
-; DAGISEL-ASM: ; %bb.0:
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_shared_base
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
-; GISEL-ASM-LABEL: local_to_flat_nonnull_arg:
-; GISEL-ASM: ; %bb.0:
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; GISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; ASM-LABEL: local_to_flat_nonnull_arg:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_mov_b64 s[4:5], src_shared_base
+; ASM-NEXT: v_mov_b32_e32 v1, s5
+; ASM-NEXT: v_mov_b32_e32 v2, 7
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
%x = addrspacecast ptr addrspace(3) %ptr to ptr
store volatile i32 7, ptr %x
ret void
@@ -47,34 +32,19 @@ define void @local_to_flat_nonnull_arg(ptr addrspace(3) nonnull %ptr) {
define void @private_to_flat_nonnull_arg(ptr addrspace(5) nonnull %ptr) {
; OPT-LABEL: define void @private_to_flat_nonnull_arg(
; OPT-SAME: ptr addrspace(5) nonnull [[PTR:%.*]]) {
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) [[PTR]])
+; OPT-NEXT: store volatile i32 7, ptr [[TMP1]], align 4
; OPT-NEXT: ret void
;
-; DAGISEL-ASM-LABEL: private_to_flat_nonnull_arg:
-; DAGISEL-ASM: ; %bb.0:
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
-; GISEL-ASM-LABEL: private_to_flat_nonnull_arg:
-; GISEL-ASM: ; %bb.0:
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; GISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; ASM-LABEL: private_to_flat_nonnull_arg:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; ASM-NEXT: v_mov_b32_e32 v1, s5
+; ASM-NEXT: v_mov_b32_e32 v2, 7
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
%x = addrspacecast ptr addrspace(5) %ptr to ptr
store volatile i32 7, ptr %x
ret void
@@ -83,16 +53,14 @@ define void @private_to_flat_nonnull_arg(ptr addrspace(5) nonnull %ptr) {
define void @flat_to_local_nonnull_arg(ptr nonnull %ptr) {
; OPT-LABEL: define void @flat_to_local_nonnull_arg(
; OPT-SAME: ptr nonnull [[PTR:%.*]]) {
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
-; OPT-NEXT: store volatile i32 7, ptr addrspace(3) [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(3) @llvm.amdgcn.addrspacecast.nonnull.p3.p0(ptr [[PTR]])
+; OPT-NEXT: store volatile i32 7, ptr addrspace(3) [[TMP1]], align 4
; OPT-NEXT: ret void
;
; ASM-LABEL: flat_to_local_nonnull_arg:
; ASM: ; %bb.0:
; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; ASM-NEXT: v_mov_b32_e32 v1, 7
-; ASM-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; ASM-NEXT: ds_write_b32 v0, v1
; ASM-NEXT: s_waitcnt lgkmcnt(0)
; ASM-NEXT: s_setpc_b64 s[30:31]
@@ -104,16 +72,14 @@ define void @flat_to_local_nonnull_arg(ptr nonnull %ptr) {
define void @flat_to_private_nonnull_arg(ptr nonnull %ptr) {
; OPT-LABEL: define void @flat_to_private_nonnull_arg(
; OPT-SAME: ptr nonnull [[PTR:%.*]]) {
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
-; OPT-NEXT: store volatile i32 7, ptr addrspace(5) [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr [[PTR]])
+; OPT-NEXT: store volatile i32 7, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT: ret void
;
; ASM-LABEL: flat_to_private_nonnull_arg:
; ASM: ; %bb.0:
; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; ASM-NEXT: v_mov_b32_e32 v1, 7
-; ASM-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; ASM-NEXT: s_waitcnt vmcnt(0)
; ASM-NEXT: s_setpc_b64 s[30:31]
@@ -126,33 +92,20 @@ define void @private_alloca_to_flat(ptr %ptr) {
; OPT-LABEL: define void @private_alloca_to_flat(
; OPT-SAME: ptr [[PTR:%.*]]) {
; OPT-NEXT: [[ALLOCA:%.*]] = alloca i8, align 1, addrspace(5)
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
-; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) [[ALLOCA]])
+; OPT-NEXT: store volatile i32 7, ptr [[TMP1]], align 4
; OPT-NEXT: ret void
;
-; DAGISEL-ASM-LABEL: private_alloca_to_flat:
-; DAGISEL-ASM: ; %bb.0:
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
-; GISEL-ASM-LABEL: private_alloca_to_flat:
-; GISEL-ASM: ; %bb.0:
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; GISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; ASM-LABEL: private_alloca_to_flat:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; ASM-NEXT: v_mov_b32_e32 v1, s5
+; ASM-NEXT: v_mov_b32_e32 v2, 7
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i8, addrspace(5)
%x = addrspacecast ptr addrspace(5) %alloca to ptr
store volatile i32 7, ptr %x
@@ -167,28 +120,18 @@ define void @knownbits_on_flat_to_priv(ptr %ptr) {
; OPT-NEXT: [[PTR_INT:%.*]] = ptrtoint ptr [[PTR]] to i64
; OPT-NEXT: [[PTR_OR:%.*]] = or i64 [[PTR_INT]], 15
; OPT-NEXT: [[KB_PTR:%.*]] = inttoptr i64 [[PTR_OR]] to ptr
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr [[KB_PTR]] to ptr addrspace(5)
-; OPT-NEXT: store volatile i32 7, ptr addrspace(5) [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr [[KB_PTR]])
+; OPT-NEXT: store volatile i32 7, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT: ret void
;
-; DAGISEL-ASM-LABEL: knownbits_on_flat_to_priv:
-; DAGISEL-ASM: ; %bb.0:
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: v_or_b32_e32 v0, 15, v0
-; DAGISEL-ASM-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, 7
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; DAGISEL-ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
-; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
-; GISEL-ASM-LABEL: knownbits_on_flat_to_priv:
-; GISEL-ASM: ; %bb.0:
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: v_or_b32_e32 v0, 15, v0
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, 7
-; GISEL-ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0)
-; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; ASM-LABEL: knownbits_on_flat_to_priv:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: v_or_b32_e32 v0, 15, v0
+; ASM-NEXT: v_mov_b32_e32 v1, 7
+; ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; ASM-NEXT: s_waitcnt vmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
%ptr.int = ptrtoint ptr %ptr to i64
%ptr.or = or i64 %ptr.int, 15 ; set some low bits
%kb.ptr = inttoptr i64 %ptr.or to ptr
@@ -203,35 +146,22 @@ define void @knownbits_on_priv_to_flat(ptr addrspace(5) %ptr) {
; OPT-NEXT: [[PTR_INT:%.*]] = ptrtoint ptr addrspace(5) [[PTR]] to i32
; OPT-NEXT: [[PTR_OR:%.*]] = and i32 [[PTR_INT]], 65535
; OPT-NEXT: [[KB_PTR:%.*]] = inttoptr i32 [[PTR_OR]] to ptr addrspace(5)
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[KB_PTR]] to ptr
-; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: [[TMP1:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) [[KB_PTR]])
+; OPT-NEXT: store volatile i32 7, ptr [[TMP1]], align 4
; OPT-NEXT: ret void
;
-; DAGISEL-ASM-LABEL: knownbits_on_priv_to_flat:
-; DAGISEL-ASM: ; %bb.0:
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; DAGISEL-ASM-NEXT: s_setpc_b64 s[30:31]
-; GISEL-ASM-LABEL: knownbits_on_priv_to_flat:
-; GISEL-ASM: ; %bb.0:
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_mov_b64 s[4:5], src_private_base
-; GISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-ASM-NEXT: v_mov_b32_e32 v1, s5
-; GISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
-; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
-; GISEL-ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GISEL-ASM-NEXT: s_setpc_b64 s[30:31]
+; ASM-LABEL: knownbits_on_priv_to_flat:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; ASM-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; ASM-NEXT: v_mov_b32_e32 v1, s5
+; ASM-NEXT: v_mov_b32_e32 v2, 7
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
%ptr.int = ptrtoint ptr addrspace(5) %ptr to i32
- %ptr.or = and i32 %ptr.int, 65535 ; ensure low bits are zeroes
+ %ptr.or = and i32 %ptr.int, 65535 ; ensure only lower 16 bits can be set.
%kb.ptr = inttoptr i32 %ptr.or to ptr addrspace(5)
%x = addrspacecast ptr addrspace(5) %kb.ptr to ptr
store volatile i32 7, ptr %x
>From 4c9b94d82db00c60406227e85771754d6da0dc0b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 27 Feb 2024 12:33:36 +0100
Subject: [PATCH 3/7] comments
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 67 ++++++++++---------
1 file changed, 34 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e5beae49f38503..7e46e290886140 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -51,12 +51,6 @@ static cl::opt<bool> Widen16BitOps(
cl::ReallyHidden,
cl::init(true));
-static cl::opt<bool> LowerAddrSpaceCast(
- "amdgpu-codegenprepare-addrspacecast",
- cl::desc("Detect non-null addrspacecast source and lower them early to "
- "avoid the null pointer check"),
- cl::ReallyHidden, cl::init(true));
-
static cl::opt<bool>
BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
cl::desc("Break large PHI nodes for DAGISel"),
@@ -2021,10 +2015,39 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
return true;
}
-bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
- if (!LowerAddrSpaceCast)
- return false;
+/// \param V Value to check
+/// \param DL DataLayout
+/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
+/// \param AS Target Address Space
+/// \return true if \p V cannot be the null value of \p AS, false otherwise.
+static bool isPtrKnownNeverNull(Value *V, const DataLayout &DL,
+ const AMDGPUTargetMachine &TM, unsigned AS) {
+ // Pointer cannot be null if it's a block address, GV or alloca.
+ // NOTE: We don't support extern_weak, but if we did, we'd need to check for
+ // it as the symbol could be null in such cases.
+ if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
+ return true;
+ // Check nonnull arguments.
+ if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
+ return true;
+
+ // TODO: Calls that return nonnull?
+
+ // For all other things, use KnownBits.
+ // We either use 0 or all bits set to indicate null, so check whether the
+ // value can be zero or all ones.
+ //
+ // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
+ // address spaces have non-zero null values.
+ auto SrcPtrKB = computeKnownBits(V, DL).trunc(DL.getPointerSizeInBits(AS));
+ const auto NullVal = TM.getNullPointerValue(AS);
+ assert((NullVal == 0 || NullVal == -1) &&
+ "don't know how to check for this null value!");
+ return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
+}
+
+bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
// Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
// This is only worthwhile for casts from/to priv/local to flat.
const unsigned SrcAS = I.getSrcAddressSpace();
@@ -2040,25 +2063,13 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
if (!CanLower)
return false;
- // Check the Src operand, and look through Phis.
+ // Check the Src operand, looking through any PHIs.
SmallVector<Value *, 4> WorkList;
DenseSet<const PHINode *> SeenPHIs;
WorkList.push_back(I.getOperand(0));
while (!WorkList.empty()) {
Value *Cur = getUnderlyingObject(WorkList.pop_back_val());
- // Pointer cannot be null if it's a block address, GV or alloca.
- // NOTE: We don't support extern_weak, but if we did, we'd need to check for
- // it as the symbol could be null in such cases.
- if (isa<BlockAddress>(Cur) || isa<GlobalValue>(Cur) || isa<AllocaInst>(Cur))
- continue;
-
- // Check nonnull arguments.
- if (const auto *Arg = dyn_cast<Argument>(Cur); Arg && Arg->hasNonNullAttr())
- continue;
-
- // TODO: Calls that return nonnull?
-
// Look through PHIs - add all incoming values to the queue.
if (const auto *Phi = dyn_cast<PHINode>(Cur)) {
auto [It, Inserted] = SeenPHIs.insert(Phi);
@@ -2070,18 +2081,8 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
continue;
}
- // For all other things, use KnownBits.
- // We either use 0 or all bits set to indicate null, so check whether the
- // value can be zero or all ones.
- auto SrcPtrKB =
- computeKnownBits(Cur, *DL).trunc(DL->getPointerSizeInBits(SrcAS));
- const auto NullVal = TM->getNullPointerValue(SrcAS);
- assert((NullVal == 0 || NullVal == -1) &&
- "don't know how to check for this null value!");
- if (NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero())
+ if (isPtrKnownNeverNull(Cur, *DL, *TM, SrcAS))
continue;
-
- // Value is unknown so we can't lower.
return false;
}
>From 6ec7658d7f94eb4d2483da9f48bbc1eb27786048 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 1 Mar 2024 10:58:30 +0100
Subject: [PATCH 4/7] comments
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 30 +++-----
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +-
...n-prepare-addrspacecast-non-null-vector.ll | 13 ++++
.../codegen-prepare-addrspacecast-non-null.ll | 23 +++----
.../llvm.amdgcn.addrspacecast.nonnull.ll | 69 +++++++++++++++++++
6 files changed, 101 insertions(+), 40 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null-vector.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 7e46e290886140..c11702b2c1c0bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -2020,7 +2020,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
-static bool isPtrKnownNeverNull(Value *V, const DataLayout &DL,
+static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
const AMDGPUTargetMachine &TM, unsigned AS) {
// Pointer cannot be null if it's a block address, GV or alloca.
// NOTE: We don't support extern_weak, but if we did, we'd need to check for
@@ -2048,6 +2048,10 @@ static bool isPtrKnownNeverNull(Value *V, const DataLayout &DL,
}
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+ // Intrinsic doesn't support vectors, also it seems that it's often difficult to prove that a vector cannot have any nulls in it so it's unclear if it's worth supporting.
+ if (I.getType()->isVectorTy())
+ return false;
+
// Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
// This is only worthwhile for casts from/to priv/local to flat.
const unsigned SrcAS = I.getSrcAddressSpace();
@@ -2063,28 +2067,10 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
if (!CanLower)
return false;
- // Check the Src operand, looking through any PHIs.
- SmallVector<Value *, 4> WorkList;
- DenseSet<const PHINode *> SeenPHIs;
- WorkList.push_back(I.getOperand(0));
- while (!WorkList.empty()) {
- Value *Cur = getUnderlyingObject(WorkList.pop_back_val());
-
- // Look through PHIs - add all incoming values to the queue.
- if (const auto *Phi = dyn_cast<PHINode>(Cur)) {
- auto [It, Inserted] = SeenPHIs.insert(Phi);
- if (!Inserted)
- return false; // infinite recursion
-
- for (auto &Inc : Phi->incoming_values())
- WorkList.push_back(Inc.get());
- continue;
- }
-
- if (isPtrKnownNeverNull(Cur, *DL, *TM, SrcAS))
- continue;
+ SmallVector<const Value *, 4> WorkList;
+ getUnderlyingObjects(I.getOperand(0), WorkList);
+ if(!all_of(WorkList, [&](const Value* V) { return isPtrKnownNeverNull(V, *DL, *TM, SrcAS); }))
return false;
- }
IRBuilder<> B(&I);
auto *Intrin = B.CreateIntrinsic(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fee4b8b46847f6..759ecc8fed88ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2272,7 +2272,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
// G_ADDRSPACE_CAST we need to guess.
const bool IsKnownNonNull =
- isa<GIntrinsic>(MI) ? true : isKnownNonNull(Src, MRI, TM, SrcAS);
+ isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS);
if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index df24aa5245cfe2..67e10c53fe8c49 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1422,8 +1422,8 @@ void SITargetLowering::CollectTargetIntrinsicOperands(
// The DAG's ValueType loses the addrspaces.
// Add them as 2 extra Constant operands "from" and "to".
unsigned SrcAS =
- I.getOperand(0)->getType()->getScalarType()->getPointerAddressSpace();
- unsigned DstAS = I.getType()->getScalarType()->getPointerAddressSpace();
+ I.getOperand(0)->getType()->getPointerAddressSpace();
+ unsigned DstAS = I.getType()->getPointerAddressSpace();
Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
break;
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null-vector.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null-vector.ll
new file mode 100644
index 00000000000000..94c571a29f9911
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null-vector.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-- -amdgpu-codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
+
+; Check that CGP doesn't try to create an amdgcn.addrspacecast.nonnull of a vector, as that's not supported.
+
+define <4 x ptr> @vec_of_local_to_flat_nonnull_arg() {
+; OPT-LABEL: define <4 x ptr> @vec_of_local_to_flat_nonnull_arg() {
+; OPT-NEXT: [[X:%.*]] = addrspacecast <4 x ptr addrspace(3)> zeroinitializer to <4 x ptr>
+; OPT-NEXT: ret <4 x ptr> [[X]]
+;
+ %x = addrspacecast <4 x ptr addrspace(3)> zeroinitializer to <4 x ptr>
+ ret <4 x ptr> %x
+}
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index b5bf369c7dd0c4..561002d2d77bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -168,7 +168,6 @@ define void @knownbits_on_priv_to_flat(ptr addrspace(5) %ptr) {
ret void
}
-; this would recursive infinitely and we'll give up once we notice it.
define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; OPT-LABEL: define void @recursive_phis(
; OPT-SAME: i1 [[COND:%.*]], ptr addrspace(5) [[PTR:%.*]]) {
@@ -185,8 +184,8 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; OPT-NEXT: br label [[FINALLY]]
; OPT: finally:
; OPT-NEXT: [[PHI_PTR]] = phi ptr addrspace(5) [ [[KB_PTR]], [[THEN]] ], [ [[OTHER_PHI]], [[ELSE]] ]
-; OPT-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(5) [[PHI_PTR]] to ptr
-; OPT-NEXT: store volatile i32 7, ptr [[X]], align 4
+; OPT-NEXT: [[TMP0:%.*]] = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) [[PHI_PTR]])
+; OPT-NEXT: store volatile i32 7, ptr [[TMP0]], align 4
; OPT-NEXT: br i1 [[COND]], label [[ELSE]], label [[END:%.*]]
; OPT: end:
; OPT-NEXT: ret void
@@ -202,19 +201,16 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1
; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split
; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5]
-; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1
; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0
-; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
+; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base
; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7
; DAGISEL-ASM-NEXT: .LBB7_3: ; %finally
; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
-; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[6:7]
-; DAGISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
-; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; DAGISEL-ASM-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
-; DAGISEL-ASM-NEXT: flat_store_dword v[3:4], v2
+; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7]
+; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9
+; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2
; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0)
; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5]
; DAGISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3
@@ -242,11 +238,8 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
; GISEL-ASM-NEXT: .LBB7_3: ; %finally
; GISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-ASM-NEXT: s_and_b64 s[8:9], exec, s[4:5]
-; GISEL-ASM-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; GISEL-ASM-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
-; GISEL-ASM-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
-; GISEL-ASM-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
-; GISEL-ASM-NEXT: flat_store_dword v[3:4], v2
+; GISEL-ASM-NEXT: flat_store_dword v[0:1], v2
; GISEL-ASM-NEXT: s_waitcnt vmcnt(0)
; GISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GISEL-ASM-NEXT: s_cbranch_execnz .LBB7_3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll
new file mode 100644
index 00000000000000..265353675b349c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.addrspacecast.nonnull.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,DAGISEL-ASM
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=ASM,GISEL-ASM
+
+define void @local_to_flat(ptr addrspace(3) %ptr) {
+; ASM-LABEL: local_to_flat:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_mov_b64 s[4:5], src_shared_base
+; ASM-NEXT: v_mov_b32_e32 v1, s5
+; ASM-NEXT: v_mov_b32_e32 v2, 7
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+ store volatile i32 7, ptr %1, align 4
+ ret void
+}
+
+define void @private_to_flat(ptr addrspace(5) %ptr) {
+; ASM-LABEL: private_to_flat:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_mov_b64 s[4:5], src_private_base
+; ASM-NEXT: v_mov_b32_e32 v1, s5
+; ASM-NEXT: v_mov_b32_e32 v2, 7
+; ASM-NEXT: flat_store_dword v[0:1], v2
+; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
+ %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %ptr)
+ store volatile i32 7, ptr %1, align 4
+ ret void
+}
+
+define void @flat_to_local(ptr %ptr) {
+; ASM-LABEL: flat_to_local:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: v_mov_b32_e32 v1, 7
+; ASM-NEXT: ds_write_b32 v0, v1
+; ASM-NEXT: s_waitcnt lgkmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
+ %1 = call ptr addrspace(3) @llvm.amdgcn.addrspacecast.nonnull.p3.p0(ptr %ptr)
+ store volatile i32 7, ptr addrspace(3) %1, align 4
+ ret void
+}
+
+define void @flat_to_private(ptr %ptr) {
+; ASM-LABEL: flat_to_private:
+; ASM: ; %bb.0:
+; ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ASM-NEXT: v_mov_b32_e32 v1, 7
+; ASM-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; ASM-NEXT: s_waitcnt vmcnt(0)
+; ASM-NEXT: s_setpc_b64 s[30:31]
+ %1 = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr %ptr)
+ store volatile i32 7, ptr addrspace(5) %1, align 4
+ ret void
+}
+
+declare ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3))
+declare ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5))
+declare ptr addrspace(3) @llvm.amdgcn.addrspacecast.nonnull.p3.p0(ptr)
+declare ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr)
+
+declare <4 x ptr> @llvm.amdgcn.addrspacecast.nonnull.v4p0.v4p3(<4 x ptr addrspace(3)>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; DAGISEL-ASM: {{.*}}
+; GISEL-ASM: {{.*}}
>From 89f869dc6e50f1de7677366831785776893d0475 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 1 Mar 2024 11:31:21 +0100
Subject: [PATCH 5/7] clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 8 ++++++--
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +--
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index c11702b2c1c0bb..0edbbf7cb0af54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -2048,7 +2048,9 @@ static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
}
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
- // Intrinsic doesn't support vectors, also it seems that it's often difficult to prove that a vector cannot have any nulls in it so it's unclear if it's worth supporting.
+ // The intrinsic doesn't support vectors. It is also often difficult to
+ // prove that a vector cannot contain any nulls, so it's unclear whether
+ // supporting them is worthwhile.
if (I.getType()->isVectorTy())
return false;
@@ -2069,7 +2071,9 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
SmallVector<const Value *, 4> WorkList;
getUnderlyingObjects(I.getOperand(0), WorkList);
- if(!all_of(WorkList, [&](const Value* V) { return isPtrKnownNeverNull(V, *DL, *TM, SrcAS); }))
+ if (!all_of(WorkList, [&](const Value *V) {
+ return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
+ }))
return false;
IRBuilder<> B(&I);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 67e10c53fe8c49..1d90e003fea8c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1421,8 +1421,7 @@ void SITargetLowering::CollectTargetIntrinsicOperands(
case Intrinsic::amdgcn_addrspacecast_nonnull: {
// The DAG's ValueType loses the addrspaces.
// Add them as 2 extra Constant operands "from" and "to".
- unsigned SrcAS =
- I.getOperand(0)->getType()->getPointerAddressSpace();
+ unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
unsigned DstAS = I.getType()->getPointerAddressSpace();
Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
>From 5a7732be200a1b2671e4ffe8619cc7e5ac56685e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 1 Mar 2024 11:46:38 +0100
Subject: [PATCH 6/7] defer checks
---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 13 ++++++-------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++++-----
2 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 759ecc8fed88ee..4c3b983f2960df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2269,11 +2269,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const AMDGPUTargetMachine &TM
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
- // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
- // G_ADDRSPACE_CAST we need to guess.
- const bool IsKnownNonNull =
- isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS);
-
if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
return true;
@@ -2282,7 +2277,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
- if (IsKnownNonNull) {
+ // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
+ // G_ADDRSPACE_CAST we need to guess.
+ if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
// Extract low 32-bits of the pointer.
B.buildExtract(Dst, Src, 0);
MI.eraseFromParent();
@@ -2319,7 +2316,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// avoid the ptrtoint?
auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
- if (IsKnownNonNull) {
+ // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
+ // G_ADDRSPACE_CAST we need to guess.
+ if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
B.buildCopy(Dst, BuildPtr);
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1d90e003fea8c8..34c6038115329f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6658,12 +6658,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
unsigned DestAS, SrcAS;
SDValue Src;
- bool KnownNonNull;
+ bool IsNonNull = false;
if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
SrcAS = ASC->getSrcAddressSpace();
Src = ASC->getOperand(0);
DestAS = ASC->getDestAddressSpace();
- KnownNonNull = isKnownNonNull(Op, DAG, TM, SrcAS);
} else {
assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
Op.getConstantOperandVal(0) ==
@@ -6671,7 +6670,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
Src = Op->getOperand(1);
SrcAS = Op->getConstantOperandVal(2);
DestAS = Op->getConstantOperandVal(3);
- KnownNonNull = true;
+ IsNonNull = true;
}
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
@@ -6682,7 +6681,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
- if (KnownNonNull)
+ if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return Ptr;
unsigned NullVal = TM.getNullPointerValue(DestAS);
@@ -6704,7 +6703,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
- if (KnownNonNull)
+ if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
return CvtPtr;
unsigned NullVal = TM.getNullPointerValue(SrcAS);
>From 63f6d50c51ff238918f7db91cc4f4af6f852589a Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 1 Mar 2024 13:58:01 +0100
Subject: [PATCH 7/7] undef -> poison
---
.../CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index 561002d2d77bb1..bcdfb75ab1ef98 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -112,7 +112,7 @@ define void @private_alloca_to_flat(ptr %ptr) {
ret void
}
-@lds = internal unnamed_addr addrspace(3) global i8 undef, align 4
+@lds = internal unnamed_addr addrspace(3) global i8 poison, align 4
define void @knownbits_on_flat_to_priv(ptr %ptr) {
; OPT-LABEL: define void @knownbits_on_flat_to_priv(
More information about the llvm-commits
mailing list