[llvm] [AMDGPU] Stop combining arbitrary offsets into PAL relocs (PR #80034)
Jay Foad via llvm-commits
llvm-commits@lists.llvm.org
Tue Jan 30 09:05:03 PST 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/80034
PAL uses ELF REL (not RELA) relocations, which can only store a 32-bit
addend in the relocated instruction, even for reloc types like
R_AMDGPU_ABS32_HI that require the upper 32 bits of a 64-bit address
calculation to be correct. This means it is not safe to fold an arbitrary
offset into a GlobalAddressSDNode, so stop doing that.

In practice this is mostly a problem for small negative offsets, which do
not work as expected because PAL treats the 32-bit addend as unsigned.
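To make the failure mode concrete, here is a minimal standalone sketch,
assuming a hypothetical 64-bit symbol address (this is not PAL's actual
resolver code): it resolves the "hi" half of an address once with a full
64-bit RELA-style addend and once with a 32-bit addend zero-extended from
the instruction, REL-style.

// Minimal sketch, not PAL's actual resolver: compare RELA-style
// resolution (full signed 64-bit addend) with REL-style resolution
// (32 addend bits stored in the instruction, treated as unsigned).
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Sym = 0x100000000; // hypothetical symbol address
  int64_t Offset = -8;        // small negative offset folded by ISel

  // RELA-style: the full signed 64-bit addend is applied.
  uint32_t GoodHi = uint32_t((Sym + Offset) >> 32);  // 0x00000000

  // REL-style: only 32 addend bits fit in the instruction, and PAL
  // zero-extends them, turning -8 into +0xFFFFFFF8.
  uint32_t Addend = uint32_t(Offset);                // 0xFFFFFFF8
  uint32_t BadHi = uint32_t((Sym + Addend) >> 32);   // 0x00000001

  printf("RELA hi: 0x%08x, REL hi: 0x%08x\n", GoodHi, BadHi);
}

With the combine disabled, the offset is instead applied with separate add
instructions after the relocated address is materialized, as the updated
rel32.ll checks in the patch below show.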
From 4d86442758e05cdf474e3157b1b513e2b3c9d639 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Tue, 30 Jan 2024 16:55:59 +0000
Subject: [PATCH] [AMDGPU] Stop combining arbitrary offsets into PAL relocs
PAL uses ELF REL (not RELA) relocations, which can only store a 32-bit
addend in the relocated instruction, even for reloc types like
R_AMDGPU_ABS32_HI that require the upper 32 bits of a 64-bit address
calculation to be correct. This means it is not safe to fold an arbitrary
offset into a GlobalAddressSDNode, so stop doing that.

In practice this is mostly a problem for small negative offsets, which do
not work as expected because PAL treats the 32-bit addend as unsigned.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++++++++
llvm/test/CodeGen/AMDGPU/global-constant.ll | 20 ++++++++++----------
llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll | 5 ++---
llvm/test/CodeGen/AMDGPU/rel32.ll | 19 ++++++++++++++++---
4 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7ab062bcc4da7..e6ca3bd0d9f34 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7133,6 +7133,17 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // Subtargets that use ELF REL relocations (instead of RELA) can only store a
+ // 32-bit addend in the instruction, so it is not safe to allow offset folding
+ // which can create arbitrary 64-bit addends. (This is only a problem for
+ // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
+ // the high 32 bits of the addend.)
+ //
+ // This should be kept in sync with how HasRelocationAddend is initialized in
+ // the constructor of ELFAMDGPUAsmBackend.
+ if (!Subtarget->isAmdHsaOS())
+ return false;
+
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index 336f012ec80e4..38b9c5df7faa1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -19,14 +19,14 @@
; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+12
; MESA uses absolute relocations.
-; GCN-MESA: s_add_u32 s2, s4, private1@abs32@lo
-; GCN-MESA: s_addc_u32 s3, s5, private1@abs32@hi
+; GCN-MESA: s_add_u32 s2, private1@abs32@lo, s4
+; GCN-MESA: s_addc_u32 s3, private1@abs32@hi, s5
; PAL uses absolute relocations.
-; GCN-PAL: s_add_u32 s2, s4, private1@abs32@lo
-; GCN-PAL: s_addc_u32 s3, s5, private1@abs32@hi
-; GCN-PAL: s_add_u32 s4, s4, private2@abs32@lo
-; GCN-PAL: s_addc_u32 s5, s5, private2@abs32@hi
+; GCN-PAL: s_add_u32 s2, private1@abs32@lo, s4
+; GCN-PAL: s_addc_u32 s3, private1@abs32@hi, s5
+; GCN-PAL: s_add_u32 s4, private2@abs32@lo, s4
+; GCN-PAL: s_addc_u32 s5, private2@abs32@hi, s5
; R600-LABEL: private_test
define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
@@ -44,13 +44,13 @@ define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+12
-; GCN-MESA: s_mov_b32 s1, available_externally@abs32@hi+4
-; GCN-MESA: s_mov_b32 s0, available_externally@abs32@lo+4
+; GCN-MESA: s_mov_b32 s1, available_externally@abs32@hi
+; GCN-MESA: s_mov_b32 s0, available_externally@abs32@lo
; R600-LABEL: available_externally_test
-; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi+4
-; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo+4
+; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi
+; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo
define amdgpu_kernel void @available_externally_test(ptr addrspace(1) %out) {
%ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1
%val = load i32, ptr addrspace(4) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
index c7cefb37281ed..41e8762311306 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -315,10 +315,9 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(ptr ad
; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
; SI: s_getpc_b64
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+20
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+4
; SI: s_addc_u32
-; SI-DAG: s_load_dwordx4
-; SI-DAG: s_load_dwordx4
+; SI-DAG: s_load_dwordx8
; SI-DAG: s_load_dwordx2
; SI-DAG: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll
index aa2d269523d80..59d64f3ad2b5e 100644
--- a/llvm/test/CodeGen/AMDGPU/rel32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rel32.ll
@@ -6,9 +6,22 @@
; CHECK-LABEL: rel32_neg_offset:
; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
-; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+12
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], -16
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], -1
define ptr addrspace(4) @rel32_neg_offset() {
- %r = getelementptr i32, ptr addrspace(4) @g, i64 -2
+ %r = getelementptr i32, ptr addrspace(4) @g, i64 -4
+ ret ptr addrspace(4) %r
+}
+
+; CHECK-LABEL: rel32_large_offset:
+; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+12
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], 0x502f9000
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], 9
+define ptr addrspace(4) @rel32_large_offset() {
+ %r = getelementptr i32, ptr addrspace(4) @g, i64 10000000000
ret ptr addrspace(4) %r
}