[llvm] [AMDGPU] Stop combining arbitrary offsets into PAL relocs (PR #80034)
Jay Foad via llvm-commits
llvm-commits@lists.llvm.org
Tue Jan 30 09:05:03 PST 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/80034
PAL uses ELF REL (not RELA) relocations, which can only store a 32-bit
addend in the relocated instruction, even for reloc types like
R_AMDGPU_ABS32_HI that require the upper 32 bits of a 64-bit address
calculation to be correct. This means it is not safe to fold an arbitrary
offset into a GlobalAddressSDNode, so stop doing that.

In practice this is mostly a problem for small negative offsets, which do
not work as expected because PAL treats the 32-bit addend as unsigned.
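To make the failure mode concrete, here is a minimal standalone sketch,
assuming a hypothetical 64-bit symbol address (this is not PAL's actual
resolver code): it resolves the "hi" half of an address once with a full
64-bit RELA-style addend and once with a 32-bit addend zero-extended from
the instruction, REL-style.

// Minimal sketch, not PAL's actual resolver: compare RELA-style
// resolution (full signed 64-bit addend) with REL-style resolution
// (32 addend bits stored in the instruction, treated as unsigned).
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Sym = 0x100000000; // hypothetical symbol address
  int64_t Offset = -8;        // small negative offset folded by ISel

  // RELA-style: the full signed 64-bit addend is applied.
  uint32_t GoodHi = uint32_t((Sym + Offset) >> 32);  // 0x00000000

  // REL-style: only 32 addend bits fit in the instruction, and PAL
  // zero-extends them, turning -8 into +0xFFFFFFF8.
  uint32_t Addend = uint32_t(Offset);                // 0xFFFFFFF8
  uint32_t BadHi = uint32_t((Sym + Addend) >> 32);   // 0x00000001

  printf("RELA hi: 0x%08x, REL hi: 0x%08x\n", GoodHi, BadHi);
}

With the combine disabled, the offset is instead applied with separate add
instructions after the relocated address is materialized, as the updated
rel32.ll checks in the patch below show.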
From 4d86442758e05cdf474e3157b1b513e2b3c9d639 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Tue, 30 Jan 2024 16:55:59 +0000
Subject: [PATCH] [AMDGPU] Stop combining arbitrary offsets into PAL relocs
PAL uses ELF REL (not RELA) relocations, which can only store a 32-bit
addend in the relocated instruction, even for reloc types like
R_AMDGPU_ABS32_HI that require the upper 32 bits of a 64-bit address
calculation to be correct. This means it is not safe to fold an arbitrary
offset into a GlobalAddressSDNode, so stop doing that.

In practice this is mostly a problem for small negative offsets, which do
not work as expected because PAL treats the 32-bit addend as unsigned.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++++++++
llvm/test/CodeGen/AMDGPU/global-constant.ll | 20 ++++++++++----------
llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll | 5 ++---
llvm/test/CodeGen/AMDGPU/rel32.ll | 19 ++++++++++++++++---
4 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7ab062bcc4da7..e6ca3bd0d9f34 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7133,6 +7133,17 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // Subtargets that use ELF REL relocations (instead of RELA) can only store a
+ // 32-bit addend in the instruction, so it is not safe to allow offset folding
+ // which can create arbitrary 64-bit addends. (This is only a problem for
+ // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
+ // the high 32 bits of the addend.)
+ //
+ // This should be kept in sync with how HasRelocationAddend is initialized in
+ // the constructor of ELFAMDGPUAsmBackend.
+ if (!Subtarget->isAmdHsaOS())
+ return false;
+
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index 336f012ec80e4..38b9c5df7faa1 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -19,14 +19,14 @@
; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+12
; MESA uses absolute relocations.
-; GCN-MESA: s_add_u32 s2, s4, private1@abs32@lo
-; GCN-MESA: s_addc_u32 s3, s5, private1@abs32@hi
+; GCN-MESA: s_add_u32 s2, private1@abs32@lo, s4
+; GCN-MESA: s_addc_u32 s3, private1@abs32@hi, s5
; PAL uses absolute relocations.
-; GCN-PAL: s_add_u32 s2, s4, private1@abs32@lo
-; GCN-PAL: s_addc_u32 s3, s5, private1@abs32@hi
-; GCN-PAL: s_add_u32 s4, s4, private2@abs32@lo
-; GCN-PAL: s_addc_u32 s5, s5, private2@abs32@hi
+; GCN-PAL: s_add_u32 s2, private1@abs32@lo, s4
+; GCN-PAL: s_addc_u32 s3, private1@abs32@hi, s5
+; GCN-PAL: s_add_u32 s4, private2@abs32@lo, s4
+; GCN-PAL: s_addc_u32 s5, private2@abs32@hi, s5
; R600-LABEL: private_test
define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
@@ -44,13 +44,13 @@ define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+12
-; GCN-MESA: s_mov_b32 s1, available_externally@abs32@hi+4
-; GCN-MESA: s_mov_b32 s0, available_externally@abs32@lo+4
+; GCN-MESA: s_mov_b32 s1, available_externally@abs32@hi
+; GCN-MESA: s_mov_b32 s0, available_externally@abs32@lo
; R600-LABEL: available_externally_test
-; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi+4
-; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo+4
+; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi
+; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo
define amdgpu_kernel void @available_externally_test(ptr addrspace(1) %out) {
%ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1
%val = load i32, ptr addrspace(4) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
index c7cefb37281ed..41e8762311306 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -315,10 +315,9 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(ptr ad
; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
; SI: s_getpc_b64
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+20
+; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+4
; SI: s_addc_u32
-; SI-DAG: s_load_dwordx4
-; SI-DAG: s_load_dwordx4
+; SI-DAG: s_load_dwordx8
; SI-DAG: s_load_dwordx2
; SI-DAG: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll
index aa2d269523d80..59d64f3ad2b5e 100644
--- a/llvm/test/CodeGen/AMDGPU/rel32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rel32.ll
@@ -6,9 +6,22 @@
; CHECK-LABEL: rel32_neg_offset:
; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
-; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+12
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], -16
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], -1
define ptr addrspace(4) @rel32_neg_offset() {
- %r = getelementptr i32, ptr addrspace(4) @g, i64 -2
+ %r = getelementptr i32, ptr addrspace(4) @g, i64 -4
+ ret ptr addrspace(4) %r
+}
+
+; CHECK-LABEL: rel32_large_offset:
+; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+12
+; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], 0x502f9000
+; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], 9
+define ptr addrspace(4) @rel32_large_offset() {
+ %r = getelementptr i32, ptr addrspace(4) @g, i64 10000000000
ret ptr addrspace(4) %r
}