[llvm] [AMDGPU] Add target feature require-naturally-aligned-buffer-access (PR #115479)
Piotr Sobczak via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 21 07:40:57 PST 2024
https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/115479
>From 9099a051d9b3ce7d9eb36ed3def65d3b1a4dd338 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 8 Nov 2024 12:21:43 +0100
Subject: [PATCH 1/4] [AMDGPU] Add target feature
require-naturally-aligned-buffer-access
Add a new target feature require-naturally-aligned-buffer-access to guarantee robust
out-of-bounds behavior. When set, it will disallow buffer accesses with alignment
lower than natural alignment.
This is needed to specifically address the edge case where an access starts
out-of-bounds and then enters in-bounds, as the hardware would treat the entire access
as out-of-bounds. This is normally not needed for most users (hence the target
feature), but at least one graphics device extension (VK_EXT_robustness2) has
very strict requirements - in-bounds accesses must return correct value,
and out-of-bounds accesses must return zero.
The direct result of the patch is that, when the new target feature is set, a buffer
access at a negative address will not be merged with one at a positive address.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 ++++
.../AMDGPU/unaligned-buffer.ll | 80 +++++++++++++++++++
4 files changed, 105 insertions(+)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index bde61a1f7e58df..8a184a92f016e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -119,6 +119,12 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;
+def FeatureRequireNaturallyAlignedBufferAccess : SubtargetFeature<"require-naturally-aligned-buffer-access",
+ "RequireNaturallyAlignedBufferAccess",
+ "true",
+ "Requires natural alignment of buffer accesses"
+>;
+
def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6ff964077d8fd0..541e3c0f399e30 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -76,6 +76,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
+ bool RequireNaturallyAlignedBufferAccess = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;
@@ -600,6 +601,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return UnalignedAccessMode;
}
+ bool requiresNaturallyAlignedBufferAccess() const {
+ return RequireNaturallyAlignedBufferAccess;
+ }
+
bool hasApertureRegs() const {
return HasApertureRegs;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 419414e5bd993d..d4321eb682dd9b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1840,6 +1840,20 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
Subtarget->hasUnalignedBufferAccessEnabled();
}
+ // Check natural alignment of buffer if the target requires it. This is needed
+ // only if robust out-of-bounds guarantees are needed. Normally hardware will
+ // ensure proper out-of-bounds behavior, but in the edge case where an access
+// starts out-of-bounds and then enters in-bounds, the entire access would be
+ // treated as out-of-bounds. Requiring the natural alignment avoids the
+ // problem.
+ if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
+ AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
+ AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
+ if (Subtarget->requiresNaturallyAlignedBufferAccess() &&
+ Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
+ return false;
+ }
+
// Smaller than dword value must be aligned.
if (Size < 32)
return false;
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
new file mode 100644
index 00000000000000..0d8a98feecb82b
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+require-naturally-aligned-buffer-access -S -o - %s | FileCheck --check-prefix=ALIGNED %s
+; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=UNALIGNED %s
+
+; The test checks that require-naturally-aligned-buffer-access target feature prevents merging loads if the target load would not be naturally aligned.
+
+define amdgpu_kernel void @merge_align_4(ptr addrspace(7) nocapture %p, ptr addrspace(7) nocapture %p2) #0 {
+;
+; ALIGNED-LABEL: define amdgpu_kernel void @merge_align_4(
+; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) #[[ATTR0:[0-9]+]] {
+; ALIGNED-NEXT: [[ENTRY:.*:]]
+; ALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; ALIGNED-NEXT: [[LD_M8:%.*]] = load i32, ptr addrspace(7) [[GEP_M8]], align 4
+; ALIGNED-NEXT: [[GEP_M4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -4
+; ALIGNED-NEXT: [[LD_M4:%.*]] = load i32, ptr addrspace(7) [[GEP_M4]], align 4
+; ALIGNED-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 0
+; ALIGNED-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(7) [[GEP_0]], align 4
+; ALIGNED-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i64 4
+; ALIGNED-NEXT: [[LD_4:%.*]] = load i32, ptr addrspace(7) [[GEP_4]], align 4
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: define amdgpu_kernel void @merge_align_4(
+; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) {
+; UNALIGNED-NEXT: [[ENTRY:.*:]]
+; UNALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; UNALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
+; UNALIGNED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; UNALIGNED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; UNALIGNED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; UNALIGNED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; UNALIGNED-NEXT: ret void
+;
+entry:
+ %gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
+ %ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 4
+ %gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
+ %ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
+ %gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld_0 = load i32, ptr addrspace(7) %gep_0, align 4
+ %gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
+ %ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
+ ret void
+}
+
+; The test checks that require-naturally-aligned-buffer-access target feature does not prevent merging loads if the target load would be naturally aligned.
+
+define amdgpu_kernel void @merge_align_16(ptr addrspace(7) nocapture %p, ptr addrspace(7) nocapture %p2) #0 {
+; ALIGNED-LABEL: define amdgpu_kernel void @merge_align_16(
+; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) #[[ATTR0]] {
+; ALIGNED-NEXT: [[ENTRY:.*:]]
+; ALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; ALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
+; ALIGNED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; ALIGNED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; ALIGNED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; ALIGNED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: define amdgpu_kernel void @merge_align_16(
+; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) {
+; UNALIGNED-NEXT: [[ENTRY:.*:]]
+; UNALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; UNALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
+; UNALIGNED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; UNALIGNED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; UNALIGNED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; UNALIGNED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; UNALIGNED-NEXT: ret void
+;
+entry:
+ %gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
+ %ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 16
+ %gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
+ %ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
+ %gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld_0 = load i32, ptr addrspace(7) %gep_0, align 8
+ %gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
+ %ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
+ ret void
+}
>From b78e15ba5b6d969a3b5ccd7e8c653ec944ff26bf Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 8 Nov 2024 17:49:28 +0100
Subject: [PATCH 2/4] Rename function.
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 541e3c0f399e30..56a027febcac0a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -601,7 +601,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return UnalignedAccessMode;
}
- bool requiresNaturallyAlignedBufferAccess() const {
+ bool hasRequireNaturallyAlignedBufferAccess() const {
return RequireNaturallyAlignedBufferAccess;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d4321eb682dd9b..adc9c9224000d2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1849,7 +1849,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
- if (Subtarget->requiresNaturallyAlignedBufferAccess() &&
+ if (Subtarget->hasRequireNaturallyAlignedBufferAccess() &&
Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
return false;
}
>From adbd8b3c09046607a983c56ff7b2dac788d2ae73 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Thu, 21 Nov 2024 15:51:25 +0100
Subject: [PATCH 3/4] Simplify test
---
.../LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
index 0d8a98feecb82b..980fc739780d10 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
@@ -4,10 +4,10 @@
; The test checks that require-naturally-aligned-buffer-access target feature prevents merging loads if the target load would not be naturally aligned.
-define amdgpu_kernel void @merge_align_4(ptr addrspace(7) nocapture %p, ptr addrspace(7) nocapture %p2) #0 {
+define amdgpu_kernel void @merge_align_4(ptr addrspace(7) nocapture %p) #0 {
;
; ALIGNED-LABEL: define amdgpu_kernel void @merge_align_4(
-; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) #[[ATTR0:[0-9]+]] {
+; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; ALIGNED-NEXT: [[ENTRY:.*:]]
; ALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; ALIGNED-NEXT: [[LD_M8:%.*]] = load i32, ptr addrspace(7) [[GEP_M8]], align 4
@@ -20,7 +20,7 @@ define amdgpu_kernel void @merge_align_4(ptr addrspace(7) nocapture %p, ptr addr
; ALIGNED-NEXT: ret void
;
; UNALIGNED-LABEL: define amdgpu_kernel void @merge_align_4(
-; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) {
+; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]]) {
; UNALIGNED-NEXT: [[ENTRY:.*:]]
; UNALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; UNALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
@@ -44,9 +44,9 @@ entry:
; The test checks that require-naturally-aligned-buffer-access target feature does not prevent merging loads if the target load would be naturally aligned.
-define amdgpu_kernel void @merge_align_16(ptr addrspace(7) nocapture %p, ptr addrspace(7) nocapture %p2) #0 {
+define amdgpu_kernel void @merge_align_16(ptr addrspace(7) nocapture %p) #0 {
; ALIGNED-LABEL: define amdgpu_kernel void @merge_align_16(
-; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) #[[ATTR0]] {
+; ALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]]) #[[ATTR0]] {
; ALIGNED-NEXT: [[ENTRY:.*:]]
; ALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; ALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
@@ -57,7 +57,7 @@ define amdgpu_kernel void @merge_align_16(ptr addrspace(7) nocapture %p, ptr add
; ALIGNED-NEXT: ret void
;
; UNALIGNED-LABEL: define amdgpu_kernel void @merge_align_16(
-; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]], ptr addrspace(7) nocapture [[P2:%.*]]) {
+; UNALIGNED-SAME: ptr addrspace(7) nocapture [[P:%.*]]) {
; UNALIGNED-NEXT: [[ENTRY:.*:]]
; UNALIGNED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; UNALIGNED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
>From 52b3d959cc5b34f25b7a9ff6c1591bc907dae1a4 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Thu, 21 Nov 2024 16:03:44 +0100
Subject: [PATCH 4/4] Add codegen test for problems with underaligned memory
accesses
---
llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll | 51 ++++++++++++++++++++
1 file changed, 51 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
new file mode 100644
index 00000000000000..7331cbd25c8157
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+require-naturally-aligned-buffer-access < %s | FileCheck %s
+
+; Check that with require-naturally-aligned-buffer-access attribute, the underaligned loads and stores get split.
+; FIXME: The loads/stores do not get split (extend amdgpu-lower-buffer-fat-pointers?).
+
+define amdgpu_ps void @split_underaligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
+; CHECK-LABEL: split_underaligned_load:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s9
+; CHECK-NEXT: s_mov_b32 s15, s8
+; CHECK-NEXT: s_mov_b32 s14, s7
+; CHECK-NEXT: s_mov_b32 s13, s6
+; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; CHECK-NEXT: s_mov_b32 s12, s5
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; CHECK-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld = load i64, ptr addrspace(7) %gep, align 4
+
+ %gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
+ store i64 %ld, ptr addrspace(7) %gep2, align 4
+ ret void
+}
+
+; Check that even with require-naturally-aligned-buffer-access attribute, the naturally aligned loads and stores do not get split.
+
+define amdgpu_ps void @do_not_split_aligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
+; CHECK-LABEL: do_not_split_aligned_load:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s9
+; CHECK-NEXT: s_mov_b32 s15, s8
+; CHECK-NEXT: s_mov_b32 s14, s7
+; CHECK-NEXT: s_mov_b32 s13, s6
+; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; CHECK-NEXT: s_mov_b32 s12, s5
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; CHECK-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld = load i64, ptr addrspace(7) %gep, align 8
+
+ %gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
+ store i64 %ld, ptr addrspace(7) %gep2, align 8
+ ret void
+}
More information about the llvm-commits
mailing list