[llvm] [AMDGPU][Draft] OOB mode - module flag (PR #160922)
Piotr Sobczak via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 9 06:17:17 PDT 2026
https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/160922
>From 59a853360c32552ad4b233cc698806a01f17b383 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 26 Sep 2025 17:58:13 +0200
Subject: [PATCH 1/4] [AMDGPU][Draft] OOB mode - module flag
Draft of a solution that replaces the relaxed-buffer-oob-mode subtarget
feature with a module flag ("amdgpu.oob.mode").
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 6 ------
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 ++++++++++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 15 +++++++++++++--
3 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index eaa1870f4be28..d158a10c01738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -128,12 +128,6 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;
-def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode",
- "RelaxedBufferOOBMode",
- "true",
- "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB"
->;
-
def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..fdfc2f9a079f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1095,6 +1095,15 @@ bool AMDGPUTargetMachine::splitModule(
return true;
}
+static unsigned getOOBModeFromModule(const Module *M) {
+ unsigned Mode = 0;
+ if (M)
+ if (Metadata *MD = M->getModuleFlag("amdgpu.oob.mode"))
+ if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
+ Mode = CI->getZExtValue();
+ return Mode;
+}
+
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -1125,6 +1134,7 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const {
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+ I->setOOBMode(getOOBModeFromModule(F.getParent()));
return I.get();
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a54d6651c25c1..cf50a0c6d088b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -78,7 +78,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
- bool RelaxedBufferOOBMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;
@@ -291,6 +290,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
+ // Module flag features.
+
+ // Out-Of-Bounds mode flags.
+ // Setting a bit enables a relaxed mode that disables strict OOB guarantees;
+ // an out-of-bounds access may cause a neighboring in-bounds access to be
+ // treated as OOB.
+ // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding check strict.
+ // OOBMode{0} - untyped buffers (buffer_load)
+ // OOBMode{1} - typed buffers (tbuffer_load)
+ unsigned OOBMode = 0;
+
private:
SIInstrInfo InstrInfo;
SITargetLowering TLInfo;
@@ -646,7 +656,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return UnalignedAccessMode;
}
- bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
+ bool hasRelaxedBufferOOBMode() const { return OOBMode == 1; // TODO: Use named const/enum.}
+ void setOOBMode(unsigned val) { OOBMode = val; }
bool hasApertureRegs() const {
return HasApertureRegs;
>From 5394c17828838b46dcb3ba3563cd47f84ae40e81 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Mar 2026 13:56:30 +0100
Subject: [PATCH 2/4] Address review comments
---
llvm/docs/AMDGPUUsage.rst | 55 +++++++++++++++
llvm/lib/IR/Verifier.cpp | 14 ++++
llvm/lib/Target/AMDGPU/AMDGPU.td | 5 --
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 22 +++---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 35 ++++++----
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 ++--
llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll | 4 +-
.../AMDGPU/merge-vectors.ll | 8 ++-
.../AMDGPU/unaligned-buffer.ll | 15 +++--
.../Verifier/AMDGPU/module-flag-oob-mode.ll | 67 +++++++++++++++++++
10 files changed, 195 insertions(+), 43 deletions(-)
create mode 100644 llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 036b4461ec06d..6dae7c661a747 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -812,6 +812,61 @@ For example:
=============== ============================ ==================================================
+.. _amdgpu-module-flags:
+
+Module Flags
+------------
+
+AMDGPU-specific behaviour can be controlled via LLVM module flags (see
+`Module Flags Metadata
+<https://llvm.org/docs/LangRef.html#module-flags-metadata>`_ in the language
+reference). These flags are set by frontends and are
+consumed by the AMDGPU backend during code generation.
+
+.. table:: AMDGPU Module Flags
+ :name: amdgpu-module-flags-table
+
+ =================== =========== ===== ================================================
+ Flag Name Type Merge Description
+ Behaviour
+ =================== =========== ===== ================================================
+ ``amdgpu.oob.mode`` ``i32`` Min Bitmask controlling relaxation of out-of-bounds
+ (OOB) buffer access semantics. When a bit is
+ **cleared** (strict mode, the default), the
+ backend ensures that misaligned buffer accesses
+ that straddle an OOB boundary are not merged,
+ preserving correct per-byte robustness guarantees
+ (e.g. required by Vulkan ``robustBufferAccess2``).
+ When a bit is **set** (relaxed mode), the backend
+ may merge such accesses for performance, which
+ is safe for workloads that do not
+ require strict OOB byte-level isolation.
+
+ Bits:
+
+ * ``0x1`` — relax OOB handling for **untyped**
+ buffer instructions (``buffer_load`` /
+ ``buffer_store``).
+ * ``0x2`` — relax OOB handling for **typed**
+ buffer instructions (``tbuffer_load`` /
+ ``tbuffer_store``).
+
+ The ``Min`` merge behaviour means that when
+ modules are linked, the numerically smallest
+ value wins (not a bitwise AND: ``0x1`` and
+ ``0x2`` merge to ``0x1``); a strict module
+ (value ``0``) always forces strict semantics.
+
+ .. note::
+
+ Frontends that require misaligned-access
+ merging for performance must set this flag
+ (bits ``0x3``). Frontends that require strict
+ per-byte OOB guarantees (e.g. Vulkan
+ ``robustBufferAccess2``) should leave the
+ flag absent or set to ``0``.
+ =================== =========== ===== ================================================
+
.. _amdgpu-target-id:
Target ID
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3784ee00811f8..3b5b462c5bed2 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2070,6 +2070,20 @@ Verifier::visitModuleFlag(const MDNode *Op,
"SemanticInterposition metadata requires constant integer argument");
}
+ if (ID->getString() == "amdgpu.oob.mode") {
+ Check(MFB == Module::Min,
+ "'amdgpu.oob.mode' module flag must use 'min' merge behaviour");
+ ConstantInt *Value =
+ mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+ Check(Value,
+ "'amdgpu.oob.mode' module flag must have a constant integer value");
+ if (Value) {
+ constexpr uint32_t KnownBits = 0x3; // UntypedBuffer | TypedBuffer
+ Check((Value->getZExtValue() & ~KnownBits) == 0,
+ "'amdgpu.oob.mode' module flag has unknown bits set");
+ }
+ }
+
if (ID->getString() == "CG Profile") {
for (const MDOperand &MDO : cast<MDNode>(Op->getOperand(2))->operands())
visitModuleFlagCGProfileEntry(MDO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ac561327a79d4..6ad730d51405a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -155,11 +155,6 @@ defm UnalignedDSAccess : AMDGPUSubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;
-defm RelaxedBufferOOBMode : AMDGPUSubtargetFeature<"relaxed-buffer-oob-mode",
- "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially"
- "cause an adjacent access to be treated as if it were also OOB"
->;
-
defm DX10ClampAndIEEEMode : AMDGPUSubtargetFeature<"dx10-clamp-and-ieee-mode",
"Target has DX10_CLAMP and IEEE_MODE kernel descriptor bits"
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 16fc06ed27c2a..ae12c5c0c7775 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -90,6 +90,7 @@
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
@@ -1188,15 +1189,6 @@ bool AMDGPUTargetMachine::splitModule(
return true;
}
-static unsigned getOOBModeFromModule(const Module *M) {
- unsigned Mode = 0;
- if (M)
- if (Metadata *MD = M->getModuleFlag("amdgpu.oob.mode"))
- if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
- Mode = CI->getZExtValue();
- return Mode;
-}
-
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
@@ -1209,13 +1201,23 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
CodeGenOptLevel OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+/// Returns the value of the "amdgpu.oob.mode" module flag, or 0 if absent.
+/// See AMDGPUOOBMode for the bit definitions.
+static unsigned getOOBModeFromModule(const Module &M) {
+ const auto *Flag =
+ mdconst::dyn_extract_or_null<ConstantInt>(M.getModuleFlag("amdgpu.oob.mode"));
+ return Flag ? static_cast<unsigned>(Flag->getZExtValue()) : 0u;
+}
+
const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
+ unsigned OOBMode = getOOBModeFromModule(*F.getParent());
SmallString<128> SubtargetKey(GPU);
SubtargetKey.append(FS);
+ SubtargetKey.append((",oob=" + Twine(OOBMode)).str());
auto &I = SubtargetMap[SubtargetKey];
if (!I) {
@@ -1224,10 +1226,10 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
+ I->setOOBMode(OOBMode);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
- I->setOOBMode(getOOBModeFromModule(F.getParent()));
return I.get();
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index fda5867c0f928..609c7af8582ab 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -30,6 +30,22 @@ namespace llvm {
class GCNTargetMachine;
+/// Bit flags for the "amdgpu.oob.mode" LLVM module flag.
+/// These control per-module relaxation of out-of-bounds (OOB) buffer access
+/// semantics. Using Module::Min merge behaviour, a stricter module always
+/// overrides a more relaxed one at link time.
+namespace AMDGPUOOBMode {
+enum : unsigned {
+ /// Relax OOB handling for untyped buffer instructions (buffer_load /
+ /// buffer_store). When set, the backend may merge misaligned accesses across
+ /// an OOB boundary, which would be incorrect under strict Vulkan robustness.
+ UntypedBuffer = 0x1,
+ /// Relax OOB handling for typed buffer instructions (tbuffer_load /
+ /// tbuffer_store).
+ TypedBuffer = 0x2,
+};
+} // namespace AMDGPUOOBMode
+
class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
public:
@@ -70,6 +86,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool DynamicVGPR = false;
bool DynamicVGPRBlockSize32 = false;
bool ScalarizeGlobal = false;
+ unsigned OOBMode = 0;
/// The maximum number of instructions that may be placed within an S_CLAUSE,
/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
@@ -80,17 +97,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool ATTRIBUTE = DEFAULT;
#include "AMDGPUGenSubtargetInfo.inc"
- // Module flag features.
-
- // Out-Of-Bounds mode flags.
- // Setting a bit enables a relaxed mode that disables strict OOB guarantees;
- // an out-of-bounds access may cause a neighboring in-bounds access to be
- // treated as OOB.
- // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding check strict.
- // OOBMode{0} - untyped buffers (buffer_load)
- // OOBMode{1} - typed buffers (tbuffer_load)
- unsigned OOBMode = 0;
-
private:
SIInstrInfo InstrInfo;
SITargetLowering TLInfo;
@@ -327,10 +333,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
- bool hasRelaxedBufferOOBMode() const { return OOBMode == 1; // TODO: Use named const/enum.}
- void setOOBMode(unsigned val) { OOBMode = val; }
bool isTgSplitEnabled() const { return EnableTgSplit; }
+ bool hasRelaxedBufferOOBMode() const {
+ return OOBMode & AMDGPUOOBMode::UntypedBuffer;
+ }
+ void setOOBMode(unsigned Val) { OOBMode = Val; }
+
bool isCuModeEnabled() const { return EnableCuMode; }
bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1ee43ab8d8172..1e27afe1589cc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2205,12 +2205,13 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
Subtarget->hasUnalignedBufferAccessEnabled();
}
- // Ensure robust out-of-bounds guarantees for buffer accesses are met if
- // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
- // out-of-bounds behavior, but in the edge case where an access starts
- // out-of-bounds and then enter in-bounds, the entire access would be treated
- // as out-of-bounds. Prevent misaligned memory accesses by requiring the
- // natural alignment of buffer accesses.
+ // Ensure robust out-of-bounds guarantees for buffer accesses are met when the
+ // "amdgpu.oob.mode" module flag has not enabled relaxed untyped-buffer OOB
+ // semantics. Normally hardware will ensure proper out-of-bounds behavior, but
+ // in the edge case where an access starts out-of-bounds and then enters
+ // in-bounds, the entire access would be treated as out-of-bounds. Prevent
+ // misaligned memory accesses by requiring the natural alignment of buffer
+ // accesses.
if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
index b518d5e738031..8954ea4e9fd10 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
@@ -2,7 +2,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
-; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the underaligned loads and stores get split.
+; Check that in strict OOB mode for buffers ("amdgpu.oob.mode" module flag not set) the underaligned loads and stores get split.
; FIXME: The loads/stores do not get split (extend amdgpu-lower-buffer-fat-pointers?).
define amdgpu_ps void @split_underaligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
@@ -52,7 +52,7 @@ entry:
ret void
}
-; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the naturally aligned loads and stores do not get split.
+; Check that in strict OOB mode for buffers ("amdgpu.oob.mode" module flag not set) the naturally aligned loads and stores do not get split.
define amdgpu_ps void @do_not_split_aligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
; CHECK-LABEL: do_not_split_aligned_load:
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index d6b51039d5b44..6d201b0e96db7 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s \
+; RUN: | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
+; RUN: cp %s %t.relaxed.ll
+; RUN: printf '\n!llvm.module.flags = !{!0}\n!0 = !{i32 8, !"amdgpu.oob.mode", i32 1}\n' >> %t.relaxed.ll
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %t.relaxed.ll \
+; RUN: | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
define amdgpu_kernel void @merge_v2i32_v2i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
index d590a4a403fb7..08f7375beef09 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefix=OOB-STRICT %s
-; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=OOB-RELAXED %s
+; RUN: cp %s %t.relaxed.ll
+; RUN: printf '\n!llvm.module.flags = !{!0}\n!0 = !{i32 8, !"amdgpu.oob.mode", i32 1}\n' >> %t.relaxed.ll
+; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %t.relaxed.ll \
+; RUN: | FileCheck --check-prefixes=OOB-RELAXED %s
-; The test checks that relaxed-buffer-oob-mode allows merging loads even if the target load is not naturally aligned.
+; The test checks that the "amdgpu.oob.mode" module flag (UntypedBuffer bit)
+; allows merging loads even if the target load is not naturally aligned.
define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0 {
;
@@ -20,7 +24,7 @@ define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0
; OOB-STRICT-NEXT: ret void
;
; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_4(
-; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
@@ -42,7 +46,8 @@ entry:
ret void
}
-; The test checks that strict OOB mode (relaxed-buffer-oob-mode not set) allows merging loads if the target load is naturally aligned.
+; The test checks that strict OOB mode ("amdgpu.oob.mode" absent or 0) allows
+; merging loads when the target load is naturally aligned.
define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0 {
; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_16(
@@ -57,7 +62,7 @@ define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0
; OOB-STRICT-NEXT: ret void
;
; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_16(
-; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0]] {
+; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
diff --git a/llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll b/llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll
new file mode 100644
index 0000000000000..86bff5c5b3220
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll
@@ -0,0 +1,67 @@
+; Tests for IR verifier enforcement of the "amdgpu.oob.mode" module flag.
+; The flag must use Module::Min (i32 8) merge behaviour, carry a constant
+; integer value, and have no bits set outside the currently defined mask (0x3).
+
+; RUN: split-file %s %t
+
+; --- Negative: wrong merge behaviour (Override=4 instead of Min=8) ---
+; RUN: not llvm-as %t/wrong-behavior.ll --disable-output 2>&1 \
+; RUN: | FileCheck %s --check-prefix=WRONG-BEHAVIOR
+
+; --- Negative: non-integer value ---
+; RUN: not llvm-as %t/non-integer.ll --disable-output 2>&1 \
+; RUN: | FileCheck %s --check-prefix=NON-INTEGER
+
+; --- Negative: unknown bits set ---
+; RUN: not llvm-as %t/unknown-bits.ll --disable-output 2>&1 \
+; RUN: | FileCheck %s --check-prefix=UNKNOWN-BITS
+
+; --- Positive: absent flag (no error expected) ---
+; RUN: llvm-as %t/absent.ll --disable-output 2>&1 | count 0
+
+; --- Positive: valid relaxed value 0x1 ---
+; RUN: llvm-as %t/valid-0x1.ll --disable-output 2>&1 | count 0
+
+; --- Positive: valid relaxed value 0x3 ---
+; RUN: llvm-as %t/valid-0x3.ll --disable-output 2>&1 | count 0
+
+; --- Positive: explicit strict value 0x0 ---
+; RUN: llvm-as %t/valid-0x0.ll --disable-output 2>&1 | count 0
+
+; WRONG-BEHAVIOR: 'amdgpu.oob.mode' module flag must use 'min' merge behaviour
+; NON-INTEGER: invalid value for 'min' module flag (expected constant non-negative integer)
+; UNKNOWN-BITS: 'amdgpu.oob.mode' module flag has unknown bits set
+
+;--- wrong-behavior.ll
+; Override (i32 4) is not Min (i32 8).
+!0 = !{i32 4, !"amdgpu.oob.mode", i32 1}
+!llvm.module.flags = !{!0}
+
+;--- non-integer.ll
+; Min behaviour but float value instead of integer.
+!0 = !{i32 8, !"amdgpu.oob.mode", float 1.0}
+!llvm.module.flags = !{!0}
+
+;--- unknown-bits.ll
+; Bit 2 (0x4) is not defined in AMDGPUOOBMode.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 4}
+!llvm.module.flags = !{!0}
+
+;--- absent.ll
+; No "amdgpu.oob.mode" flag at all -- should be accepted.
+define void @f() { ret void }
+
+;--- valid-0x1.ll
+; UntypedBuffer bit only.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 1}
+!llvm.module.flags = !{!0}
+
+;--- valid-0x3.ll
+; Both UntypedBuffer and TypedBuffer bits.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 3}
+!llvm.module.flags = !{!0}
+
+;--- valid-0x0.ll
+; Explicit strict mode.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 0}
+!llvm.module.flags = !{!0}
>From 6138d0ed6311d9433801e4ee0d9d7b4a9266ccb1 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Mar 2026 14:09:13 +0100
Subject: [PATCH 3/4] Fix formatter
---
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ae12c5c0c7775..3c03fa8f227d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1204,8 +1204,8 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
/// Returns the value of the "amdgpu.oob.mode" module flag, or 0 if absent.
/// See AMDGPUOOBMode for the bit definitions.
static unsigned getOOBModeFromModule(const Module &M) {
- const auto *Flag =
- mdconst::dyn_extract_or_null<ConstantInt>(M.getModuleFlag("amdgpu.oob.mode"));
+ const auto *Flag = mdconst::dyn_extract_or_null<ConstantInt>(
+ M.getModuleFlag("amdgpu.oob.mode"));
return Flag ? static_cast<unsigned>(Flag->getZExtValue()) : 0u;
}
>From 8ee63d1d610b00cbf888dcfc114fe4ee01a90754 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Mar 2026 14:16:29 +0100
Subject: [PATCH 4/4] Fix doc
---
llvm/docs/AMDGPUUsage.rst | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6dae7c661a747..1c43ac5a4340a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -828,7 +828,6 @@ consumed by the AMDGPU backend during code generation.
=================== =========== ===== ================================================
Flag Name Type Merge Description
- Behaviour
=================== =========== ===== ================================================
``amdgpu.oob.mode`` ``i32`` Min Bitmask controlling relaxation of out-of-bounds
(OOB) buffer access semantics. When a bit is
More information about the llvm-commits
mailing list