[llvm] [AMDGPU][Draft] OOB mode - module flag (PR #160922)

Mon Mar 9 06:17:17 PDT 2026

https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/160922

>From 59a853360c32552ad4b233cc698806a01f17b383 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 26 Sep 2025 17:58:13 +0200
Subject: [PATCH 1/4] [AMDGPU][Draft] OOB mode - module flag

Draft of a solution that replaces the relaxed-buffer-oob-mode subtarget
feature with a module flag.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td               |  6 ------
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 ++++++++++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h          | 15 +++++++++++++--
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index eaa1870f4be28..d158a10c01738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -128,12 +128,6 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
   "Hardware supports unaligned local and region loads and stores"
 >;
 
-def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode",
-  "RelaxedBufferOOBMode",
-  "true",
-  "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB"
->;
-
 def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
   "HasApertureRegs",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..fdfc2f9a079f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1095,6 +1095,15 @@ bool AMDGPUTargetMachine::splitModule(
   return true;
 }
 
+static unsigned getOOBModeFromModule(const Module *M) {
+  unsigned Mode = 0;
+  if (M)
+    if (Metadata *MD = M->getModuleFlag("amdgpu.oob.mode"))
+      if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
+        Mode = CI->getZExtValue();
+  return Mode;
+}
+
 //===----------------------------------------------------------------------===//
 // GCN Target Machine (SI+)
 //===----------------------------------------------------------------------===//
@@ -1125,6 +1134,7 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const {
   }
 
   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+  I->setOOBMode(getOOBModeFromModule(F.getParent()));
 
   return I.get();
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a54d6651c25c1..cf50a0c6d088b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -78,7 +78,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool BackOffBarrier = false;
   bool UnalignedScratchAccess = false;
   bool UnalignedAccessMode = false;
-  bool RelaxedBufferOOBMode = false;
   bool HasApertureRegs = false;
   bool SupportsXNACK = false;
   bool KernargPreload = false;
@@ -291,6 +290,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable = false;
 
+  // Module flag features.
+
+  // Out-Of-Bounds mode flags.
+  // Setting a bit enables a relaxed mode that disables strict OOB guarantees;
+  // an out-of-bounds access may cause a neighboring in-bounds access to be
+  // treated as OOB.
+  // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding check strict.
+  // OOBMode{0} - untyped buffers (buffer_load)
+  // OOBMode{1} - typed buffers (tbuffer_load)
+  unsigned OOBMode = 0;
+
 private:
   SIInstrInfo InstrInfo;
   SITargetLowering TLInfo;
@@ -646,7 +656,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return UnalignedAccessMode;
   }
 
-  bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
+  bool hasRelaxedBufferOOBMode() const { return OOBMode == 1; // TODO: Use named const/enum.}
+  void setOOBMode(unsigned val) { OOBMode = val; }
 
   bool hasApertureRegs() const {
     return HasApertureRegs;

>From 5394c17828838b46dcb3ba3563cd47f84ae40e81 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Mar 2026 13:56:30 +0100
Subject: [PATCH 2/4] Address review comments

---
 llvm/docs/AMDGPUUsage.rst                     | 55 +++++++++++++++
 llvm/lib/IR/Verifier.cpp                      | 14 ++++
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  5 --
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 22 +++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         | 35 ++++++----
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 13 ++--
 llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll  |  4 +-
 .../AMDGPU/merge-vectors.ll                   |  8 ++-
 .../AMDGPU/unaligned-buffer.ll                | 15 +++--
 .../Verifier/AMDGPU/module-flag-oob-mode.ll   | 67 +++++++++++++++++++
 10 files changed, 195 insertions(+), 43 deletions(-)
 create mode 100644 llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 036b4461ec06d..6dae7c661a747 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -812,6 +812,61 @@ For example:
 
      =============== ============================ ==================================================
 
+.. _amdgpu-module-flags:
+
+Module Flags
+------------
+
+AMDGPU-specific behaviour can be controlled via LLVM module flags (see
+`Module Flags Metadata
+<https://llvm.org/docs/LangRef.html#module-flags-metadata>`_ in the language
+reference). These flags are set by frontends and are
+consumed by the AMDGPU backend during code generation.
+
+.. table:: AMDGPU Module Flags
+   :name: amdgpu-module-flags-table
+
+   =================== =========== ===== ================================================
+   Flag Name           Type        Merge Description
+                                   Behaviour
+   =================== =========== ===== ================================================
+   ``amdgpu.oob.mode`` ``i32``     Min   Bitmask controlling relaxation of out-of-bounds
+                                         (OOB) buffer access semantics.  When a bit is
+                                         **cleared** (strict mode, the default), the
+                                         backend ensures that misaligned buffer accesses
+                                         that straddle an OOB boundary are not merged,
+                                         preserving correct per-byte robustness guarantees
+                                         (e.g. required by Vulkan ``robustBufferAccess2``).
+                                         When a bit is **set** (relaxed mode), the backend
+                                         may merge such accesses for performance, which
+                                         is safe for workloads that do not
+                                         require strict OOB byte-level isolation.
+
+                                         Bits:
+
+                                         * ``0x1`` — relax OOB handling for **untyped**
+                                           buffer instructions (``buffer_load`` /
+                                           ``buffer_store``).
+                                         * ``0x2`` — relax OOB handling for **typed**
+                                           buffer instructions (``tbuffer_load`` /
+                                           ``tbuffer_store``).
+
+                                         The ``Min`` merge behaviour takes the numeric
+                                         minimum when modules are linked (not a bitwise
+                                         AND): ``0`` linked with any value yields strict
+                                         semantics, but ``0x1`` linked with ``0x2``
+                                         yields ``0x1`` (untyped accesses stay relaxed).
+
+                                         .. note::
+
+                                           Frontends that require misaligned-access
+                                           merging for performance must set this flag
+                                           (bits ``0x3``). Frontends that require strict
+                                           per-byte OOB guarantees (e.g. Vulkan
+                                           ``robustBufferAccess2``) should leave the
+                                           flag absent or set to ``0``.
+   =================== =========== ===== ================================================
+
 .. _amdgpu-target-id:
 
 Target ID
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3784ee00811f8..3b5b462c5bed2 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2070,6 +2070,20 @@ Verifier::visitModuleFlag(const MDNode *Op,
           "SemanticInterposition metadata requires constant integer argument");
   }
 
+  if (ID->getString() == "amdgpu.oob.mode") {
+    Check(MFB == Module::Min,
+          "'amdgpu.oob.mode' module flag must use 'min' merge behaviour");
+    auto *Value = mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+    Check(Value,
+          "'amdgpu.oob.mode' module flag must have a constant integer value");
+    if (Value) {
+      // 64-bit mask: a 32-bit one would leave bits 32..63 unchecked after '~'.
+      constexpr uint64_t KnownBits = 0x3; // UntypedBuffer | TypedBuffer
+      Check((Value->getZExtValue() & ~KnownBits) == 0,
+            "'amdgpu.oob.mode' module flag has unknown bits set");
+    }
+  }
+
   if (ID->getString() == "CG Profile") {
     for (const MDOperand &MDO : cast<MDNode>(Op->getOperand(2))->operands())
       visitModuleFlagCGProfileEntry(MDO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ac561327a79d4..6ad730d51405a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -155,11 +155,6 @@ defm UnalignedDSAccess : AMDGPUSubtargetFeature<"unaligned-ds-access",
   "Hardware supports unaligned local and region loads and stores"
 >;
 
-defm RelaxedBufferOOBMode : AMDGPUSubtargetFeature<"relaxed-buffer-oob-mode",
-  "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially"
-  "cause an adjacent access to be treated as if it were also OOB"
->;
-
 defm DX10ClampAndIEEEMode : AMDGPUSubtargetFeature<"dx10-clamp-and-ieee-mode",
   "Target has DX10_CLAMP and IEEE_MODE kernel descriptor bits"
 >;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 16fc06ed27c2a..ae12c5c0c7775 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -90,6 +90,7 @@
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
@@ -1188,15 +1189,6 @@ bool AMDGPUTargetMachine::splitModule(
   return true;
 }
 
-static unsigned getOOBModeFromModule(const Module *M) {
-  unsigned Mode = 0;
-  if (M)
-    if (Metadata *MD = M->getModuleFlag("amdgpu.oob.mode"))
-      if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
-        Mode = CI->getZExtValue();
-  return Mode;
-}
-
 //===----------------------------------------------------------------------===//
 // GCN Target Machine (SI+)
 //===----------------------------------------------------------------------===//
@@ -1209,13 +1201,23 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                    CodeGenOptLevel OL, bool JIT)
     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
 
+/// Returns the value of the "amdgpu.oob.mode" module flag, or 0 if absent.
+/// See AMDGPUOOBMode for the bit definitions.
+static unsigned getOOBModeFromModule(const Module &M) {
+  const auto *Flag =
+      mdconst::dyn_extract_or_null<ConstantInt>(M.getModuleFlag("amdgpu.oob.mode"));
+  return Flag ? static_cast<unsigned>(Flag->getZExtValue()) : 0u;
+}
+
 const TargetSubtargetInfo *
 GCNTargetMachine::getSubtargetImpl(const Function &F) const {
   StringRef GPU = getGPUName(F);
   StringRef FS = getFeatureString(F);
 
+  unsigned OOBMode = getOOBModeFromModule(*F.getParent());
   SmallString<128> SubtargetKey(GPU);
   SubtargetKey.append(FS);
+  SubtargetKey.append((",oob=" + Twine(OOBMode)).str());
 
   auto &I = SubtargetMap[SubtargetKey];
   if (!I) {
@@ -1224,10 +1226,10 @@ GCNTargetMachine::getSubtargetImpl(const Function &F) const {
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
+    I->setOOBMode(OOBMode);
   }
 
   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
-  I->setOOBMode(getOOBModeFromModule(F.getParent()));
 
   return I.get();
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index fda5867c0f928..609c7af8582ab 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -30,6 +30,22 @@ namespace llvm {
 
 class GCNTargetMachine;
 
+/// Bit flags for the "amdgpu.oob.mode" LLVM module flag.
+/// These control per-module relaxation of out-of-bounds (OOB) buffer access
+/// semantics. The flag uses Module::Min merge behaviour: linking takes the
+/// numeric minimum (not a bitwise AND), so a strict module (0) always wins.
+namespace AMDGPUOOBMode {
+enum : unsigned {
+  /// Relax OOB handling for untyped buffer instructions (buffer_load /
+  /// buffer_store). When set, the backend may merge misaligned accesses across
+  /// an OOB boundary, which would be incorrect under strict Vulkan robustness.
+  UntypedBuffer = 0x1,
+  /// Relax OOB handling for typed buffer instructions (tbuffer_load /
+  /// tbuffer_store).
+  TypedBuffer = 0x2,
+};
+} // namespace AMDGPUOOBMode
+
 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                            public AMDGPUSubtarget {
 public:
@@ -70,6 +86,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool DynamicVGPR = false;
   bool DynamicVGPRBlockSize32 = false;
   bool ScalarizeGlobal = false;
+  unsigned OOBMode = 0;
 
   /// The maximum number of instructions that may be placed within an S_CLAUSE,
   /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
@@ -80,17 +97,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool ATTRIBUTE = DEFAULT;
 #include "AMDGPUGenSubtargetInfo.inc"
 
-  // Module flag features.
-
-  // Out-Of-Bounds mode flags.
-  // Setting a bit enables a relaxed mode that disables strict OOB guarantees;
-  // an out-of-bounds access may cause a neighboring in-bounds access to be
-  // treated as OOB.
-  // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding check strict.
-  // OOBMode{0} - untyped buffers (buffer_load)
-  // OOBMode{1} - typed buffers (tbuffer_load)
-  unsigned OOBMode = 0;
-
 private:
   SIInstrInfo InstrInfo;
   SITargetLowering TLInfo;
@@ -327,10 +333,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
 
-  bool hasRelaxedBufferOOBMode() const { return OOBMode == 1; // TODO: Use named const/enum.}
-  void setOOBMode(unsigned val) { OOBMode = val; }
   bool isTgSplitEnabled() const { return EnableTgSplit; }
 
+  bool hasRelaxedBufferOOBMode() const {
+    return OOBMode & AMDGPUOOBMode::UntypedBuffer;
+  }
+  void setOOBMode(unsigned Val) { OOBMode = Val; }
+
   bool isCuModeEnabled() const { return EnableCuMode; }
 
   bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1ee43ab8d8172..1e27afe1589cc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2205,12 +2205,13 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
            Subtarget->hasUnalignedBufferAccessEnabled();
   }
 
-  // Ensure robust out-of-bounds guarantees for buffer accesses are met if
-  // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
-  // out-of-bounds behavior, but in the edge case where an access starts
-  // out-of-bounds and then enter in-bounds, the entire access would be treated
-  // as out-of-bounds. Prevent misaligned memory accesses by requiring the
-  // natural alignment of buffer accesses.
+  // Ensure robust out-of-bounds guarantees for buffer accesses are met when the
+  // "amdgpu.oob.mode" module flag has not enabled relaxed untyped-buffer OOB
+  // semantics. Normally hardware will ensure proper out-of-bounds behavior, but
+  // in the edge case where an access starts out-of-bounds and then enters
+  // in-bounds, the entire access would be treated as out-of-bounds.  Prevent
+  // misaligned memory accesses by requiring the natural alignment of buffer
+  // accesses.
   if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
       AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
       AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
index b518d5e738031..8954ea4e9fd10 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
 
-; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the underaligned loads and stores get split.
+; Check that in strict OOB mode for buffers ("amdgpu.oob.mode" module flag not set) the underaligned loads and stores get split.
 ; FIXME: The loads/stores do not get split (extend amdgpu-lower-buffer-fat-pointers?).
 
 define amdgpu_ps void @split_underaligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
@@ -52,7 +52,7 @@ entry:
   ret void
 }
 
-; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the naturally aligned loads and stores do not get split.
+; Check that in strict OOB mode for buffers ("amdgpu.oob.mode" module flag not set) the naturally aligned loads and stores do not get split.
 
 define amdgpu_ps void @do_not_split_aligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
 ; CHECK-LABEL: do_not_split_aligned_load:
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index d6b51039d5b44..6d201b0e96db7 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -1,6 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s \
+; RUN:   | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
+; RUN: cp %s %t.relaxed.ll
+; RUN: printf '\n!llvm.module.flags = !{!0}\n!0 = !{i32 8, !"amdgpu.oob.mode", i32 1}\n' >> %t.relaxed.ll
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %t.relaxed.ll \
+; RUN:   | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
 
 define amdgpu_kernel void @merge_v2i32_v2i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
index d590a4a403fb7..08f7375beef09 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefix=OOB-STRICT %s
-; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=OOB-RELAXED %s
+; RUN: cp %s %t.relaxed.ll
+; RUN: printf '\n!llvm.module.flags = !{!0}\n!0 = !{i32 8, !"amdgpu.oob.mode", i32 1}\n' >> %t.relaxed.ll
+; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %t.relaxed.ll \
+; RUN:   | FileCheck --check-prefixes=OOB-RELAXED %s
 
-; The test checks that relaxed-buffer-oob-mode allows merging loads even if the target load is not naturally aligned.
+; The test checks that the "amdgpu.oob.mode" module flag (UntypedBuffer bit)
+; allows merging loads even if the target load is not naturally aligned.
 
 define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0 {
 ;
@@ -20,7 +24,7 @@ define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0
 ; OOB-STRICT-NEXT:    ret void
 ;
 ; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_4(
-; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
 ; OOB-RELAXED-NEXT:  [[ENTRY:.*:]]
 ; OOB-RELAXED-NEXT:    [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
 ; OOB-RELAXED-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
@@ -42,7 +46,8 @@ entry:
   ret void
 }
 
-; The test checks that strict OOB mode (relaxed-buffer-oob-mode not set) allows merging loads if the target load is naturally aligned.
+; The test checks that strict OOB mode ("amdgpu.oob.mode" absent or 0) allows
+; merging loads when the target load is naturally aligned.
 
 define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0 {
 ; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_16(
@@ -57,7 +62,7 @@ define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0
 ; OOB-STRICT-NEXT:    ret void
 ;
 ; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_16(
-; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0]] {
+; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
 ; OOB-RELAXED-NEXT:  [[ENTRY:.*:]]
 ; OOB-RELAXED-NEXT:    [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
 ; OOB-RELAXED-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
diff --git a/llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll b/llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll
new file mode 100644
index 0000000000000..86bff5c5b3220
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/module-flag-oob-mode.ll
@@ -0,0 +1,67 @@
+; Tests for IR verifier enforcement of the "amdgpu.oob.mode" module flag.
+; The flag must use Module::Min (i32 8) merge behaviour, carry a constant
+; integer value, and have no bits set outside the currently defined mask (0x3).
+
+; RUN: split-file %s %t
+
+; --- Negative: wrong merge behaviour (Override=4 instead of Min=8) ---
+; RUN: not llvm-as %t/wrong-behavior.ll --disable-output 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=WRONG-BEHAVIOR
+
+; --- Negative: non-integer value ---
+; RUN: not llvm-as %t/non-integer.ll --disable-output 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=NON-INTEGER
+
+; --- Negative: unknown bits set ---
+; RUN: not llvm-as %t/unknown-bits.ll --disable-output 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=UNKNOWN-BITS
+
+; --- Positive: absent flag (no error expected) ---
+; RUN: llvm-as %t/absent.ll --disable-output 2>&1 | count 0
+
+; --- Positive: valid relaxed value 0x1 ---
+; RUN: llvm-as %t/valid-0x1.ll --disable-output 2>&1 | count 0
+
+; --- Positive: valid relaxed value 0x3 ---
+; RUN: llvm-as %t/valid-0x3.ll --disable-output 2>&1 | count 0
+
+; --- Positive: explicit strict value 0x0 ---
+; RUN: llvm-as %t/valid-0x0.ll --disable-output 2>&1 | count 0
+
+; WRONG-BEHAVIOR: 'amdgpu.oob.mode' module flag must use 'min' merge behaviour
+; NON-INTEGER:    invalid value for 'min' module flag (expected constant non-negative integer)
+; UNKNOWN-BITS:   'amdgpu.oob.mode' module flag has unknown bits set
+
+;--- wrong-behavior.ll
+; Override (i32 4) is not Min (i32 8).
+!0 = !{i32 4, !"amdgpu.oob.mode", i32 1}
+!llvm.module.flags = !{!0}
+
+;--- non-integer.ll
+; Min behaviour but float value instead of integer.
+!0 = !{i32 8, !"amdgpu.oob.mode", float 1.0}
+!llvm.module.flags = !{!0}
+
+;--- unknown-bits.ll
+; Bit 2 (0x4) is not defined in AMDGPUOOBMode.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 4}
+!llvm.module.flags = !{!0}
+
+;--- absent.ll
+; No "amdgpu.oob.mode" flag at all -- should be accepted.
+define void @f() { ret void }
+
+;--- valid-0x1.ll
+; UntypedBuffer bit only.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 1}
+!llvm.module.flags = !{!0}
+
+;--- valid-0x3.ll
+; Both UntypedBuffer and TypedBuffer bits.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 3}
+!llvm.module.flags = !{!0}
+
+;--- valid-0x0.ll
+; Explicit strict mode.
+!0 = !{i32 8, !"amdgpu.oob.mode", i32 0}
+!llvm.module.flags = !{!0}

>From 6138d0ed6311d9433801e4ee0d9d7b4a9266ccb1 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Mar 2026 14:09:13 +0100
Subject: [PATCH 3/4] Fix formatter

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ae12c5c0c7775..3c03fa8f227d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1204,8 +1204,8 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
 /// Returns the value of the "amdgpu.oob.mode" module flag, or 0 if absent.
 /// See AMDGPUOOBMode for the bit definitions.
 static unsigned getOOBModeFromModule(const Module &M) {
-  const auto *Flag =
-      mdconst::dyn_extract_or_null<ConstantInt>(M.getModuleFlag("amdgpu.oob.mode"));
+  const auto *Flag = mdconst::dyn_extract_or_null<ConstantInt>(
+      M.getModuleFlag("amdgpu.oob.mode"));
   return Flag ? static_cast<unsigned>(Flag->getZExtValue()) : 0u;
 }
 

>From 8ee63d1d610b00cbf888dcfc114fe4ee01a90754 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Mar 2026 14:16:29 +0100
Subject: [PATCH 4/4] Fix doc

---
 llvm/docs/AMDGPUUsage.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6dae7c661a747..1c43ac5a4340a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -828,7 +828,6 @@ consumed by the AMDGPU backend during code generation.
 
    =================== =========== ===== ================================================
    Flag Name           Type        Merge Description
-                                   Behaviour
    =================== =========== ===== ================================================
    ``amdgpu.oob.mode`` ``i32``     Min   Bitmask controlling relaxation of out-of-bounds
                                          (OOB) buffer access semantics.  When a bit is