[llvm] [clang] [RFC][AMDGPU] Add OpenCL-specific fence address space masks (PR #78572)

Thu Jan 18 04:59:18 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-clang-codegen

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

Note: please only review the last commit!

Using MMRAs, implement `builtin_amdgcn_fence_opencl` to allow device libs to emit fences that only target one or more address spaces, instead of fencing all address spaces at once.


---

Patch is 231.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78572.diff


44 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1) 
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+27) 
- (modified) clang/lib/CodeGen/CodeGenFunction.h (+2) 
- (modified) clang/lib/Sema/SemaChecking.cpp (+6-1) 
- (added) clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp (+108) 
- (added) llvm/docs/MemoryModelRelaxationAnnotations.rst (+476) 
- (modified) llvm/docs/Reference.rst (+4) 
- (modified) llvm/include/llvm/Analysis/VectorUtils.h (+1-1) 
- (modified) llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h (+9) 
- (modified) llvm/include/llvm/CodeGen/MachineFunction.h (+2-1) 
- (modified) llvm/include/llvm/CodeGen/MachineInstr.h (+37-10) 
- (modified) llvm/include/llvm/CodeGen/MachineInstrBuilder.h (+32-14) 
- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+12) 
- (modified) llvm/include/llvm/IR/FixedMetadataKinds.def (+1) 
- (added) llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h (+101) 
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+10-1) 
- (modified) llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp (+1) 
- (modified) llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp (+3-1) 
- (modified) llvm/lib/CodeGen/MIRPrinter.cpp (+7) 
- (modified) llvm/lib/CodeGen/MachineFunction.cpp (+2-2) 
- (modified) llvm/lib/CodeGen/MachineInstr.cpp (+37-12) 
- (modified) llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (+9) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+1-1) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+11-6) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp (+7) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp (+2) 
- (modified) llvm/lib/IR/CMakeLists.txt (+1) 
- (modified) llvm/lib/IR/Instruction.cpp (+13-1) 
- (added) llvm/lib/IR/MemoryModelRelaxationAnnotations.cpp (+201) 
- (modified) llvm/lib/IR/Verifier.cpp (+27) 
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+23-8) 
- (modified) llvm/lib/Transforms/Scalar/GVNHoist.cpp (+2-1) 
- (modified) llvm/lib/Transforms/Utils/FunctionComparator.cpp (+5) 
- (modified) llvm/lib/Transforms/Utils/Local.cpp (+14-1) 
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll (+49) 
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll (+1428) 
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll (+1188) 
- (added) llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-private.ll (+1188) 
- (added) llvm/test/CodeGen/AMDGPU/mmra.ll (+254) 
- (added) llvm/test/Verifier/mmra-allowed.ll (+33) 
- (added) llvm/test/Verifier/mmra.ll (+37) 
- (modified) llvm/unittests/CodeGen/MachineInstrTest.cpp (+51) 
- (modified) llvm/unittests/IR/CMakeLists.txt (+1) 
- (added) llvm/unittests/IR/MemoryModelRelaxationAnnotationsTest.cpp (+34) 


``````````diff

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..b9cd21bbfe514c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -68,6 +68,7 @@ BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
 BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
 BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
 BUILTIN(__builtin_amdgcn_fence, "vUicC*", "n")
+BUILTIN(__builtin_amdgcn_fence_opencl, "vUiUicC*", "n")
 BUILTIN(__builtin_amdgcn_groupstaticsize, "Ui", "n")
 
 BUILTIN(__builtin_amdgcn_atomic_inc32, "UZiUZiD*UZiUicC*", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index de48b15645b1ab..050539b3816e54 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -55,6 +55,7 @@
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MatrixBuilder.h"
+#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -17859,6 +17860,25 @@ llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
   return Arg;
 }
 
+void CodeGenFunction::AddAMDGCNAddressSpaceMMRA(llvm::Instruction *Inst,
+                                                llvm::Value *ASMask) {
+  constexpr const char *Tag = "opencl-fence-mem";
+
+  uint64_t Mask = cast<llvm::ConstantInt>(ASMask)->getZExtValue();
+  if (Mask == 0)
+    return;
+
+  // 3 bits can be set: local, global, image in that order.
+  MMRAMetadata MMRAs;
+  if (Mask & (1 << 0))
+    MMRAs.addTag(Tag, "local");
+  if (Mask & (1 << 1))
+    MMRAs.addTag(Tag, "global");
+  if (Mask & (1 << 2))
+    MMRAs.addTag(Tag, "image");
+  Inst->setMetadata(LLVMContext::MD_MMRA, MMRAs.getAsMD(getLLVMContext()));
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -18348,6 +18368,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                             EmitScalarExpr(E->getArg(1)), AO, SSID);
     return Builder.CreateFence(AO, SSID);
   }
+  case AMDGPU::BI__builtin_amdgcn_fence_opencl: {
+    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(1)),
+                            EmitScalarExpr(E->getArg(2)), AO, SSID);
+    FenceInst *Fence = Builder.CreateFence(AO, SSID);
+    AddAMDGCNAddressSpaceMMRA(Fence, EmitScalarExpr(E->getArg(0)));
+    return Fence;
+  }
   case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 143ad64e8816b1..93a11c379e82c9 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4401,6 +4401,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitHexagonBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitRISCVBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
                                     ReturnValueSlot ReturnValue);
+
+  void AddAMDGCNAddressSpaceMMRA(llvm::Instruction *Inst, llvm::Value *ASMask);
   void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope,
                                llvm::AtomicOrdering &AO,
                                llvm::SyncScope::ID &SSID);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ace3e386988f00..0a98f5504ff4f6 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5098,6 +5098,10 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
     OrderIndex = 0;
     ScopeIndex = 1;
     break;
+  case AMDGPU::BI__builtin_amdgcn_fence_opencl:
+    OrderIndex = 1;
+    ScopeIndex = 2;
+    break;
   default:
     return false;
   }
@@ -5120,7 +5124,8 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   switch (static_cast<llvm::AtomicOrderingCABI>(Ord)) {
   case llvm::AtomicOrderingCABI::relaxed:
   case llvm::AtomicOrderingCABI::consume:
-    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_fence)
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_fence ||
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_fence_opencl)
       return Diag(ArgExpr->getBeginLoc(),
                   diag::warn_atomic_op_has_invalid_memory_order)
              << 0 << ArgExpr->getSourceRange();
diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp
new file mode 100644
index 00000000000000..b4ac7ca8c178d9
--- /dev/null
+++ b/clang/test/CodeGenCXX/builtin-amdgcn-fence-opencl.cpp
@@ -0,0 +1,108 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 3
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
+// RUN:   -triple=amdgcn-amd-amdhsa -Qn -mcode-object-version=none | FileCheck %s
+
+#define LOCAL_MASK (1 << 0)
+#define GLOBAL_MASK (1 << 1)
+#define IMAGE_MASK (1 << 2)
+
+//.
+// CHECK: @.str = private unnamed_addr addrspace(4) constant [10 x i8] c"workgroup\00", align 1
+// CHECK: @.str.1 = private unnamed_addr addrspace(4) constant [6 x i8] c"agent\00", align 1
+// CHECK: @.str.2 = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
+//.
+// CHECK-LABEL: define dso_local void @_Z10test_localv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META1:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("agent") acquire, !mmra [[META1]]
+// CHECK-NEXT:    fence seq_cst, !mmra [[META1]]
+// CHECK-NEXT:    fence syncscope("agent") acq_rel, !mmra [[META1]]
+// CHECK-NEXT:    fence syncscope("workgroup") release, !mmra [[META1]]
+// CHECK-NEXT:    ret void
+//
+void test_local() {
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK,__ATOMIC_ACQUIRE, "agent");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK,__ATOMIC_SEQ_CST, "");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, 4, "agent");
+
+  __builtin_amdgcn_fence_opencl(LOCAL_MASK, 3, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @_Z11test_globalv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META2:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("agent") acquire, !mmra [[META2]]
+// CHECK-NEXT:    fence seq_cst, !mmra [[META2]]
+// CHECK-NEXT:    fence syncscope("agent") acq_rel, !mmra [[META2]]
+// CHECK-NEXT:    fence syncscope("workgroup") release, !mmra [[META2]]
+// CHECK-NEXT:    ret void
+//
+void test_global() {
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK,__ATOMIC_ACQUIRE, "agent");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK,__ATOMIC_SEQ_CST, "");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, 4, "agent");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK, 3, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @_Z10test_imagev(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("agent") acquire, !mmra [[META3]]
+// CHECK-NEXT:    fence seq_cst, !mmra [[META2]]
+// CHECK-NEXT:    fence syncscope("agent") acq_rel, !mmra [[META3]]
+// CHECK-NEXT:    fence syncscope("workgroup") release, !mmra [[META3]]
+// CHECK-NEXT:    ret void
+//
+void test_image() {
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK,__ATOMIC_ACQUIRE, "agent");
+
+  __builtin_amdgcn_fence_opencl(GLOBAL_MASK,__ATOMIC_SEQ_CST, "");
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, 4, "agent");
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK, 3, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @_Z10test_mixedv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
+// CHECK-NEXT:    fence syncscope("workgroup") seq_cst, !mmra [[META5]]
+// CHECK-NEXT:    ret void
+//
+void test_mixed() {
+
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK | GLOBAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+  __builtin_amdgcn_fence_opencl(IMAGE_MASK | GLOBAL_MASK | LOCAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
+
+  __builtin_amdgcn_fence_opencl(0xFF,__ATOMIC_SEQ_CST, "workgroup");
+}
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1]] = !{!"opencl-fence-mem", !"local"}
+// CHECK: [[META2]] = !{!"opencl-fence-mem", !"global"}
+// CHECK: [[META3]] = !{!"opencl-fence-mem", !"image"}
+// CHECK: [[META4]] = !{[[META3]], [[META2]]}
+// CHECK: [[META5]] = !{[[META3]], [[META2]], [[META1]]}
+//.
diff --git a/llvm/docs/MemoryModelRelaxationAnnotations.rst b/llvm/docs/MemoryModelRelaxationAnnotations.rst
new file mode 100644
index 00000000000000..e83e365a3a57de
--- /dev/null
+++ b/llvm/docs/MemoryModelRelaxationAnnotations.rst
@@ -0,0 +1,476 @@
+===================================
+Memory Model Relaxation Annotations
+===================================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+Memory Model Relaxation Annotations (MMRAs) are target-defined properties
+on instructions that can be used to selectively relax constraints placed
+by the memory model. For example:
+
+* The use of ``VulkanMemoryModel`` in a SPIRV program allows certain
+  memory operations to be reordered across ``acquire`` or ``release``
+  operations.
+* OpenCL APIs expose primitives to only fence a specific set of address
+  spaces, carrying that information to the backend can enable the
+  use of faster synchronization instructions, rather than fencing all
+  address spaces.
+
+MMRAs offer an opt-in system for targets to relax the default LLVM
+memory model.
+As such, they are attached to an operation using LLVM metadata which
+can always be dropped without affecting correctness.
+
+Definitions
+===========
+
+memory operation
+    A load, a store, an atomic, or a function call that is marked as
+    accessing memory.
+
+synchronizing operation
+    An instruction that synchronizes memory with other threads (e.g.
+    an atomic or a fence).
+
+tag
+    Metadata attached to a memory or synchronizing operation
+    that represents some target-defined property regarding memory
+    synchronization.
+
+    An operation may have multiple tags that each represent a different
+    property.
+
+    A tag is composed of a pair of metadata nodes:
+
+    * a *prefix* string.
+    * a *suffix* integer or string.
+
+    In LLVM IR, the pair is represented using a metadata tuple.
+    In other cases (comments, documentation, etc.), we may use the
+    ``prefix:suffix`` notation.
+    For example:
+
+    .. code-block::
+      :caption: Example: Tags in Metadata
+
+      !0 = !{!"scope", !"workgroup"}  # scope:workgroup
+      !1 = !{!"scope", !"device"}     # scope:device
+      !2 = !{!"scope", !"system"}     # scope:system
+      !3 = !{!"sync-as", i32 2}  # sync-as:2
+      !4 = !{!"sync-as", i32 1}  # sync-as:1
+      !5 = !{!"sync-as", i32 0}  # sync-as:0
+
+    .. note::
+
+      The only semantics relevant to the optimizer is the
+      "compatibility" relation defined below. All other
+      semantics are target defined.
+
+    Tags can also be organised in lists to allow operations
+    to specify all of the tags they belong to. Such a list
+    is referred to as a "set of tags".
+
+    .. code-block::
+      :caption: Example: Set of Tags in Metadata
+
+      !0 = !{!"scope", !"workgroup"}
+      !1 = !{!"sync-as", !"private"}
+      !2 = !{!0, !2}
+
+    .. note::
+
+      If an operation does not have MMRA metadata, it's treated as if
+      it has an empty list (``!{}``) of tags.
+
+    Note that it is not an error if a tag is not recognized by the
+    instruction it is applied to, or by the current target.
+    Such tags are simply ignored.
+
+    Both synchronizing operations and memory operations can have
+    zero or more tags attached to them using the ``!mmra`` syntax.
+
+    For the sake of readability in examples below,
+    we use a (non-functional) short syntax to represent MMMRA metadata:
+
+    .. code-block::
+      :caption: Short Syntax Example
+
+      store %ptr1 # foo:bar
+      store %ptr1 !mmra !{!"foo", !"bar"}
+
+    These two notations can be used in this document and are strictly
+    equivalent. However, only the second version is functional.
+
+compatibility
+    Two sets of tags are said to be *compatible* iff, for every unique
+    tag prefix P present in at least one set:
+
+    - the other set contains no tag with prefix P, or
+    - at least one tag with prefix P is common to both sets.
+
+    The above definition implies that an empty set is always compatible
+    with any other set. This is an important property as it ensures that
+    if a transform drops the metadata on an operation, it can never affect
+    correctness. In other words, the memory model cannot be relaxed further
+    by deleting metadata from instructions.
+
+.. _HappensBefore:
+
+The *happens-before* Relation
+==============================
+
+Compatibility checks can be used to opt out of the *happens-before* relation
+established between two instructions.
+
+Ordering
+    When two instructions' metadata are not compatible, any program order
+    between them are not in *happens-before*.
+
+    For example, consider two tags ``foo:bar`` and
+    ``foo:baz`` exposed by a target:
+
+    .. code-block::
+
+       A: store %ptr1                 # foo:bar
+       B: store %ptr2                 # foo:baz
+       X: store atomic release %ptr3  # foo:bar
+
+    In the above figure, ``A`` is compatible with ``X``, and hence ``A``
+    happens-before ``X``. But ``B`` is not compatible with
+    ``X``, and hence it is not happens-before ``X``.
+
+Synchronization
+    If an synchronizing operation has one or more tags, then whether it
+    participate in the  ``seq_cst`` order with other operations is target
+    dependent.
+
+    .. code-block::
+
+       ; Depending on the semantics of foo:bar & foo:bux, this may not
+       ; synchronize with another sequence.
+       fence release               # foo:bar
+       store atomic %ptr1          # foo:bux
+
+Examples
+--------
+
+.. code-block:: text
+  :caption: Example 1
+
+   A: store ptr addrspace(1) %ptr2                  # sync-as:1 vulkan:nonprivate
+   B: store atomic release ptr addrspace(1) %ptr3   # sync-as:0 vulkan:nonprivate
+
+A and B are not ordered relative to each other
+(no *happens-before*) because their sets of tags are not compatible.
+
+Note that the ``sync-as`` value does not have to match the ``addrspace`` value.
+e.g. In Example 1, a store-release to a location in ``addrspace(1)`` wants to
+only synchronize with operations happening in ``addrspace(0)``.
+
+.. code-block:: text
+  :caption: Example 2
+
+   A: store ptr addrspace(1) %ptr2                 # sync-as:1 vulkan:nonprivate
+   B: store atomic release ptr addrspace(1) %ptr3  # sync-as:1 vulkan:nonprivate
+
+The ordering of A and B is unaffected because their set of tags are
+compatible.
+
+Note that A and B may or may not be in *happens-before* due to other reasons.
+
+.. code-block:: text
+  :caption: Example 3
+
+   A: store ptr addrspace(1) %ptr2                 # sync-as:1 vulkan:nonprivate
+   B: store atomic release ptr addrspace(1) %ptr3  # vulkan:nonprivate
+
+The ordering of A and B is unaffected because their set of tags are
+compatible.
+
+.. code-block:: text
+  :caption: Example 3
+
+   A: store ptr addrspace(1) %ptr2                 # sync-as:1
+   B: store atomic release ptr addrspace(1) %ptr3  # sync-as:2
+
+A and B do not have to be ordered relative to each other
+(no *happens-before*) because their sets of tags are not compatible.
+
+Use-cases
+=========
+
+SPIRV ``NonPrivatePointer``
+---------------------------
+
+MMRAs can support the SPIRV capability
+``VulkanMemoryModel``, where synchronizing operations only affect
+memory operations that specify ``NonPrivatePointer`` semantics.
+
+The example below is generated from a SPIRV program using the
+following recipe:
+
+- Add ``vulkan:nonprivate`` to every synchronizing operation.
+- Add ``vulkan:nonprivate`` to every non-atomic memory operation
+  that is marked ``NonPrivatePointer``.
+- Add ``vulkan:private`` to tags of every non-atomic memory operation
+  that is not marked ``NonPrivatePointer``.
+
+.. code-block::
+
+   Thread T1:
+    A: store %ptr1                 # vulkan:nonprivate
+    B: store %ptr2                 # vulkan:private
+    X: store atomic release %ptr3  # vulkan:nonprivate
+
+   Thread T2:
+    Y: load atomic acquire %ptr3   # vulkan:nonprivate
+    C: load %ptr2                  # vulkan:private
+    D: load %ptr1                  # vulkan:nonprivate
+
+Compatibility ensures that operation ``A`` is ordered
+relative to ``X`` while operation ``D`` is ordered relative to ``Y``.
+If ``X`` synchronizes with ``Y``, then ``A`` happens-before ``D``.
+No such relation can be inferred about operations ``B`` and ``C``.
+
+.. note::
+   The `Vulkan Memory Model <https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#memory-model-non-private>`_
+   considers all atomic operation non-private.
+
+   Whether ``vulkan:nonprivate`` would be specified on atomic operations is
+   an implementation detail, as an atomic operation is always ``nonprivate``.
+   The implementation may choose to be explicit and emit IR with
+   ``vulkan:nonprivate`` on every atomic operation, or it could choose to
+   only emit ``vulkan::private`` and assume ``vulkan:nonprivate``
+   by default.
+
+Operations marked with ``vulkan:private`` effectively opt out of the
+happens-before order in a SPIRV program since they are incompatible
+with every synchronizing operation. Note that SPIRV operations that
+are not marked ``NonPrivatePointer`` are not entirely private to the
+thread --- they are implicitly synchronized at the start or end of a
+thread by the Vulkan *system-synchronizes-with* relationship. This
+example assumes that the target-defined semantics of
+``vulkan:private`` correctly implements this property.
+
+This scheme is general enough to express the interoperability of SPIRV
+programs with other environments.
+
+.. code-block::
+
+   Thread T1:
+   A: store %ptr1                 # vulkan:nonprivate
+   X: store atomic release %ptr2  # vulkan:nonprivate
+
+   Thread T2:
+   Y: load atomic acquire %ptr2   # foo:bar
+   B: load %ptr1
+
+In the above example, thread ``T1`` originates from a SPIRV program
+while thread ``T2`` originates from a non-SPIRV program. Whether ``X``
+can synchronize with ``Y`` is target defined.  If ``X`` synchronizes
+with ``Y``, then ``A`` happens before ``B`` (because A/X and
+Y/B are compatible).
+
+Implementation Example
+~~~~~~~~~~~~~~~~~~~~~~
+
+Consider the implementation of SPIRV ``NonPrivatePointer`` on a target
+where all memory operations are cached, and the entire cache is
+flushed or invalidated at a ``release`` or ``acquire`` respectively. A
+possible scheme is that when translating a SPIRV program, memory
+operations marked ``NonPrivatePointer`` should not be cached, and the
+cache contents should not be touched during an ``acquire`` and
+``release`` operation.
+
+This could be implemented using the tags that share the ``vulkan:`` prefix,
+as follows:
+
+- For memory operations:
+
+  - Operations with ``vulkan:nonprivate`` should bypass the cache.
+  - Operations with ``vulkan:private`` should be cached.
+  - Operations that specify neither or both should conservatively
+    bypass the cache to ensure correctness.
+
+- For synchronizing operations:
+
+  - Operations with ``vulkan:nonprivate`` should not flush or
+    invalidate the cache.
+  - Operations with ``vulkan:private`` should flush or invalidate the...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/78572