[llvm-branch-commits] [clang] [lld] [llvm] [mlir] [RFC][AMDGPU] Add BARRIER address space (PR #195613)

Pierre van Houtryve via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Jun 15 05:26:45 PDT 2026


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/195613

>From c3b854d2e576c03c038d9ace55ff4eb8ec6c96a5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 22 Apr 2026 11:35:22 +0200
Subject: [PATCH 1/6] [RFC][AMDGPU] Add BARRIER address space

Add a new BARRIER address space that is used for global variables that are used to represent the barrier IDs in GFX12.5.

These barrier addresses just have values corresponding 1-1 to barrier IDs. They are still implemented on top of LDS, but the offsetting happens during an addrspacecast to generic, not whenever the barrier GV is used.

The motivation for this is to make the relation between LDS and barrier GVs explicit in the compiler. It does add a bit more complexity, but that complexity was already there, just hidden by pretending barrier GVs were actual LDS.
---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  15 +-
 clang/test/CodeGen/target-data.c              |   4 +-
 clang/test/CodeGenHIP/amdgpu-barrier-type.hip |  34 +-
 clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl |   2 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx12.cl    |  16 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl  |   4 +-
 lld/test/ELF/lto/amdgcn-oses.ll               |   6 +-
 lld/test/ELF/lto/amdgcn.ll                    |   2 +-
 llvm/docs/AMDGPUUsage.rst                     |  34 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  11 +-
 llvm/include/llvm/Support/AMDGPUAddrSpace.h   |  16 +-
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  28 +-
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  44 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  24 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  66 ++-
 .../lib/Target/AMDGPU/AMDGPULowerExecSync.cpp |  29 +-
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       |  10 -
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp  |   4 +-
 .../AMDGPU/AMDGPUMachineFunctionInfo.cpp      |  42 +-
 .../Target/AMDGPU/AMDGPUMachineFunctionInfo.h |   9 +-
 llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp  |  12 +-
 llvm/lib/Target/AMDGPU/SIDefines.h            |   4 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 107 +++--
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |   2 +-
 llvm/lib/TargetParser/TargetDataLayout.cpp    |   5 +-
 .../AMDGPU/always_uniform.ll                  |   6 +-
 .../CodeGen/AMDGPU/addrspacecast-barrier.ll   | 442 ++++++++++++++++++
 .../amdgpu-lower-exec-sync-and-module-lds.ll  |  64 +--
 .../amdgpu-lower-exec-sync-and-sw-lds.ll      |  39 +-
 .../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll  |  64 +--
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   8 +-
 ...utor-flatscratchinit-undefined-behavior.ll |   4 +-
 .../AMDGPU/attributor-flatscratchinit.ll      |  20 +-
 .../AMDGPU/attributor-noalias-addrspace.ll    |   4 +-
 .../AMDGPU/barrier-addrspace-dereference.ll   |  16 +
 .../lds-link-time-codegen-named-barrier.ll    |  13 +-
 .../AMDGPU/lds-link-time-named-barrier.ll     |  14 +-
 .../CodeGen/AMDGPU/null-named-barrier-gv.ll   |  31 ++
 .../s-barrier-lowering-bad-absolute-symbol.ll |  16 +
 .../s-barrier-lowering-wrong-gv-signature.ll  |  27 ++
 .../test/CodeGen/AMDGPU/s-barrier-lowering.ll |  66 +--
 llvm/test/CodeGen/AMDGPU/s-barrier.ll         |  85 ++--
 llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll  |  14 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |   2 +-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  21 +-
 mlir/test/Dialect/LLVMIR/rocdl.mlir           |  30 +-
 mlir/test/Target/LLVMIR/rocdl.mlir            |  30 +-
 48 files changed, 1107 insertions(+), 440 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 5a65d23ec4938..4f5f846a603c9 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -478,8 +478,21 @@ AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
 
 LangAS AMDGPUTargetCodeGenInfo::adjustGlobalVarAddressSpace(
     CodeGenModule &CGM, const VarDecl *D, std::optional<LangAS> AS) const {
-  if (AS)
+  if (AS) {
+    // NamedWorkgroupBarrier GVs are declared as __shared__, but the back-end
+    // models them as a separate address space.
+    const LangOptions &LangOpts = CGM.getLangOpts();
+    if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+        AS == LangAS::cuda_shared) {
+      const Type *Ty = D->getType().getCanonicalType().getTypePtr();
+      if (Ty->isArrayType())
+        Ty = Ty->getBaseElementTypeUnsafe();
+      const BuiltinType *BTy = dyn_cast<BuiltinType>(Ty);
+      if (BTy && BTy->getKind() == BuiltinType::AMDGPUNamedWorkgroupBarrier)
+        return getLangASFromTargetAS(llvm::AMDGPUAS::BARRIER);
+    }
     return *AS;
+  }
 
   LangAS DefaultGlobalAS = getLangASFromTargetAS(
       CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index a5e0b814c7042..1a0e8a0a7fc3f 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
 
 // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 
 // Test default -target-cpu
 // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
index 947ceb56d279e..684ceb85f9185 100644
--- a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
+++ b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
@@ -1,6 +1,6 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
- // REQUIRES: amdgpu-registered-target
- // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
 
 #define __shared__ __attribute__((shared))
 
@@ -11,8 +11,15 @@ __shared__ struct {
   __amdgpu_named_workgroup_barrier_t y;
 } str;
 
-__amdgpu_named_workgroup_barrier_t *getBar();
-void useBar(__amdgpu_named_workgroup_barrier_t *);
+//.
+// CHECK: @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef, align 4
+// CHECK: @arr = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef, align 4
+// CHECK: @str = internal addrspace(3) global %struct.anon undef, align 4
+// CHECK: @__hip_cuid_ = addrspace(1) global i8 0
+// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
+//.
+__attribute__((device)) __amdgpu_named_workgroup_barrier_t *getBar();
+__attribute__((device)) void useBar(__amdgpu_named_workgroup_barrier_t *);
 
 // CHECK-LABEL: define {{[^@]+}}@_Z7testSemPu34__amdgpu_named_workgroup_barrier_t
 // CHECK-SAME: (ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -22,15 +29,15 @@ void useBar(__amdgpu_named_workgroup_barrier_t *);
 // CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
 // CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[TMP0]]) #[[ATTR2:[0-9]+]]
-// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(1) @bar to ptr)) #[[ATTR2]]
-// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(1) @arr to ptr), i64 16)) #[[ATTR2]]
-// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(1) @str to ptr), i64 16)) #[[ATTR2]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(15) @bar to ptr)) #[[ATTR2]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(15) @arr to ptr), i64 16)) #[[ATTR2]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @str to ptr), i64 16)) #[[ATTR2]]
 // CHECK-NEXT:    [[CALL:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
 // CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[CALL]]) #[[ATTR2]]
 // CHECK-NEXT:    [[CALL1:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
 // CHECK-NEXT:    ret ptr [[CALL1]]
 //
-__amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
+__attribute__((device)) __amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
   useBar(p);
   useBar(&bar);
   useBar(&arr[1]);
@@ -38,3 +45,12 @@ __amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *
   useBar(getBar());
   return getBar();
 }
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "uniform-work-group-size" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "uniform-work-group-size" }
+// CHECK: attributes #[[ATTR2]] = { convergent nounwind "uniform-work-group-size" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..fcee9b3b20813 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
 
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 void foo(void) {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 332a2fa94ee92..8ea4d4e2b32e2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -83,9 +83,9 @@ void test_s_barrier_signal()
 // CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 void test_s_barrier_signal_var(void *bar, int a)
@@ -132,9 +132,9 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
 // CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
-// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
 // CHECK-NEXT:    ret void
 //
 void test_s_barrier_init(void *bar, int a)
@@ -147,8 +147,8 @@ void test_s_barrier_init(void *bar, int a)
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) [[TMP1]])
 // CHECK-NEXT:    ret void
 //
 void test_s_barrier_join(void *bar)
@@ -189,8 +189,8 @@ unsigned test_s_get_barrier_state(int a)
 // CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
 // CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) [[TMP1]])
 // CHECK-NEXT:    store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
 // CHECK-NEXT:    ret i32 [[TMP3]]
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 8b09216057167..9368c2971a643 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -1362,8 +1362,8 @@ void test_s_cluster_barrier()
 // CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // CHECK-NEXT:    store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT:    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT:    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) [[TMP1]])
 // CHECK-NEXT:    ret void
 //
 void test_s_wakeup_barrier(void *bar)
diff --git a/lld/test/ELF/lto/amdgcn-oses.ll b/lld/test/ELF/lto/amdgcn-oses.ll
index b3caf0f0de3b9..7c2101266a85c 100644
--- a/lld/test/ELF/lto/amdgcn-oses.ll
+++ b/lld/test/ELF/lto/amdgcn-oses.ll
@@ -25,7 +25,7 @@
 
 ;--- amdhsa.ll
 target triple = "amdgcn-amd-amdhsa"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
@@ -36,7 +36,7 @@ define void @_start() {
 
 ;--- amdpal.ll
 target triple = "amdgcn-amd-amdpal"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 define amdgpu_cs void @_start() {
   ret void
@@ -44,7 +44,7 @@ define amdgpu_cs void @_start() {
 
 ;--- mesa3d.ll
 target triple = "amdgcn-amd-mesa3d"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 define void @_start() {
   ret void
diff --git a/lld/test/ELF/lto/amdgcn.ll b/lld/test/ELF/lto/amdgcn.ll
index 186185c44a2c2..1dc2d86c48364 100644
--- a/lld/test/ELF/lto/amdgcn.ll
+++ b/lld/test/ELF/lto/amdgcn.ll
@@ -5,7 +5,7 @@
 ; Make sure the amdgcn triple is handled
 
 target triple = "amdgcn-amd-amdhsa"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 define void @_start() {
   ret void
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index a82cfe9240403..fe39f20cea5f6 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -979,6 +979,7 @@ supported for the ``amdgcn`` target.
      *reserved for downstream use (LLPC)*  12
      *reserved for future use*             13
      *reserved for future use*             14
+     Barrier                               15              N/A         N/A              32      0
      *reserved for future use*             16
      Streamout Registers                   128             N/A         GS_REGS
      ===================================== =============== =========== ================ ======= ============================
@@ -1192,6 +1193,23 @@ supported for the ``amdgcn`` target.
   a buffer strided pointer, this means that the base pointer is ``align(4)``, that
   the offset is a multiple of 4 bytes, and that the stride is a multiple of 4.
 
+**Barrier**
+  This address space represents barrier IDs (introduced in GFX12) as addresses.
+  It does not map directly to any addressable memory, thus pointers into this address space:
+
+  * Never alias with any other pointers outside this address space.
+  * Cannot be dereferenced.
+  * Can only be consumed by intrinsics.
+  * Are always uniform.
+
+  Pointer are 32 bits and directly correspond to valid barrier IDs. All barrier pointers must,
+  when interpreted as signed 32 bit integers, have a value corresponding to a valid barrier ID
+  on the target. Otherwise, the behavior is undefined.
+
+  Due to these pointers being a compiler abstraction without a corresponding hardware aperture,
+  the back-end handles them as-if they were local pointers with a very large offset as to not
+  overlap with any addressable local memory.
+
 **Streamout Registers**
   Dedicated registers used by the GS NGG Streamout Instructions. The register
   file is modelled as a memory in a distinct address space because it is indexed
@@ -1353,10 +1371,8 @@ Named barriers are fixed function hardware barrier objects that are available
 in gfx12.5+ in addition to the traditional default barriers.
 
 In LLVM IR, named barriers are represented by global variables of type
-``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier
-global variables do not occupy actual LDS memory, but their lifetime and
-allocation scope matches that of global variables in LDS. Programs in LLVM IR
-refer to named barriers using pointers.
+``target("amdgcn.named.barrier", 0)`` in the Execution Synchronization Resources
+address space. Programs in LLVM IR refer to named barriers using pointers.
 
 The following named barrier types are supported in global variables, defined
 recursively:
@@ -1367,14 +1383,14 @@ recursively:
 
 .. code-block:: llvm
 
-      @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef
-      @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef
-      @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef
+      @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef
+      @foo = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef
+      @baz = addrspace(15) global { target("amdgcn.named.barrier", 0) } undef
 
       ...
 
-      %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i
-      call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0)
+      %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(15) @foo, i32 0, i32 %i
+      call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %foo.i, i32 0)
 
 Named barrier types may not be used in ``alloca``.
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f1659f0cd803a..0c9e761039ca5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -13,6 +13,7 @@
 def flat_ptr_ty : LLVMQualPointerType<0>;
 def global_ptr_ty : LLVMQualPointerType<1>;
 def local_ptr_ty : LLVMQualPointerType<3>;
+def barrier_ptr_ty : LLVMQualPointerType<15>;
 
 // The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
 // by the backend cause whole-program undefined behavior when violated, such as
@@ -295,7 +296,7 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa
 // If %memberCnt is 0, the member count is retained from the previous
 // s_barrier_init or s_barrier_signal operation.
 def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
-  Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[], [barrier_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                 IntrNoCallback, IntrNoFree]>;
 
 // bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType)
@@ -307,19 +308,19 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri
 // void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
 // The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
-  Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+  Intrinsic<[], [barrier_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
 // The %barrier argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
-  Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                 IntrNoCallback, IntrNoFree]>;
 
 // void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
 // The %barrier argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
-  Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                 IntrNoCallback, IntrNoFree]>;
 
 // void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
@@ -342,7 +343,7 @@ def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrie
 // uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier)
 // The %barrier argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">,
-  Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+  Intrinsic<[llvm_i32_ty], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                 IntrNoCallback, IntrNoFree]>;
 
 def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 01b1510524d0f..89a6f7f2dc9b2 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -26,8 +26,7 @@ namespace llvm {
 /// memory locations.
 namespace AMDGPUAS {
 enum : unsigned {
-  // The maximum value for flat, generic, local, private, constant and region.
-  MAX_AMDGPU_ADDRESS = 9,
+  MAX_AMDGPU_ADDRESS = 15,
 
   FLAT_ADDRESS = 0,   ///< Address space for flat memory.
   GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
@@ -47,6 +46,14 @@ enum : unsigned {
   BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
                               ///< pointers with an additional index.
 
+  RESERVED_0 = 10,
+  RESERVED_1 = 11,
+  RESERVED_2 = 12,
+  RESERVED_3 = 13,
+  RESERVED_4 = 14,
+
+  BARRIER = 15, ///< Address space for modeling barrier IDs as addresses.
+
   RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
 
   /// Internal address spaces. Can be freely renumbered.
@@ -84,6 +91,11 @@ enum : unsigned {
   // Some places use this if the address space can't be determined.
   UNKNOWN_ADDRESS_SPACE = ~0u,
 };
+
+/// The BARRIER AS is does not have an aperture in HW, so when converting
+/// BARRIER addresses from/to generic, we represent them as LDS addresses
+/// offset by a large amount so they can never alias with real LDS memory.
+static constexpr unsigned BarrierAddrLDSOffset = 0x802000u;
 } // end namespace AMDGPUAS
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c6dd1dbb62449..e6eb95247d09b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -622,17 +622,23 @@ static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
 
   // clang-format off
   static const bool ASAliasRules[][AMDGPUAS::MAX_AMDGPU_ADDRESS + 1] = {
-    /*                       Flat   Global Region  Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
-    /* Flat     */            {true,  true,  false, true,  true,  true,  true,  true,  true,  true},
-    /* Global   */            {true,  true,  false, false, true,  false, true,  true,  true,  true},
-    /* Region   */            {false, false, true,  false, false, false, false, false, false, false},
-    /* Local    */            {true,  false, false, true,  false, false, false, false, false, false},
-    /* Constant */            {true,  true,  false, false, false, false, true,  true,  true,  true},
-    /* Private  */            {true,  false, false, false, false, true,  false, false, false, false},
-    /* Constant 32-bit */     {true,  true,  false, false, true,  false, false, true,  true,  true},
-    /* Buffer Fat Ptr  */     {true,  true,  false, false, true,  false, true,  true,  true,  true},
-    /* Buffer Resource */     {true,  true,  false, false, true,  false, true,  true,  true,  true},
-    /* Buffer Strided Ptr  */ {true,  true,  false, false, true,  false, true,  true,  true,  true},
+    /*                       Flat   Global Region  Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr Reserved Reserved Reserved Reserved Reserved Barrier */
+    /* Flat     */            {true,  true,  false, true,  true,  true,  true,  true,  true,  true, false, false, false, false, false, false},
+    /* Global   */            {true,  true,  false, false, true,  false, true,  true,  true,  true, false, false, false, false, false, false},
+    /* Region   */            {false, false, true,  false, false, false, false, false, false, false, false, false, false, false, false, false},
+    /* Local    */            {true,  false, false, true,  false, false, false, false, false, false, false, false, false, false, false, false},
+    /* Constant */            {true,  true,  false, false, false, false, true,  true,  true,  true, false, false, false, false, false, false},
+    /* Private  */            {true,  false, false, false, false, true,  false, false, false, false, false, false, false, false, false, false},
+    /* Constant 32-bit */     {true,  true,  false, false, true,  false, false, true,  true,  true, false, false, false, false, false, false},
+    /* Buffer Fat Ptr  */     {true,  true,  false, false, true,  false, true,  true,  true,  true, false, false, false, false, false, false},
+    /* Buffer Resource */     {true,  true,  false, false, true,  false, true,  true,  true,  true, false, false, false, false, false, false},
+    /* Buffer Strided Ptr  */ {true,  true,  false, false, true,  false, true,  true,  true,  true, false, false, false, false, false, false},
+    /* Reserved  */          {false,  false,  false, false, false,  false, false,  false,  false,  false, false, false, false, false, false, false},
+    /* Reserved  */          {false,  false,  false, false, false,  false, false,  false,  false,  false, false, false, false, false, false, false},
+    /* Reserved  */          {false,  false,  false, false, false,  false, false,  false,  false,  false, false, false, false, false, false, false},
+    /* Reserved  */          {false,  false,  false, false, false,  false, false,  false,  false,  false, false, false, false, false, false, false},
+    /* Reserved  */          {false,  false,  false, false, false,  false, false,  false,  false,  false, false, false, false, false, false, false},
+    /* Barrier  */           {false,  false,  false, false, false,  false, false,  false,  false,  false, false, false, false, false, false, true},
   };
   // clang-format on
   static_assert(std::size(ASAliasRules) == AMDGPUAS::MAX_AMDGPU_ADDRESS + 1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index c517155a71ec5..0086c02b6151e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -18,6 +18,7 @@ def p3 : PtrValueType<i32, 3>;
 def p4 : PtrValueType<i64, 4>;
 def p5 : PtrValueType<i32, 5>;
 def p6 : PtrValueType<i32, 6>;
+def p15 : PtrValueType<i32, 15>;
 
 //===-----------------------------------------------------------------------===//
 // AMDGPU Subtarget Feature (device properties)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 278c2dc94c56b..0704dbd8f7b85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Target/TargetMachine.h"
@@ -1543,25 +1544,42 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
+  const auto TrapAndPoison = [&] {
+    SDLoc DL(Op);
+    SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+    SDValue OutputChain =
+        DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Trap, DAG.getRoot());
+    DAG.setRoot(OutputChain);
+    return DAG.getPOISON(Op.getValueType());
+  };
+
+  if (G->getAddressSpace() == AMDGPUAS::BARRIER) {
+    const GlobalVariable *GVar = cast<GlobalVariable>(GV);
+
+    if (!AMDGPU::isNamedBarrier(*GVar)) {
+      const Function &Fn = DAG.getMachineFunction().getFunction();
+      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+          Fn, "Unsupported use of BARRIER address space!",
+          SDLoc(Op).getDebugLoc(), DS_Error));
+      return TrapAndPoison();
+    }
+
+    unsigned Offset = MFI->allocateBarrierGlobal(DL, *cast<GlobalVariable>(GV));
+    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+  }
+
   if (!MFI->isModuleEntryFunction()) {
-    auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
     if (std::optional<uint32_t> Address =
-            AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
-      if (IsNamedBarrier) {
-        unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
-        MFI->recordNumNamedBarriers(Address.value(), BarCnt);
-      }
+            AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+                *GV, AMDGPUAS::LOCAL_ADDRESS)) {
       return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
-    } else if (IsNamedBarrier) {
-      llvm_unreachable("named barrier should have an assigned address");
     }
   }
 
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isModuleEntryFunction() &&
-        GV->getName() != "llvm.amdgcn.module.lds" &&
-        !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+        GV->getName() != "llvm.amdgcn.module.lds") {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
@@ -1573,11 +1591,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
       // functions that use local objects. However, if these dead functions are
       // not eliminated, we don't want a compile time error. Just emit a warning
       // and a trap, since there should be no callable path here.
-      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
-      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                        Trap, DAG.getRoot());
-      DAG.setRoot(OutputChain);
-      return DAG.getPOISON(Op.getValueType());
+      return TrapAndPoison();
     }
 
     // XXX: What does the value of G->getOffset() mean?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 77ef0eca23743..3d4b13da054d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -7355,7 +7355,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
       std::optional<int64_t> BarValImm =
           getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
       if (BarValImm) {
-        auto BarID = ((*BarValImm) >> 4) & 0x3F;
+        uint32_t BarID = *BarValImm & 0x3F;
         BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
             .addImm(BarID);
         I.eraseFromParent();
@@ -7364,16 +7364,10 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
     }
   }
 
-  // BarID = (BarOp >> 4) & 0x3F
-  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
-      .add(BarOp)
-      .addImm(4u)
-      .setOperandDead(3); // Dead scc
-
+  // BarID = BarOp & 0x3F
   Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
-      .addReg(TmpReg0)
+      .add(BarOp)
       .addImm(0x3F)
       .setOperandDead(3); // Dead scc
 
@@ -7422,16 +7416,10 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
       getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
 
   if (!BarValImm) {
-    // BarID = (BarOp >> 4) & 0x3F
-    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
-        .addReg(BarOp.getReg())
-        .addImm(4u)
-        .setOperandDead(3); // Dead scc;
-
+    // BarID = BarOp & 0x3F
     Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
-        .addReg(TmpReg0)
+        .addReg(BarOp.getReg())
         .addImm(0x3F)
         .setOperandDead(3); // Dead scc;
 
@@ -7454,7 +7442,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
   }
 
   if (BarValImm) {
-    auto BarId = ((*BarValImm) >> 4) & 0x3F;
+    uint32_t BarId = *BarValImm & 0x3F;
     MIB.addImm(BarId);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3c8f81ea5b11e..9c40cbcc7439a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 
 #define DEBUG_TYPE "amdgpu-legalinfo"
 
@@ -2416,15 +2417,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
   const LLT S32 = LLT::scalar(32);
   const LLT S64 = LLT::scalar(64);
 
-  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
+  bool IsLDS = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER);
+  assert(IsLDS || AS == AMDGPUAS::PRIVATE_ADDRESS);
 
   if (ST.hasApertureRegs()) {
     // Note: this register is somewhat broken. When used as a 32-bit operand,
     // it only returns zeroes. The real value is in the upper 32 bits.
     // Thus, we must emit extract the high 32 bits.
-    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
-                                       ? AMDGPU::SRC_SHARED_BASE
-                                       : AMDGPU::SRC_PRIVATE_BASE;
+    const unsigned ApertureRegNo =
+        IsLDS ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE;
     assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
             !ST.hasGloballyAddressableScratch()) &&
            "Cannot use src_private_base with globally addressable scratch!");
@@ -2479,7 +2480,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
 
   // Offset into amd_queue_t for group_segment_aperture_base_hi /
   // private_segment_aperture_base_hi.
-  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+  uint32_t StructOffset = IsLDS ? 0x40 : 0x44;
 
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       PtrInfo,
@@ -2547,7 +2548,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   }
 
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
-      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+      (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::BARRIER ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
       if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -2565,7 +2566,17 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
         return B.buildIntToPtr(Dst, Sub).getReg(0);
       }
 
-      // Extract low 32-bits of the pointer.
+      if (DestAS == AMDGPUAS::BARRIER) {
+        // flat -> barrier: extract the low 32 bits, then sub the barrier AS
+        // offset.
+        Register LoBits = B.buildExtract(S32, Src, 0).getReg(0);
+        Register Sub =
+            B.buildSub(S32, LoBits,
+                       B.buildConstant(S32, AMDGPUAS::BarrierAddrLDSOffset))
+                .getReg(0);
+        return B.buildIntToPtr(Dst, Sub).getReg(0);
+      }
+
       return B.buildExtract(Dst, Src, 0).getReg(0);
     };
 
@@ -2594,7 +2605,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
   }
 
   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
-      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+      (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::BARRIER ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
       // Coerce the type of the low half of the result so we can use
@@ -2636,6 +2647,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
       if (!ApertureReg.isValid())
         return false;
 
+      if (SrcAS == AMDGPUAS::BARRIER) {
+        // barrier -> flat: add the barrier AS offset
+        SrcAsInt =
+            B.buildAdd(S32, SrcAsInt,
+                       B.buildConstant(S32, AMDGPUAS::BarrierAddrLDSOffset))
+                .getReg(0);
+      }
+
       // TODO: Should we allow mismatched types but matching sizes in merges to
       // avoid the ptrtoint?
       return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -3331,10 +3350,32 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
   MachineFunction &MF = B.getMF();
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
+  const auto TrapAndPoison = [&] {
+    B.buildTrap();
+    B.buildUndef(DstReg);
+    MI.eraseFromParent();
+    return true;
+  };
+
+  if (AS == AMDGPUAS::BARRIER) {
+    const GlobalVariable *GVar = cast<GlobalVariable>(GV);
+    if (!AMDGPU::isNamedBarrier(*GVar)) {
+      const Function &Fn = MF.getFunction();
+      Fn.getContext().diagnose(DiagnosticInfoUnsupported(
+          Fn, "Unsupported use of BARRIER address space!", MI.getDebugLoc(),
+          DS_Error));
+      return TrapAndPoison();
+    }
+
+    B.buildConstant(DstReg,
+                    MFI->allocateBarrierGlobal(B.getDataLayout(), *GVar));
+    MI.eraseFromParent();
+    return true;
+  }
+
   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isModuleEntryFunction() &&
-        GV->getName() != "llvm.amdgcn.module.lds" &&
-        !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+        GV->getName() != "llvm.amdgcn.module.lds") {
       const Function &Fn = MF.getFunction();
       Fn.getContext().diagnose(DiagnosticInfoUnsupported(
           Fn, "local memory global used by non-kernel function",
@@ -3345,10 +3386,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
       // functions that use local objects. However, if these dead functions are
       // not eliminated, we don't want a compile time error. Just emit a warning
       // and a trap, since there should be no callable path here.
-      B.buildTrap();
-      B.buildUndef(DstReg);
-      MI.eraseFromParent();
-      return true;
+      return TrapAndPoison();
     }
 
     // TODO: We could emit code to handle the initialization somewhere.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
index 707c3ec975d73..95ba2586efb09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -6,12 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Lower LDS global variables with target extension type "amdgpu.named.barrier"
+// Lower global variables with target extension type "amdgpu.named.barrier"
 // that require specialized address assignment. It assigns a unique
-// barrier identifier to each named-barrier LDS variable and encodes
+// barrier identifier to each named-barrier variable and encodes
 // this identifier within the !absolute_symbol metadata of that global.
-// This encoding ensures that subsequent LDS lowering passes can process these
-// barriers correctly without conflicts.
 //
 //===----------------------------------------------------------------------===//
 
@@ -37,10 +35,14 @@ using namespace AMDGPU;
 
 namespace {
 
+static bool isNamedBarrierToLower(const GlobalVariable &GV) {
+  return isNamedBarrier(GV) && !GV.isAbsoluteSymbolRef();
+}
+
 // Write the specified address into metadata where it can be retrieved by
 // the assembler. Format is a half open range, [Address Address+1)
-static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
-                                     uint32_t Address) {
+static void recordAbsoluteAddress(Module *M, GlobalVariable *GV,
+                                  uint32_t Address) {
   LLVMContext &Ctx = M->getContext();
   auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
   auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
@@ -137,14 +139,12 @@ static bool lowerExecSyncGlobalVariables(Module &M, GVUsesInfoTy &GVUsesInfo) {
       LLVM_DEBUG(GV->printAsOperand(dbgs(), false);
                  dbgs() << " was assigned barrier id: " << BarID
                         << " id-count: " << BarCnt << "\n");
-      // 4 bits for alignment, 5 bits for the barrier num,
-      // 3 bits for the barrier scope
-      Offset = 0x802000u | BarrierScope << 9 | BarID << 4;
+      Offset = BarID;
     } else {
       llvm_unreachable("Unhandled special variable type.");
     }
 
-    recordLDSAbsoluteAddress(&M, GV, Offset);
+    recordAbsoluteAddress(&M, GV, Offset);
   }
 
   // Also erase those special LDS variables from indirect_access.
@@ -214,15 +214,16 @@ static bool runLowerExecSyncGlobals(Module &M) {
   CallGraph CG = CallGraph(M);
   bool Changed = false;
   Changed |=
-      eliminateGVConstantExprUsesFromAllInstructions(M, isLDSVariableToLower);
+      eliminateGVConstantExprUsesFromAllInstructions(M, isNamedBarrierToLower);
 
   // For each kernel, what variables does it access directly or through
   // callees
-  GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
+  GVUsesInfoTy BarrierUsesInfo =
+      getTransitiveUsesOfGV(CG, M, isNamedBarrierToLower);
 
-  if (hasBarrierToLower(LDSUsesInfo)) {
+  if (hasBarrierToLower(BarrierUsesInfo)) {
     // Special LDS variables need special address assignment
-    Changed |= lowerExecSyncGlobalVariables(M, LDSUsesInfo);
+    Changed |= lowerExecSyncGlobalVariables(M, BarrierUsesInfo);
   }
 
   return Changed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index ef86d279d193b..12e2478e055b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -937,16 +937,6 @@ class AMDGPULowerModuleLDS {
     for (auto &[F, Vars] : FunctionLDSUses)
       AllLDSUses[F].insert(Vars.begin(), Vars.end());
 
-    // Named barriers are handled by AMDGPULowerExecSync; filter them out.
-    for (auto &[F, Vars] : AllLDSUses) {
-      SmallVector<GlobalVariable *> Barriers;
-      for (GlobalVariable *V : Vars)
-        if (AMDGPU::isNamedBarrier(*V))
-          Barriers.push_back(V);
-      for (GlobalVariable *V : Barriers)
-        Vars.erase(V);
-    }
-
     // Build reverse map: LDS variable -> functions that use it.
     DenseMap<GlobalVariable *, SmallVector<Function *, 4>> VarToFuncs;
     for (auto &[F, Vars] : AllLDSUses) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ffd33922ec511..37617b51e465f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -31,6 +31,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
@@ -285,7 +286,8 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
   // Intercept LDS variables with known addresses
   if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) {
     if (std::optional<uint32_t> Address =
-            AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
+            AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+                *GV, AMDGPUAS::LOCAL_ADDRESS)) {
       auto *IntTy = Type::getInt32Ty(CV->getContext());
       return AsmPrinter::lowerConstant(ConstantInt::get(IntTy, *Address),
                                        BaseCV, Offset);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
index 3e8a75a7eb840..540e0981a95a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -97,17 +98,8 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
 
   unsigned Offset;
   if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-    if (AMDGPU::isNamedBarrier(GV)) {
-      std::optional<unsigned> BarAddr = getLDSAbsoluteAddress(GV);
-      if (!BarAddr)
-        llvm_unreachable("named barrier should have an assigned address");
-      Entry.first->second = BarAddr.value();
-      unsigned BarCnt = GV.getGlobalSize(DL) / 16;
-      recordNumNamedBarriers(BarAddr.value(), BarCnt);
-      return BarAddr.value();
-    }
-
-    std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
+    std::optional<uint32_t> MaybeAbs =
+        get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
     if (MaybeAbs) {
       // Absolute address LDS variables that exist prior to the LDS lowering
       // pass raise a fatal error in that pass. These failure modes are only
@@ -165,6 +157,26 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
   return Offset;
 }
 
+unsigned
+AMDGPUMachineFunctionInfo::allocateBarrierGlobal(const DataLayout &DL,
+                                                 const GlobalVariable &GV) {
+  assert(AMDGPU::isNamedBarrier(GV));
+  std::optional<unsigned> BarAddr =
+      get32BitAbsoluteAddress(GV, AMDGPUAS::BARRIER);
+  if (!BarAddr)
+    llvm_unreachable("named barrier should have an assigned address");
+  if (*BarAddr == 0) {
+    // We cannot allow this because some places in CodeGen (rightfully) assume a
+    // GV address is never null. For example, there are no null checks on
+    // addrspacecast if the pointer is a GV pointer.
+    report_fatal_error(
+        "named barrier GV cannot be used to represent the NULL named barrier");
+  }
+  unsigned BarCnt = GV.getGlobalSize(DL) / 16;
+  recordNumNamedBarriers(BarAddr.value(), BarCnt);
+  return BarAddr.value();
+}
+
 std::optional<uint32_t>
 AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
   // TODO: Would be more consistent with the abs symbols to use a range
@@ -182,8 +194,9 @@ AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
 }
 
 std::optional<uint32_t>
-AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(const GlobalValue &GV) {
-  if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(const GlobalValue &GV,
+                                                   unsigned AS) {
+  if (GV.getAddressSpace() != AS)
     return {};
 
   std::optional<ConstantRange> AbsSymRange = GV.getAbsoluteSymbolRange();
@@ -221,7 +234,8 @@ void AMDGPUMachineFunctionInfo::setDynLDSAlign(const Function &F,
   const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
   if (Dyn) {
     unsigned Offset = LDSSize; // return this?
-    std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+    std::optional<uint32_t> Expect =
+        get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
     if (!Expect || (Offset != *Expect)) {
       report_fatal_error("Inconsistent metadata on dynamic LDS variable");
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
index 36db6c2dd0d12..c65592bd965ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
@@ -82,7 +82,7 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
 
   void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
     NumNamedBarriers =
-        std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+        std::max(NumNamedBarriers, (GVAddr & 0x1ff) + BarCnt - 1);
   }
   uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
 
@@ -109,8 +109,13 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
                              Align Trailing);
 
+  unsigned allocateBarrierGlobal(const DataLayout &DL,
+                                 const GlobalVariable &GV);
+
   static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
-  static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
+
+  static std::optional<uint32_t> get32BitAbsoluteAddress(const GlobalValue &GV,
+                                                         unsigned AS);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index eacce0d5242e1..d3b815008f4cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -19,6 +19,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/ReplaceConstant.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 
 #define DEBUG_TYPE "amdgpu-memory-utils"
 
@@ -76,6 +77,8 @@ static TargetExtType *getTargetExtType(const GlobalVariable &GV) {
 }
 
 TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
+  if (GV.getAddressSpace() != AMDGPUAS::BARRIER)
+    return nullptr;
   if (TargetExtType *Ty = getTargetExtType(GV))
     return Ty->getName() == "amdgcn.named.barrier" ? Ty : nullptr;
   return nullptr;
@@ -292,15 +295,6 @@ GVUsesInfoTy getTransitiveUsesOfLDSForLowering(const CallGraph &CG, Module &M) {
         if (IsDirectMapDynLDSGV)
           continue;
 
-        // TODO: Remove once barriers are no longer in the LDS AS.
-        if (isNamedBarrier(*GV)) {
-          if (IsAbsolute) {
-            UsesInfo.DirectAccess[Fn].erase(GV);
-            UsesInfo.IndirectAccess[Fn].erase(GV);
-          }
-          continue;
-        }
-
         if (HasAbsoluteGVs.has_value()) {
           if (*HasAbsoluteGVs != IsAbsolute) {
             reportFatalUsageError(
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 6a5d82af41fb3..509c21b1e7788 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1136,10 +1136,6 @@ enum Type {
   NAMED_BARRIER_LAST = 16,
 };
 
-enum {
-  BARRIER_SCOPE_WORKGROUP = 0,
-};
-
 } // namespace Barrier
 } // namespace AMDGPU
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2817f1ac8c3ad..27aa19c330a2a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -8474,9 +8475,11 @@ bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
   // linker can assign their offsets.
   if (AMDGPUTargetMachine::EnableObjectLinking) {
     if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
-      if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-        assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
-                                        "when object linking is enabled");
+      if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+          GVar->getAddressSpace() == AMDGPUAS::BARRIER) {
+        assert(GVar->isDeclaration() &&
+               "AS 3 & 13 GVs should be declaration here "
+               "when object linking is enabled");
         return false;
       }
     }
@@ -9193,10 +9196,11 @@ SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
+  const bool IsLDS = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER);
+
   if (Subtarget->hasApertureRegs()) {
-    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
-                                       ? AMDGPU::SRC_SHARED_BASE
-                                       : AMDGPU::SRC_PRIVATE_BASE;
+    const unsigned ApertureRegNo =
+        IsLDS ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE;
     assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
             !Subtarget->hasGloballyAddressableScratch()) &&
            "Cannot use src_private_base with globally addressable scratch!");
@@ -9218,8 +9222,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
   // implicit kernargs.
   const Module *M = DAG.getMachineFunction().getFunction().getParent();
   if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
-    ImplicitParameter Param =
-        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+    ImplicitParameter Param = IsLDS ? SHARED_BASE : PRIVATE_BASE;
     return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
   }
 
@@ -9237,7 +9240,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
 
   // Offset into amd_queue_t for group_segment_aperture_base_hi /
   // private_segment_aperture_base_hi.
-  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+  uint32_t StructOffset = IsLDS ? 0x40 : 0x44;
 
   SDValue Ptr =
       DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
@@ -9293,10 +9296,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
 
   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
 
-  // flat -> local/private
+  // flat -> local/private/barrier
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
-        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+        DestAS == AMDGPUAS::PRIVATE_ADDRESS || DestAS == AMDGPUAS::BARRIER) {
       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
       if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -9309,6 +9312,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
             0);
         Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+      } else if (DestAS == AMDGPUAS::BARRIER) {
+        // flat -> barrier: sub the barrier AS offset.
+        Ptr = DAG.getNode(
+            ISD::SUB, SL, MVT::i32, Ptr,
+            DAG.getConstant(AMDGPUAS::BarrierAddrLDSOffset, SL, MVT::i32));
       }
 
       if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
@@ -9323,10 +9331,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
     }
   }
 
-  // local/private -> flat
+  // local/private/barrier -> flat
   if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
-        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+        SrcAS == AMDGPUAS::PRIVATE_ADDRESS || SrcAS == AMDGPUAS::BARRIER) {
       SDValue CvtPtr;
       if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
           Subtarget->hasGloballyAddressableScratch()) {
@@ -9358,7 +9366,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
         CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
       } else {
         SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
-        CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+        if (SrcAS == AMDGPUAS::BARRIER) {
+          // barrier -> flat: add the barrier AS offset.
+          SDValue SrcOffset = DAG.getNode(
+              ISD::ADD, SL, MVT::i32, Src,
+              DAG.getConstant(AMDGPUAS::BarrierAddrLDSOffset, SL, MVT::i32));
+          CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, SrcOffset,
+                               Aperture);
+        } else {
+          CvtPtr =
+              DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+        }
+
         CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
       }
 
@@ -9905,12 +9925,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
   EVT PtrVT = Op.getValueType();
 
   const GlobalValue *GV = GSD->getGlobal();
-  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+  const unsigned AS = GSD->getAddressSpace();
+  if (((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER) &&
        shouldUseLDSConstAddress(GV)) ||
-      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
-      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
-    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
-        GV->hasExternalLinkage()) {
+      AS == AMDGPUAS::REGION_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
       const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
       // HIP uses an unsized array `extern __shared__ T s[]` or similar
       // zero-sized type in other languages to declare the dynamic shared
@@ -9930,7 +9949,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
   }
 
-  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+  if (AS == AMDGPUAS::BARRIER) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+                                            SIInstrInfo::MO_ABS32_LO);
+    return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0);
+  }
+
+  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
                                             SIInstrInfo::MO_ABS32_LO);
     return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
@@ -11944,7 +11969,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     if (isa<ConstantSDNode>(Op->getOperand(2))) {
       uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
-        BarID = (BarID >> 4) & 0x3F;
+        BarID = BarID & 0x3F;
       Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
       Ops.push_back(K);
@@ -11952,13 +11977,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     } else {
       Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
       if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
-        SDValue M0Val;
-        M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
-                            DAG.getShiftAmountConstant(4, MVT::i32, DL));
-        M0Val = SDValue(
-            DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
-                               DAG.getTargetConstant(0x3F, DL, MVT::i32)),
-            0);
+        SDValue M0Val =
+            SDValue(DAG.getMachineNode(
+                        AMDGPU::S_AND_B32, DL, MVT::i32, Op->getOperand(2),
+                        DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+                    0);
         Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
       } else
         Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
@@ -12566,12 +12589,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
         BarVal = C->getZExtValue();
       else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
-        if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(
-                *GA->getGlobal()))
+        if (auto Addr = AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+                *GA->getGlobal(), AMDGPUAS::BARRIER))
           BarVal = *Addr + GA->getOffset();
 
       if (BarVal) {
-        unsigned BarID = (*BarVal >> 4) & 0x3F;
+        unsigned BarID = *BarVal & 0x3F;
         Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
         Ops.push_back(Chain);
         auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
@@ -12591,12 +12614,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                        ? AMDGPU::S_BARRIER_INIT_M0
                        : AMDGPU::S_BARRIER_SIGNAL_M0;
-    // extract the BarrierID from bits 4-9 of BarOp
-    SDValue BarID;
-    BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
-                        DAG.getShiftAmountConstant(4, MVT::i32, DL));
-    BarID =
-        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
+    // extract the BarrierID from bits 0-5 of BarOp
+    SDValue BarID =
+        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarOp,
                                    DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                 0);
     // Member count should be put into M0[ShAmt:+6]
@@ -12641,8 +12661,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
         break;
       }
-      // extract the BarrierID from bits 4-9 of the immediate
-      unsigned BarID = (BarVal >> 4) & 0x3F;
+      // extract the BarrierID from bits 0-5 of the immediate
+      unsigned BarID = BarVal & 0x3F;
       SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
       Ops.push_back(K);
       Ops.push_back(Chain);
@@ -12657,12 +12677,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
         break;
       }
-      // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
-      SDValue M0Val;
-      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
-                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
-      M0Val =
-          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+      // extract the BarrierID from bits 0-5 of BarOp, copy to M0[5:0]
+      SDValue M0Val =
+          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarOp,
                                      DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                   0);
       Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 31f927709e682..95afe4f075dfb 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -596,7 +596,7 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32DataTypes: RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16]>;
-def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6]>;
+def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6, p15]>;
 def Reg32Types : RegisterTypes<!listconcat(Reg32DataTypes.types, Reg32PtrTypes.types)>;
 def Reg64DataTypes: RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16]>;
 def Reg64PtrTypes: RegisterTypes<[p0, p1, p4]>;
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..a846ad17ccf3c 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -274,8 +274,9 @@ static std::string computeAMDDataLayout(const Triple &TT) {
   // space 8) which cannot be non-trivilally accessed by LLVM memory operations
   // like getelementptr.
   return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
-         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
-         "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
+         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32"
+         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:"
+         "512-"
          "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
 }
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 6e75d4ead8cdd..d24fe03993482 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -160,10 +160,10 @@ define i32 @s_get_barrier_state(i32 %bar) {
 }
 
 ; CHECK-LABEL: for function 's_get_named_barrier_state':
-; CHECK: DIVERGENT: ptr addrspace(3) %bar
+; CHECK: DIVERGENT: ptr addrspace(15) %bar
 ; CHECK-NOT: DIVERGENT
-define i32 @s_get_named_barrier_state(ptr addrspace(3) %bar) {
-  %result = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %bar)
+define i32 @s_get_named_barrier_state(ptr addrspace(15) %bar) {
+  %result = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %bar)
   ret i32 %result
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
new file mode 100644
index 0000000000000..34f9a51a0455c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
@@ -0,0 +1,442 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s                         | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s    | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s                         | FileCheck -check-prefixes=GFX1030,GFX1030-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s    | FileCheck -check-prefixes=GFX1030,GFX1030-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s                         | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s    | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s                         | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s    | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+define amdgpu_kernel void @barrier_to_generic(ptr addrspace(15) %bar, ptr %out) {
+; GFX942-SDAG-LABEL: barrier_to_generic:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_add_i32 s4, s0, 0x802000
+; GFX942-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-SDAG-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX942-SDAG-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: barrier_to_generic:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_add_u32 s0, s6, 0x802000
+; GFX942-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX942-GISEL-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_to_generic:
+; GFX1030-SDAG:       ; %bb.0:
+; GFX1030-SDAG-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT:    s_clause 0x1
+; GFX1030-SDAG-NEXT:    s_load_dword s0, s[8:9], 0x0
+; GFX1030-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x8
+; GFX1030-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT:    s_add_i32 s4, s0, 0x802000
+; GFX1030-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1030-SDAG-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT:    s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_to_generic:
+; GFX1030-GISEL:       ; %bb.0:
+; GFX1030-GISEL-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT:    s_clause 0x1
+; GFX1030-GISEL-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX1030-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x8
+; GFX1030-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1030-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT:    s_add_u32 s0, s4, 0x802000
+; GFX1030-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-GISEL-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT:    s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_to_generic:
+; GFX1200-SDAG:       ; %bb.0:
+; GFX1200-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1200-SDAG-NEXT:    s_clause 0x1
+; GFX1200-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1200-SDAG-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX1200-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT:    s_add_co_i32 s4, s0, 0x802000
+; GFX1200-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-SDAG-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1200-SDAG-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX1200-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-SDAG-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT:    s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_to_generic:
+; GFX1200-GISEL:       ; %bb.0:
+; GFX1200-GISEL-NEXT:    s_clause 0x1
+; GFX1200-GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x0
+; GFX1200-GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX1200-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1200-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT:    s_add_co_u32 s0, s6, 0x802000
+; GFX1200-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-GISEL-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-GISEL-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: barrier_to_generic:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT:    s_clause 0x1
+; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8 nv
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s4, s0, 0x802000
+; GFX1250-SDAG-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-SDAG-NEXT:    flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: barrier_to_generic:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT:    s_clause 0x1
+; GFX1250-GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8 nv
+; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s6, 0x802000
+; GFX1250-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT:    flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %res = addrspacecast ptr addrspace(15) %bar to ptr
+  store ptr %res, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @barrier_gv_to_generic(ptr %out) {
+; GFX942-SDAG-LABEL: barrier_gv_to_generic:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0x802001
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: barrier_gv_to_generic:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0x802001
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1030-SDAG:       ; %bb.0:
+; GFX1030-SDAG-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v0, 0x802001
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT:    s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1030-GISEL:       ; %bb.0:
+; GFX1030-GISEL-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1030-GISEL-NEXT:    s_mov_b32 s0, 0x802001
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT:    s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1200-SDAG:       ; %bb.0:
+; GFX1200-SDAG-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1200-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1200-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v0, 0x802001 :: v_dual_mov_b32 v1, s1
+; GFX1200-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-SDAG-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT:    s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1200-GISEL:       ; %bb.0:
+; GFX1200-GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1200-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1200-GISEL-NEXT:    s_mov_b32 s0, 0x802001
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-GISEL-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0x802001
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, 0x802001
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %res = addrspacecast ptr addrspace(15) @bar to ptr
+  store ptr %res, ptr %out
+  ret void
+}
+
+
+define amdgpu_kernel void @barrier_null_to_generic(ptr %out) {
+; GFX942-LABEL: barrier_null_to_generic:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_null_to_generic:
+; GFX1030-SDAG:       ; %bb.0:
+; GFX1030-SDAG-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1030-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1030-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1030-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT:    s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_null_to_generic:
+; GFX1030-GISEL:       ; %bb.0:
+; GFX1030-GISEL-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1030-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX1030-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1030-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT:    s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_null_to_generic:
+; GFX1200-SDAG:       ; %bb.0:
+; GFX1200-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX1200-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s0
+; GFX1200-SDAG-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT:    s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_null_to_generic:
+; GFX1200-GISEL:       ; %bb.0:
+; GFX1200-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX1200-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1200-GISEL-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: barrier_null_to_generic:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 nv
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT:    s_endpgm
+  %res = addrspacecast ptr addrspace(15) null to ptr
+  store ptr %res, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @generic_to_barrier(ptr %generic, ptr %out) {
+; GFX942-SDAG-LABEL: generic_to_barrier:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT:    s_add_i32 s2, s0, 0xff7fe000
+; GFX942-SDAG-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: generic_to_barrier:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_add_i32 s4, s0, 0xff7fe000
+; GFX942-GISEL-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-GISEL-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: generic_to_barrier:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_add_u32 s12, s12, s17
+; GFX1030-NEXT:    s_addc_u32 s13, s13, 0
+; GFX1030-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    s_add_i32 s4, s0, 0xff7fe000
+; GFX1030-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1030-NEXT:    flat_store_dword v[0:1], v2
+; GFX1030-NEXT:    s_endpgm
+;
+; GFX1200-SDAG-LABEL: generic_to_barrier:
+; GFX1200-SDAG:       ; %bb.0:
+; GFX1200-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT:    s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1200-SDAG-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1200-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1200-SDAG-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1200-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1200-SDAG-NEXT:    flat_store_b32 v[0:1], v2
+; GFX1200-SDAG-NEXT:    s_endpgm
+;
+; GFX1200-GISEL-LABEL: generic_to_barrier:
+; GFX1200-GISEL:       ; %bb.0:
+; GFX1200-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT:    s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1200-GISEL-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1200-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1200-GISEL-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1200-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s0
+; GFX1200-GISEL-NEXT:    flat_store_b32 v[0:1], v2
+; GFX1200-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: generic_to_barrier:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1250-SDAG-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-SDAG-NEXT:    flat_store_b32 v0, v1, s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: generic_to_barrier:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1250-GISEL-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-GISEL-NEXT:    flat_store_b32 v1, v0, s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %res = addrspacecast ptr %generic to ptr addrspace(15)
+  store ptr addrspace(15) %res, ptr %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1200: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
index 0f9cfb48b577d..4a7d7330200b7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
@@ -6,28 +6,28 @@
 ; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
 
 %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
 @lds1 = internal addrspace(3) global [1 x i8] poison, align 4
 
 ;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]]
 ; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
 ;.
 define void @func1() #0 {
 ; CHECK-LABEL: define void @func1(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   ret void
 }
@@ -35,14 +35,14 @@ define void @func1() #0 {
 define void @func2() #0 {
 ; CHECK-LABEL: define void @func2(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   store i8 7, ptr addrspace(3) @lds1, align 4
   ret void
@@ -52,20 +52,20 @@ define amdgpu_kernel void @kernel1() #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel1(
 ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT:    [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+; CHECK-NEXT:    [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    call void @func1()
 ; CHECK-NEXT:    call void @func2()
 ; CHECK-NEXT:    store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
-  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
   call void @llvm.amdgcn.s.barrier()
   call void @func1()
   call void @func2()
@@ -77,15 +77,15 @@ define amdgpu_kernel void @kernel2() #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel2(
 ; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    call void @func2()
 ; CHECK-NEXT:    store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   call void @func2()
   store i8 10, ptr addrspace(3) @lds1, align 4
@@ -95,13 +95,13 @@ define amdgpu_kernel void @kernel2() #0 {
 declare void @llvm.amdgcn.s.barrier() #1
 declare void @llvm.amdgcn.s.barrier.wait(i16) #1
 declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
 declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
 declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
@@ -113,8 +113,8 @@ attributes #2 = { nounwind readnone }
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
-; CHECK: [[META2]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 7, i32 8}
+; CHECK: [[META2]] = !{i32 1, i32 2}
 ; CHECK: [[META3]] = !{i32 0, i32 1}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
index 16533e0a204d4..905b1174db711 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
@@ -4,24 +4,24 @@
 ; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario,
 ; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
 %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
 @lds1 = internal addrspace(3) global [1 x i8] poison, align 4
 
 ;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
 ;
 define void @bar() #0 {
 ; CHECK-LABEL: define void @bar(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
 ; CHECK:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK:    store i8 7, ptr addrspace(1) {{.*}}, align 4
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   store i8 7, ptr addrspace(3) @lds1, align 4
   ret void
@@ -32,32 +32,21 @@ define amdgpu_kernel void @barkernel() #0 {
 ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
 ; CHECK:    {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}})
 ; CHECK:    call void @llvm.amdgcn.s.barrier()
-; CHECK:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
 ; CHECK:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK:    call void @bar()
 ; CHECK:    store i8 10, ptr addrspace(1) {{.*}}, align 4
 ; CHECK:    call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}})
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   call void @bar()
   store i8 10, ptr addrspace(3) @lds1, align 4
   ret void
 }
 
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
-
 attributes #0 = { nounwind sanitize_address }
 attributes #1 = { convergent nounwind }
 attributes #2 = { nounwind readnone }
@@ -68,6 +57,6 @@ attributes #2 = { nounwind readnone }
 ; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address }
 ; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" }
 ;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 1, i32 2}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
index ddf58900518b5..fd11c076eebcb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
@@ -4,37 +4,37 @@
 
 %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
 
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
 
 ;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
 ;.
 define void @func1() {
 ; CHECK-LABEL: define void @func1() {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   ret void
 }
 
 define void @func2() {
 ; CHECK-LABEL: define void @func2() {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   ret void
 }
@@ -42,19 +42,19 @@ define void @func2() {
 define amdgpu_kernel void @kernel1() #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel1(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT:    [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+; CHECK-NEXT:    [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    call void @func1()
 ; CHECK-NEXT:    call void @func2()
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
-  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+  %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
   call void @llvm.amdgcn.s.barrier()
   call void @func1()
   call void @func2()
@@ -64,14 +64,14 @@ define amdgpu_kernel void @kernel1() #0 {
 define amdgpu_kernel void @kernel2() #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel2(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    call void @func2()
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
 
   call void @func2()
@@ -81,13 +81,13 @@ define amdgpu_kernel void @kernel2() #0 {
 declare void @llvm.amdgcn.s.barrier() #1
 declare void @llvm.amdgcn.s.barrier.wait(i16) #1
 declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
 declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
 declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
@@ -96,7 +96,7 @@ attributes #2 = { nounwind readnone }
 ; CHECK: attributes #[[ATTR0]] = { nounwind }
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
-; CHECK: [[META2]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 7, i32 8}
+; CHECK: [[META2]] = !{i32 1, i32 2}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index b86f621b9e0cf..489061d329320 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -489,8 +489,8 @@ attributes #1 = { nounwind }
 ; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
 ; HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
 ;.
-; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10}
-; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10}
-; HSA: [[META2]] = !{i32 2, i32 10}
-; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
+; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 16}
+; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 16}
+; HSA: [[META2]] = !{i32 2, i32 16}
+; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
index ccc434842e498..ccb34288d5845 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
@@ -153,7 +153,7 @@ attributes #0 = { "amdgpu-no-flat-scratch-init" }
 ; GFX10: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
 ; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
 ;.
-; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
 ;.
-; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 1e8a33781ee8d..bf2e44c641664 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -877,15 +877,15 @@ define amdgpu_kernel void @with_inline_asm() {
 ; GFX10: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
 ; GFX10: attributes #[[ATTR4]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
 ;.
-; GFX9: [[META0]] = !{i32 2, i32 10}
-; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 2, i32 16}
+; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
 ;.
-; GFX10: [[META0]] = !{i32 2, i32 10}
-; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 2, i32 16}
+; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
index d91b2117c7ad9..f699ba70bb935 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
@@ -633,7 +633,7 @@ define amdgpu_kernel void @no_alias_addr_space_has_meta(ptr addrspace(3) %sptr,
 !0 = !{i32 2, i32 3, i32 4, i32 10}
 
 ;.
-; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 10}
-; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 10}
+; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 16}
+; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 16}
 ; CHECK: [[META2]] = !{i32 2, i32 3, i32 4, i32 10}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll b/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
new file mode 100644
index 0000000000000..fa8e079e55b7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
@@ -0,0 +1,16 @@
+; Check we cannot dereference a barrier GV.
+
+; RUN: not --crash llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s                       2>&1  | FileCheck -check-prefixes=DAGISEL %s
+; RUN: not         llc -O0 -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s  2>&1  | FileCheck -check-prefixes=GISEL %s
+
+; TODO: It'd be nicer to have a Verifier diagnostic for this.
+
+; DAGISEL: LLVM ERROR: {{.*}} store<(store (s32) into @bar, addrspace 15)>
+; GISEL:   LLVM ERROR: {{.*}} G_LOAD %6:sgpr(p15) :: (load (s32) from @bar, addrspace 15) (in function: func1)
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+define amdgpu_kernel void @func1() {
+  %val = load i32, ptr addrspace(15) @bar
+  store i32 %val, ptr addrspace(15) @bar
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
index f573a3180c067..5f21822874683 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
@@ -7,10 +7,9 @@
 ; 3. group_segment_fixed_size = 0 (linker patches it)
 ; 4. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds)
 
- at bar = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
 
 ; CHECK-LABEL: kernel:
-; CHECK: s_lshr_b32 s{{[0-9]+}}, __amdgpu_named_barrier.bar{{[^ @]*}}@abs32 at lo, 4
 ; CHECK: s_barrier_join m0
 ; CHECK: s_barrier_signal m0
 ; CHECK: s_barrier_wait 1
@@ -26,8 +25,6 @@
 ; CHECK:        .amdgpu_call helper
 ; CHECK:      .end_amdgpu_info
 
-; CHECK:      .amdgpu_lds __amdgpu_named_barrier.bar{{[^ ,]*}}, 32, 4
-
 ; ELF:      Section {
 ; ELF:        Name: .amdgpu.info
 ; ELF:        Type: SHT_PROGBITS
@@ -39,16 +36,16 @@
 ; ELF-DAG: R_AMDGPU_ABS64 helper
 
 define amdgpu_kernel void @kernel() {
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   call void @helper()
   ret void
 }
 
 declare void @helper()
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #0
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #0
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #0
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #0
 declare void @llvm.amdgcn.s.barrier.wait(i16) #0
 
 attributes #0 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
index 62b32901d281d..9ec6fc523dc63 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
@@ -6,21 +6,21 @@
 ; 2. AMDGPULowerModuleLDS does not handle named barriers at all
 ; 3. amdgpu.lds.uses does NOT contain barrier entries
 
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
 @lds = internal addrspace(3) global [4 x i32] poison, align 4
 
 ; Internal named barrier becomes external with a module-unique hash suffix.
-; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(3) global target("amdgcn.named.barrier", 0)
+; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(15) global target("amdgcn.named.barrier", 0)
 ; CHECK-NOT: !absolute_symbol
 ; Regular LDS is packed into the per-function struct (external, for linker).
 ; CHECK: @__amdgpu_lds.kernel = external dso_local addrspace(3) global %__amdgpu_lds.kernel.t, align 16
 
 define amdgpu_kernel void @kernel(i32 %idx) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel(
-; CHECK:         call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @[[BAR]])
-; CHECK:         call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @[[BAR]], i32 3)
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
+; CHECK:         call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @[[BAR]])
+; CHECK:         call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @[[BAR]], i32 3)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   %gep = getelementptr [4 x i32], ptr addrspace(3) @lds, i32 0, i32 %idx
   store i32 42, ptr addrspace(3) %gep, align 4
@@ -29,7 +29,7 @@ define amdgpu_kernel void @kernel(i32 %idx) {
 
 ; Named barrier metadata: (barrier_sym, func1, ...) -- emitted by ExecSync.
 ; CHECK-DAG: !amdgpu.named_barrier.uses = !{[[BAR_MD:![0-9]+]]}
-; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(3) @[[BAR]], ptr @kernel}
+; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(15) @[[BAR]], ptr @kernel}
 ; LDS metadata must have exactly one entry (the LDS struct), no barrier entries.
 ; CHECK-DAG: !amdgpu.lds.uses = !{[[LDS_MD:![0-9]+]]}
 ; CHECK-DAG: [[LDS_MD]] = !{ptr @kernel, ptr addrspace(3) @__amdgpu_lds.kernel}
diff --git a/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll b/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
new file mode 100644
index 0000000000000..12be207e7e884
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
@@ -0,0 +1,31 @@
+; RUN: split-file %s %t
+
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-kernel.ll 2>&1   | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-kernel.ll 2>&1                | FileCheck %s
+
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250  -o - %t/null-named-barrier-func.ll 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-func.ll 2>&1               | FileCheck %s
+
+; CHECK: named barrier GV cannot be used to represent the NULL named barrier
+
+;--- null-named-barrier-kernel.ll
+
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+
+define amdgpu_kernel void @func1() {
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+    ret void
+}
+
+!0 = !{ i32 0, i32 1 }
+
+;--- null-named-barrier-func.ll
+
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+
+define void @func1() {
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+    ret void
+}
+
+!0 = !{ i32 0, i32 1 }
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
new file mode 100644
index 0000000000000..ae18ab914cc49
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck  %s
+
+; The absolute_address of the GV can never be null.
+
+; CHECK: LLVM ERROR: named barrier GV cannot be used to represent the NULL named barrie
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+
+define void @func() {
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+    ret void
+}
+
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
new file mode 100644
index 0000000000000..ae5731b875f37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck  %s
+
+; Check what happens when the type or the AS of the barrier GV is wrong.
+; Using such a GV in a barrier intrinsic would be UB of course, but we should not crash.
+
+; @addrspacecasted doesn't trip up on this because we are not doing any unsupported
+; operation.
+;
+; CHECK: in function wrong_type void (): Unsupported use of BARRIER address space!
+
+ at bar = internal global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global i32 poison
+
+define void @addrspacecasted() {
+    %bar.ascast = addrspacecast ptr @bar to ptr addrspace(15)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %bar.ascast)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar.ascast, i32 7)
+    ret void
+}
+
+define void @wrong_type() {
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+    ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 78c134827be4d..d3b5dcaebfb68 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -3,26 +3,30 @@
 
 %class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
 
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
 
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
-; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
-; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; Test using the workgroup barrier with the GV.
+ at wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK-NEXT: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK-NEXT: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK-NEXT: @wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META3:![0-9]+]]
 
 ; SOUT:        .set .Lfunc1.num_named_barrier, 7
 define void @func1() {
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     ret void
 }
 
 ; SOUT:        .set .Lfunc2.num_named_barrier, 6
 define void @func2() {
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     ret void
 }
@@ -30,44 +34,44 @@ define void @func2() {
 ; SOUT:                .amdhsa_named_barrier_count 2
 ; SOUT:        .set .Lkernel1.num_named_barrier, max(4, .Lfunc1.num_named_barrier, .Lfunc2.num_named_barrier)
 define amdgpu_kernel void @kernel1() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
-    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
     call void @llvm.amdgcn.s.barrier()
     call void @func1()
     call void @func2()
     ret void
 }
 
-; SOUT:                .amdhsa_named_barrier_count 2
+; SOUT:      .amdhsa_kernel kernel2
+; SOUT:        .amdhsa_named_barrier_count 2
 ; SOUT:        .set .Lkernel2.num_named_barrier, max(4, .Lfunc2.num_named_barrier)
 define amdgpu_kernel void @kernel2() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
 
     call void @func2()
     ret void
 }
 
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+; SOUT:      .amdhsa_kernel wgbarr_as_gv
+; SOUT:        .amdhsa_named_barrier_count 0
+define amdgpu_kernel void @wgbarr_as_gv() {
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @wgbarr, i32 7)
+    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+    ret void
+}
 
 attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
 attributes #2 = { nounwind readnone }
 
-; CHECK: !0 = !{i32 8396880, i32 8396881}
-; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
-; CHECK-NEXT: !2 = !{i32 8396816, i32 8396817}
+!0 = !{i32 -1, i32 0}
+
+; CHECK:        [[META0:![0-9]+]] = !{i32 5, i32 6}
+; CHECK-NEXT:   [[META1:![0-9]+]] = !{i32 7, i32 8}
+; CHECK-NEXT:   [[META2:![0-9]+]] = !{i32 1, i32 2}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 495b7b02223e5..3dafedaf22813 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -2,9 +2,12 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+; Test using the workgroup barrier with the GV.
+ at wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
 
 define void @func1() {
 ; GFX12-SDAG-LABEL: func1:
@@ -33,8 +36,8 @@ define void @func1() {
 ; GFX12-GISEL-NEXT:    s_barrier_signal m0
 ; GFX12-GISEL-NEXT:    s_barrier_wait 1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     ret void
 }
@@ -66,13 +69,13 @@ define void @func2() {
 ; GFX12-GISEL-NEXT:    s_barrier_signal m0
 ; GFX12-GISEL-NEXT:    s_barrier_wait 1
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     ret void
 }
 
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
 ; GFX12-SDAG-LABEL: kernel1:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -85,9 +88,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_lshr_b32 s2, s2, 4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_and_b32 s2, s2, 63
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_or_b32 s3, 0x90000, s2
 ; GFX12-SDAG-NEXT:    s_cmp_eq_u32 0, 0
 ; GFX12-SDAG-NEXT:    s_mov_b32 m0, s3
@@ -141,9 +143,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX12-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_lshr_b32 s0, s0, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_or_b32 s1, s0, 0x90000
 ; GFX12-GISEL-NEXT:    s_cmp_eq_u32 0, 0
 ; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
@@ -188,17 +189,17 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX12-GISEL-NEXT:    s_get_barrier_state s0, -1
 ; GFX12-GISEL-NEXT:    s_endpgm
-    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
-    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) @bar, i32 12)
+    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %in, i32 9)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 12)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %in, i32 9)
     call void @llvm.amdgcn.s.barrier.signal(i32 -1)
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %in)
     %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     call void @llvm.amdgcn.s.barrier.leave(i16 1)
-    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
-    %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar)
+    %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %in)
     call void @llvm.amdgcn.s.barrier()
     call void @func1()
     call void @func2()
@@ -206,7 +207,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
     ret void
 }
 
-define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
 ; GFX12-SDAG-LABEL: kernel2:
 ; GFX12-SDAG:       ; %bb.0:
 ; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
@@ -250,8 +251,8 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX12-GISEL-NEXT:    s_endpgm
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
-    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
 
     call void @func2()
@@ -268,11 +269,11 @@ define void @signal_var_cnt0_const_bar() {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_barrier_signal 1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 0)
     ret void
 }
 
-define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
+define void @signal_var_cnt0_dynamic_bar(ptr addrspace(15) inreg %bar) {
 ; GFX12-LABEL: signal_var_cnt0_dynamic_bar:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -280,12 +281,10 @@ define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s0, s0, 4
-; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX12-NEXT:    s_and_b32 m0, s0, 63
 ; GFX12-NEXT:    s_barrier_signal m0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
-    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %bar, i32 0)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar, i32 0)
     ret void
 }
 
@@ -308,17 +307,41 @@ define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrs
   ret void
 }
 
+
+define amdgpu_kernel void @wgbarr_as_gv() {
+; GFX12-LABEL: wgbarr_as_gv:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 m0, 0x7003f
+; GFX12-NEXT:    s_barrier_signal m0
+; GFX12-NEXT:    s_barrier_wait -1
+; GFX12-NEXT:    s_endpgm
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @wgbarr, i32 7)
+    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+    ret void
+}
+
+define amdgpu_kernel void @null_barrier() {
+; GFX12-LABEL: null_barrier:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_barrier_join 0
+; GFX12-NEXT:    s_endpgm
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) null)
+    ret void
+}
+
 declare void @llvm.amdgcn.s.barrier() #1
 declare void @llvm.amdgcn.s.barrier.wait(i16) #1
 declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
 declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
 declare void @llvm.amdgcn.s.barrier.leave(i16) #1
 declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
 attributes #2 = { nounwind readnone }
+
+!0 = !{i32 -1, i32 0}
diff --git a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
index 2ecb380e6c37e..5d81f6b649de9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
@@ -2,9 +2,9 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-SDAG %s
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-GISEL %s
 
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
 
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
 ; GFX1250-SDAG-LABEL: kernel1:
 ; GFX1250-SDAG:       ; %bb.0:
 ; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -12,8 +12,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, 1
 ; GFX1250-SDAG-NEXT:    s_wakeup_barrier m0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT:    s_lshr_b32 s0, s0, 4
-; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_and_b32 m0, s0, 63
 ; GFX1250-SDAG-NEXT:    s_wakeup_barrier m0
 ; GFX1250-SDAG-NEXT:    s_endpgm
@@ -24,18 +22,16 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX1250-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c nv
 ; GFX1250-GISEL-NEXT:    s_wakeup_barrier 1
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT:    s_lshr_b32 s0, s0, 4
-; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-NEXT:    s_and_b32 m0, s0, 63
 ; GFX1250-GISEL-NEXT:    s_wakeup_barrier m0
 ; GFX1250-GISEL-NEXT:    s_endpgm
-    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)
-    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in)
+    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) @bar)
+    call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %in)
     ret void
 }
 
 
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index cb52a639d12bd..b65f35394125d 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -59,5 +59,5 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
 ;.
-; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
 ;.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index ba185dc27dc7e..29b0ec7dbd151 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -405,16 +405,17 @@ def ROCDL_WaveBarrierOp : ROCDL_ConcreteNonMemIntrOp<"wave.barrier", [], 0> {
 
 def ROCDLGlobalBuffer : LLVM_PointerInAddressSpace<1>;
 def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
+def ROCDLExecSync : LLVM_PointerInAddressSpace<15>;
 
 def ROCDL_BarrierInitOp : ROCDL_IntrOp<"s.barrier.init", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
-  Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$memberCnt)> {
+  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
   let description = [{
     Available on gfx1250+.
 
     Example:
     ```mlir
     // Initialize a named barrier with member count.
-    rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+    rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
     ```
   }];
   let results = (outs);
@@ -437,7 +438,7 @@ def ROCDL_BarrierSignalOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal", [], 0
 }
 
 def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
-  Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$memberCnt)> {
+  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
   let description = [{
     Available on gfx1250+.
 
@@ -446,7 +447,7 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
     Example:
     ```mlir
     // Signal a named barrier with variable ID.
-    rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+    rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
     ```
   }];
   let results = (outs);
@@ -454,14 +455,14 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
 }
 
 def ROCDL_BarrierJoinOp : ROCDL_IntrOp<"s.barrier.join", [], [], [], 0>,
-  Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
   let description = [{
     Available on gfx1250+.
 
     Example:
     ```mlir
     // Join a named barrier.
-    rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+    rocdl.s.barrier.join %ptr : !llvm.ptr<15>
     ```
   }];
   let results = (outs);
@@ -529,14 +530,14 @@ def ROCDL_GetBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.barrier.state",
 }
 
 def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barrier.state", [], 1, [], []>,
-  Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
   let description = [{
     Available on gfx1250+.
 
     Example:
     ```mlir
     // Query named barrier state by pointer.
-    %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+    %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
     ```
   }];
   let results = (outs I32:$res);
@@ -544,7 +545,7 @@ def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barri
 }
 
 def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0, [], []>,
-  Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
   let description = [{
     Wakes up waves associated with a given named barrier. Note, This op does not release waves waiting
     at the barrier. It just signal other waves in the same work-group waiting on the indicated named barrier
@@ -554,7 +555,7 @@ def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0
     Example:
     ```mlir
     // Wake up waves waiting on a named barrier.
-    rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+    rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
     ```
   }];
   let assemblyFormat = "$ptr attr-dict `:` qualified(type($ptr))";
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index dd0b00faf7f1f..505bff44a6f61 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1216,10 +1216,10 @@ llvm.func @rocdl.s.barrier() {
   llvm.return
 }
 
-llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.barrier.init
-  // CHECK: rocdl.s.barrier.init %{{.*}} member_cnt = 1 : !llvm.ptr<3>
-  rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+  // CHECK: rocdl.s.barrier.init %{{.*}} member_cnt = 1 : !llvm.ptr<15>
+  rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
   llvm.return
 }
 
@@ -1230,17 +1230,17 @@ llvm.func @rocdl.s.barrier.signal() {
   llvm.return
 }
 
-llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.barrier.signal.var
-  // CHECK: rocdl.s.barrier.signal.var %{{.*}} member_cnt = 1 : !llvm.ptr<3>
-  rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+  // CHECK: rocdl.s.barrier.signal.var %{{.*}} member_cnt = 1 : !llvm.ptr<15>
+  rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
   llvm.return
 }
 
-llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.barrier.join
-  // CHECK: rocdl.s.barrier.join %{{.*}} : !llvm.ptr<3>
-  rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+  // CHECK: rocdl.s.barrier.join %{{.*}} : !llvm.ptr<15>
+  rocdl.s.barrier.join %ptr : !llvm.ptr<15>
   llvm.return
 }
 
@@ -1272,17 +1272,17 @@ llvm.func @rocdl.s.get.barrier.state() {
   llvm.return
 }
 
-llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.get.named.barrier.state
-  // CHECK: rocdl.s.get.named.barrier.state %{{.*}} : !llvm.ptr<3> -> i32
-  %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+  // CHECK: rocdl.s.get.named.barrier.state %{{.*}} : !llvm.ptr<15> -> i32
+  %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
   llvm.return
 }
 
-llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.wakeup.barrier
-  // CHECK: rocdl.s.wakeup.barrier %{{.*}} : !llvm.ptr<3>
-  rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+  // CHECK: rocdl.s.wakeup.barrier %{{.*}} : !llvm.ptr<15>
+  rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
   llvm.return
 }
 
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index f3233f3e50bce..0a952356a20fb 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -269,10 +269,10 @@ llvm.func @rocdl.wave_barrier() {
   llvm.return
 }
 
-llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.barrier.init
-  // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %{{.*}}, i32 1)
-  rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+  // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %{{.*}}, i32 1)
+  rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
   llvm.return
 }
 
@@ -283,17 +283,17 @@ llvm.func @rocdl.s.barrier.signal() {
   llvm.return
 }
 
-llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.barrier.signal.var
-  // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %{{.*}}, i32 1)
-  rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+  // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %{{.*}}, i32 1)
+  rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
   llvm.return
 }
 
-llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.barrier.join
-  // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %{{.*}})
-  rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+  // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %{{.*}})
+  rocdl.s.barrier.join %ptr : !llvm.ptr<15>
   llvm.return
 }
 
@@ -325,17 +325,17 @@ llvm.func @rocdl.s.get.barrier.state() {
   llvm.return
 }
 
-llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.get.named.barrier.state
-  // CHECK: %{{.*}} = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %{{.*}})
-  %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+  // CHECK: %{{.*}} = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %{{.*}})
+  %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
   llvm.return
 }
 
-llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<15>) {
   // CHECK-LABEL: rocdl.s.wakeup.barrier
-  // CHECK: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %{{.*}})
-  rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+  // CHECK: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %{{.*}})
+  rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
   llvm.return
 }
 

>From 937f0051464e66b41a1f4473a7ea7c9f15d82678 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 1 Jun 2026 10:49:30 +0200
Subject: [PATCH 2/6] Fix MLIR

---
 .../CodeGen/AMDGPU/s-barrier-id-allocation.ll | 42 +++++++++----------
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  | 12 +++---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  2 +-
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      |  4 +-
 .../gpu-to-rocdl-barriers-gfx12.mlir          |  8 ++--
 5 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll
index 6df651b033a5f..892be0e7a3a32 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll
@@ -1,42 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s
 
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
 
 ;.
-; CHECK: @bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0]]
+; CHECK: @bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0]]
 ;.
 define void @func1() {
 ; CHECK-LABEL: define void @func1() {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   ret void
 }
 
 define void @func2() {
 ; CHECK-LABEL: define void @func2() {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
 ; CHECK-NEXT:    ret void
 ;
-  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+  call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+  call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
   call void @llvm.amdgcn.s.barrier.wait(i16 1)
   ret void
 }
 
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
 ; CHECK-NEXT:    call void @func1()
 ; CHECK-NEXT:    [[STATE3:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
 ; CHECK-NEXT:    ret void
@@ -46,9 +46,9 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
   ret void
 }
 
-define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(15) %in) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
 ; CHECK-NEXT:    call void @func1()
 ; CHECK-NEXT:    ret void
 ;
@@ -56,9 +56,9 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
   ret void
 }
 
-define amdgpu_kernel void @kernel3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel3(ptr addrspace(1) %out, ptr addrspace(15) %in) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel3(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
 ; CHECK-NEXT:    call void @func2()
 ; CHECK-NEXT:    [[STATE3:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
 ; CHECK-NEXT:    ret void
@@ -68,9 +68,9 @@ define amdgpu_kernel void @kernel3(ptr addrspace(1) %out, ptr addrspace(3) %in)
   ret void
 }
 
-define amdgpu_kernel void @kernel4(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel4(ptr addrspace(1) %out, ptr addrspace(15) %in) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel4(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
 ; CHECK-NEXT:    call void @func2()
 ; CHECK-NEXT:    ret void
 ;
@@ -82,5 +82,5 @@ define amdgpu_kernel void @kernel4(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ;.
 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ;.
-; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 1, i32 2}
 ;.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 29b0ec7dbd151..6e72bda93f5bd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -405,10 +405,10 @@ def ROCDL_WaveBarrierOp : ROCDL_ConcreteNonMemIntrOp<"wave.barrier", [], 0> {
 
 def ROCDLGlobalBuffer : LLVM_PointerInAddressSpace<1>;
 def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
-def ROCDLExecSync : LLVM_PointerInAddressSpace<15>;
+def ROCDLBarrier : LLVM_PointerInAddressSpace<15>;
 
 def ROCDL_BarrierInitOp : ROCDL_IntrOp<"s.barrier.init", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
-  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
+  Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr, I32Attr:$memberCnt)> {
   let description = [{
     Available on gfx1250+.
 
@@ -438,7 +438,7 @@ def ROCDL_BarrierSignalOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal", [], 0
 }
 
 def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
-  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
+  Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr, I32Attr:$memberCnt)> {
   let description = [{
     Available on gfx1250+.
 
@@ -455,7 +455,7 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
 }
 
 def ROCDL_BarrierJoinOp : ROCDL_IntrOp<"s.barrier.join", [], [], [], 0>,
-  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
+  Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr)> {
   let description = [{
     Available on gfx1250+.
 
@@ -530,7 +530,7 @@ def ROCDL_GetBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.barrier.state",
 }
 
 def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barrier.state", [], 1, [], []>,
-  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
+  Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr)> {
   let description = [{
     Available on gfx1250+.
 
@@ -545,7 +545,7 @@ def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barri
 }
 
 def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0, [], []>,
-  Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
+  Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr)> {
   let description = [{
     Wakes up waves associated with a given named barrier. Note, This op does not release waves waiting
     at the barrier. It just signal other waves in the same work-group waiting on the indicated named barrier
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 90e15b1680446..3b3cc6bd81fec 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -4553,7 +4553,7 @@ void mlir::amdgpu::populateCommonGPUTypeAndAttributeConversions(
       });
   typeConverter.addConversion([](gpu::NamedBarrierType type) {
     return LLVM::LLVMPointerType::get(
-        type.getContext(), ROCDL::ROCDLDialect::kSharedMemoryAddressSpace);
+        type.getContext(), ROCDL::ROCDLDialect::kBarrierAddressSpace);
   });
 }
 
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index a3819df4f8a84..7865a2a10aa0e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -683,7 +683,7 @@ struct GPUInitializeNamedBarrierOpLowering final
 
     auto targetTy = LLVM::LLVMTargetExtType::get(
         rewriter.getContext(), "amdgcn.named.barrier", {}, {0});
-    auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 3);
+    auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 15);
 
     // Build the global detached so SymbolTable::insert can both place it and
     // rename it as needed without creating a transient name conflict in IR.
@@ -691,7 +691,7 @@ struct GPUInitializeNamedBarrierOpLowering final
     auto globalOp = LLVM::GlobalOp::create(
         detachedBuilder, loc, targetTy, /*isConstant=*/false,
         LLVM::Linkage::Internal, "__named_barrier", /*value=*/Attribute(),
-        /*alignment=*/0, /*addrSpace=*/3);
+        /*alignment=*/0, /*addrSpace=*/15);
     // Initialize with poison.
     {
       Region &region = globalOp.getInitializerRegion();
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir
index c6a9574ca43c1..402dcae5e9832 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir
@@ -5,7 +5,7 @@ gpu.module @test_module {
 // CHECK-LABEL: func @named_barrier
 func.func @named_barrier() {
   %member_count = arith.constant 4 : i32
-  // CHECK: %[[ADDR:.*]] = llvm.mlir.addressof @[[NB:__named_barrier[_0-9]*]] : !llvm.ptr<3>
+  // CHECK: %[[ADDR:.*]] = llvm.mlir.addressof @[[NB:__named_barrier[_0-9]*]] : !llvm.ptr<15>
   // CHECK: rocdl.s.barrier.init %[[ADDR]] member_cnt = 4
   %nb = gpu.initialize_named_barrier %member_count : i32 -> !gpu.named_barrier
   // CHECK: llvm.fence syncscope("workgroup") release
@@ -21,10 +21,10 @@ func.func @named_barrier() {
 func.func @two_named_barriers() {
   %c4 = arith.constant 4 : i32
   %c8 = arith.constant 8 : i32
-  // CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @[[NB0:__named_barrier[_0-9]*]] : !llvm.ptr<3>
+  // CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @[[NB0:__named_barrier[_0-9]*]] : !llvm.ptr<15>
   // CHECK: rocdl.s.barrier.init %[[ADDR0]] member_cnt = 4
   %nb0 = gpu.initialize_named_barrier %c4 : i32 -> !gpu.named_barrier
-  // CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @[[NB1:__named_barrier[_0-9]*]] : !llvm.ptr<3>
+  // CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @[[NB1:__named_barrier[_0-9]*]] : !llvm.ptr<15>
   // CHECK: rocdl.s.barrier.init %[[ADDR1]] member_cnt = 8
   %nb1 = gpu.initialize_named_barrier %c8 : i32 -> !gpu.named_barrier
   // CHECK: rocdl.s.barrier.join %[[ADDR0]]
@@ -49,6 +49,6 @@ func.func @cluster_scope() {
 }
 
 // One LDS global per gpu.initialize_named_barrier.
-// CHECK-COUNT-3: llvm.mlir.global internal @__named_barrier{{[_0-9]*}}() {addr_space = 3 : i32} : !llvm.target<"amdgcn.named.barrier", 0>
+// CHECK-COUNT-3: llvm.mlir.global internal @__named_barrier{{[_0-9]*}}() {addr_space = 15 : i32} : !llvm.target<"amdgcn.named.barrier", 0>
 
 }

>From 6662f90ada3de8cd5bfdae6afb7d5558d262a939 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 2 Jun 2026 10:55:44 +0200
Subject: [PATCH 3/6] Fix docs

---
 llvm/docs/AMDGPUUsage.rst                   | 4 ++--
 llvm/include/llvm/Support/AMDGPUAddrSpace.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index fe39f20cea5f6..c2df16a9b2c80 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1371,8 +1371,8 @@ Named barriers are fixed function hardware barrier objects that are available
 in gfx12.5+ in addition to the traditional default barriers.
 
 In LLVM IR, named barriers are represented by global variables of type
-``target("amdgcn.named.barrier", 0)`` in the Execution Synchronization Resources
-address space. Programs in LLVM IR refer to named barriers using pointers.
+``target("amdgcn.named.barrier", 0)`` in the barrier address space.
+Programs in LLVM IR refer to named barriers using pointers.
 
 The following named barrier types are supported in global variables, defined
 recursively:
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 89a6f7f2dc9b2..d72ba0a1415c0 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -92,7 +92,7 @@ enum : unsigned {
   UNKNOWN_ADDRESS_SPACE = ~0u,
 };
 
-/// The BARRIER AS is does not have an aperture in HW, so when converting
+/// The BARRIER AS does not have an aperture in HW, so when converting
 /// BARRIER addresses from/to generic, we represent them as LDS addresses
 /// offset by a large amount so they can never alias with real LDS memory.
 static constexpr unsigned BarrierAddrLDSOffset = 0x802000u;

>From 681d2ed0bf13c559e1a6adf9cde3130dd95465dc Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jun 2026 12:41:32 +0200
Subject: [PATCH 4/6] Address comments, fix rebase

---
 llvm/docs/AMDGPUUsage.rst                        | 8 ++++----
 mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.td | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index c2df16a9b2c80..59e4617c4e881 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1200,11 +1200,11 @@ supported for the ``amdgcn`` target.
   * Never alias with any other pointers outside this address space.
   * Cannot be dereferenced.
   * Can only be consumed by intrinsics.
-  * Are always uniform.
 
-  Pointer are 32 bits and directly correspond to valid barrier IDs. All barrier pointers must,
-  when interpreted as signed 32 bit integers, have a value corresponding to a valid barrier ID
-  on the target. Otherwise, the behavior is undefined.
+  Pointer are 32 bits and directly correspond to valid barrier IDs. When consumed by an
+  intrinsic, all barrier pointers must, when interpreted as signed 32 bit integers,
+  have a value corresponding to a valid barrier ID on the target.
+  Otherwise, the behavior is undefined
 
   Due to these pointers being a compiler abstraction without a corresponding hardware aperture,
   the back-end handles them as-if they were local pointers with a very large offset as to not
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.td
index 0807cf8cf04a4..2b4002e98caae 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.td
@@ -142,6 +142,8 @@ def ROCDL_Dialect : Dialect {
     static constexpr unsigned kConstantMemoryAddressSpace = 4;
     /// The address space value that represents private memory.
     static constexpr unsigned kPrivateMemoryAddressSpace = 5;
+    /// The address space value that represents barriers.
+    static constexpr unsigned kBarrierAddressSpace = 15;
   }];
 
   let discardableAttrs = (ins

>From 2e599c0423e6af19552ef866fac2e8ee19e1f280 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 12 Jun 2026 13:05:17 +0200
Subject: [PATCH 5/6] Clean-up docs

---
 llvm/docs/AMDGPUUsage.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 59e4617c4e881..e925b18a0e77f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1206,9 +1206,9 @@ supported for the ``amdgcn`` target.
   have a value corresponding to a valid barrier ID on the target.
   Otherwise, the behavior is undefined
 
-  Due to these pointers being a compiler abstraction without a corresponding hardware aperture,
-  the back-end handles them as-if they were local pointers with a very large offset as to not
-  overlap with any addressable local memory.
+  These pointers do not have a corresponding hardware aperture but safe round-tripping
+  through the generic address space is still possible. Attempting to dereference a
+  generic pointer derived from a barrier pointer is undefined behavior.
 
 **Streamout Registers**
   Dedicated registers used by the GS NGG Streamout Instructions. The register

>From bd37699785cfbc81aafc8011c499c64c555de5b4 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 15 Jun 2026 14:26:12 +0200
Subject: [PATCH 6/6] comments

---
 mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 7865a2a10aa0e..ef1292918a1cf 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -683,7 +683,8 @@ struct GPUInitializeNamedBarrierOpLowering final
 
     auto targetTy = LLVM::LLVMTargetExtType::get(
         rewriter.getContext(), "amdgcn.named.barrier", {}, {0});
-    auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 15);
+    auto ptrTy = LLVM::LLVMPointerType::get(
+        rewriter.getContext(), ROCDL::ROCDLDialect::kBarrierAddressSpace);
 
     // Build the global detached so SymbolTable::insert can both place it and
     // rename it as needed without creating a transient name conflict in IR.
@@ -691,7 +692,8 @@ struct GPUInitializeNamedBarrierOpLowering final
     auto globalOp = LLVM::GlobalOp::create(
         detachedBuilder, loc, targetTy, /*isConstant=*/false,
         LLVM::Linkage::Internal, "__named_barrier", /*value=*/Attribute(),
-        /*alignment=*/0, /*addrSpace=*/15);
+        /*alignment=*/0,
+        /*addrSpace=*/ROCDL::ROCDLDialect::kBarrierAddressSpace);
     // Initialize with poison.
     {
       Region &region = globalOp.getInitializerRegion();



More information about the llvm-branch-commits mailing list