[llvm-branch-commits] [clang] [llvm] [mlir] [RFC][AMDGPU] Add BARRIER address space (PR #195613)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue May 12 05:07:51 PDT 2026
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/195613
>From 2b96f9c1601d9d8c31b0ea9e65321aa18fd2f0b5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 22 Apr 2026 11:35:22 +0200
Subject: [PATCH 1/2] [RFC][AMDGPU] Add BARRIER address space
Add a new BARRIER address space that is used for global variables that are used to represent the barrier IDs in GFX12.5.
These barrier addresses just have values corresponding 1-1 to barrier IDs. They are still implemented on top of LDS, but the offsetting happens during an addrspacecast to generic, not whenever the barrier GV is used.
The motivation for this is to make the relation between LDS and barrier GVs explicit in the compiler. It does add a bit more complexity, but that complexity was already there, just hidden by pretending barrier GVs were actual LDS.
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 15 +-
clang/test/CodeGen/target-data.c | 4 +-
clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl | 2 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 16 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 14 +-
llvm/docs/AMDGPUUsage.rst | 34 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +-
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 16 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 28 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 24 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 66 ++-
.../lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 88 ++--
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 10 -
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4 +-
.../AMDGPU/AMDGPUMachineFunctionInfo.cpp | 42 +-
.../Target/AMDGPU/AMDGPUMachineFunctionInfo.h | 9 +-
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 12 +-
llvm/lib/Target/AMDGPU/SIDefines.h | 4 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 107 +++--
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +-
llvm/lib/TargetParser/TargetDataLayout.cpp | 5 +-
.../AMDGPU/always_uniform.ll | 6 +-
.../CodeGen/AMDGPU/addrspacecast-barrier.ll | 442 ++++++++++++++++++
.../amdgpu-lower-exec-sync-and-module-lds.ll | 66 +--
.../amdgpu-lower-exec-sync-and-sw-lds.ll | 39 +-
.../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 66 +--
.../AMDGPU/annotate-kernel-features-hsa.ll | 8 +-
...utor-flatscratchinit-undefined-behavior.ll | 4 +-
.../AMDGPU/attributor-flatscratchinit.ll | 20 +-
.../AMDGPU/attributor-noalias-addrspace.ll | 4 +-
.../lds-link-time-codegen-named-barrier.ll | 12 +-
.../AMDGPU/lds-link-time-named-barrier.ll | 14 +-
.../CodeGen/AMDGPU/null-named-barrier-gv.ll | 31 ++
.../s-barrier-lowering-bad-absolute-symbol.ll | 16 +
.../s-barrier-lowering-wrong-gv-signature.ll | 27 ++
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 67 +--
llvm/test/CodeGen/AMDGPU/s-barrier.ll | 85 ++--
llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll | 14 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 2 +-
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 23 +-
mlir/test/Dialect/LLVMIR/rocdl.mlir | 30 +-
mlir/test/Target/LLVMIR/rocdl.mlir | 30 +-
44 files changed, 1091 insertions(+), 473 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index be7c2f9c89d97..069deecac7a4d 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -555,8 +555,21 @@ AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
LangAS AMDGPUTargetCodeGenInfo::adjustGlobalVarAddressSpace(
CodeGenModule &CGM, const VarDecl *D, std::optional<LangAS> AS) const {
- if (AS)
+ if (AS) {
+ // NamedWorkgroupBarrier GVs are declared as __shared__, but the back-end
+ // models them as a separate address space.
+ const LangOptions &LangOpts = CGM.getLangOpts();
+ if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+ AS == LangAS::cuda_shared) {
+ const Type *Ty = D->getType().getCanonicalType().getTypePtr();
+ if (Ty->isArrayType())
+ Ty = Ty->getBaseElementTypeUnsafe();
+ const BuiltinType *BTy = dyn_cast<BuiltinType>(Ty);
+ if (BTy && BTy->getKind() == BuiltinType::AMDGPUNamedWorkgroupBarrier)
+ return getLangASFromTargetAS(llvm::AMDGPUAS::BARRIER);
+ }
return *AS;
+ }
LangAS DefaultGlobalAS = getLangASFromTargetAS(
CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 3912c22cc8814..d65476b85c29d 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..fcee9b3b20813 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
void foo(void) {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 14d7e7a365989..aa0fb93db3cc1 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -94,9 +94,9 @@ void test_s_barrier_signal()
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
void test_s_barrier_signal_var(void *bar, int a)
@@ -148,9 +148,9 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
void test_s_barrier_init(void *bar, int a)
@@ -164,8 +164,8 @@ void test_s_barrier_init(void *bar, int a)
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_barrier_join(void *bar)
@@ -208,8 +208,8 @@ unsigned test_s_get_barrier_state(int a)
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT: ret i32 [[TMP3]]
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 0b4cdd0c2c28f..9435644f8f530 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -1495,8 +1495,8 @@ void test_s_cluster_barrier()
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_wakeup_barrier(void *bar)
@@ -1514,7 +1514,7 @@ void test_s_wakeup_barrier(void *bar)
// CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
// CHECK-NEXT: ret float [[TMP2]]
//
float test_global_add_f32(global float *addr, float x) {
@@ -1531,7 +1531,7 @@ float test_global_add_f32(global float *addr, float x) {
// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: ret <2 x half> [[TMP2]]
//
half2 test_global_add_half2(global half2 *addr, half2 x) {
@@ -1548,7 +1548,7 @@ half2 test_global_add_half2(global half2 *addr, half2 x) {
// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: ret <2 x half> [[TMP2]]
//
half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
@@ -1566,7 +1566,7 @@ half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
-// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
// CHECK-NEXT: ret <2 x i16> [[TMP4]]
//
@@ -1585,7 +1585,7 @@ short2 test_flat_add_2bf16(generic short2 *addr, short2 x) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
-// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
// CHECK-NEXT: ret <2 x i16> [[TMP4]]
//
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 4ac8cc9197515..41bbd7c881350 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -928,6 +928,7 @@ supported for the ``amdgcn`` target.
*reserved for downstream use (LLPC)* 12
*reserved for future use* 13
*reserved for future use* 14
+ Barrier 15 N/A N/A 32 0
*reserved for future use* 16
Streamout Registers 128 N/A GS_REGS
===================================== =============== =========== ================ ======= ============================
@@ -1141,6 +1142,23 @@ supported for the ``amdgcn`` target.
a buffer strided pointer, this means that the base pointer is ``align(4)``, that
the offset is a multiple of 4 bytes, and that the stride is a multiple of 4.
+**Barrier**
+ This address space represents barrier IDs (introduced in GFX12) as addresses.
+ It does not map directly to any addressable memory, thus pointers into this address space:
+
+ * Never alias with any other pointers outside this address space.
+ * Cannot be dereferenced.
+ * Can only be consumed by intrinsics.
+ * Are always uniform.
+
+ Pointer are 32 bits and directly correspond to valid barrier IDs. All barrier pointers must,
+ when interpreted as signed 32 bit integers, have a value corresponding to a valid barrier ID
+ on the target. Otherwise, the behavior is undefined.
+
+ Due to these pointers being a compiler abstraction without a corresponding hardware aperture,
+ the back-end handles them as-if they were local pointers with a very large offset as to not
+ overlap with any addressable local memory.
+
**Streamout Registers**
Dedicated registers used by the GS NGG Streamout Instructions. The register
file is modelled as a memory in a distinct address space because it is indexed
@@ -1293,10 +1311,8 @@ Named barriers are fixed function hardware barrier objects that are available
in gfx12.5+ in addition to the traditional default barriers.
In LLVM IR, named barriers are represented by global variables of type
-``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier
-global variables do not occupy actual LDS memory, but their lifetime and
-allocation scope matches that of global variables in LDS. Programs in LLVM IR
-refer to named barriers using pointers.
+``target("amdgcn.named.barrier", 0)`` in the Execution Synchronization Resources
+address space. Programs in LLVM IR refer to named barriers using pointers.
The following named barrier types are supported in global variables, defined
recursively:
@@ -1307,14 +1323,14 @@ recursively:
.. code-block:: llvm
- @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef
- @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef
- @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef
+ @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef
+ @foo = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef
+ @baz = addrspace(15) global { target("amdgcn.named.barrier", 0) } undef
...
- %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0)
+ %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(15) @foo, i32 0, i32 %i
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %foo.i, i32 0)
Named barrier types may not be used in ``alloca``.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8631985de9a0a..3ecb1a536a8be 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -13,6 +13,7 @@
def flat_ptr_ty : LLVMQualPointerType<0>;
def global_ptr_ty : LLVMQualPointerType<1>;
def local_ptr_ty : LLVMQualPointerType<3>;
+def barrier_ptr_ty : LLVMQualPointerType<15>;
// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
// by the backend cause whole-program undefined behavior when violated, such as
@@ -295,7 +296,7 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa
// If %memberCnt is 0, the member count is retained from the previous
// s_barrier_init or s_barrier_signal operation.
def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
- Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [barrier_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType)
@@ -307,19 +308,19 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri
// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
- Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ Intrinsic<[], [barrier_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
- Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
- Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
@@ -342,7 +343,7 @@ def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrie
// uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">,
- Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[llvm_i32_ty], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 5fe52dc1279bd..810a2aff8ef0c 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -26,8 +26,7 @@ namespace llvm {
/// memory locations.
namespace AMDGPUAS {
enum : unsigned {
- // The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 9,
+ MAX_AMDGPU_ADDRESS = 15,
FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
@@ -47,6 +46,14 @@ enum : unsigned {
BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
///< pointers with an additional index.
+ RESERVED_0 = 10,
+ RESERVED_1 = 11,
+ RESERVED_2 = 12,
+ RESERVED_3 = 13,
+ RESERVED_4 = 14,
+
+ BARRIER = 15, ///< Address space for modeling barrier IDs as addresses.
+
RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
/// Internal address spaces. Can be freely renumbered.
@@ -84,6 +91,11 @@ enum : unsigned {
// Some places use this if the address space can't be determined.
UNKNOWN_ADDRESS_SPACE = ~0u,
};
+
+/// The BARRIER AS is does not have an aperture in HW, so when converting
+/// BARRIER addresses from/to generic, we represent them as LDS addresses
+/// offset by a large amount so they can never alias with real LDS memory.
+static constexpr unsigned BarrierAddrLDSOffset = 0x802000u;
} // end namespace AMDGPUAS
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c6dd1dbb62449..e71cbf3921de1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -622,17 +622,23 @@ static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
// clang-format off
static const bool ASAliasRules[][AMDGPUAS::MAX_AMDGPU_ADDRESS + 1] = {
- /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
- /* Flat */ {true, true, false, true, true, true, true, true, true, true},
- /* Global */ {true, true, false, false, true, false, true, true, true, true},
- /* Region */ {false, false, true, false, false, false, false, false, false, false},
- /* Local */ {true, false, false, true, false, false, false, false, false, false},
- /* Constant */ {true, true, false, false, false, false, true, true, true, true},
- /* Private */ {true, false, false, false, false, true, false, false, false, false},
- /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true},
- /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true},
- /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true},
- /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true},
+ /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr Reserved Reserved Reserved Reserved Reserved ExecSync */
+ /* Flat */ {true, true, false, true, true, true, true, true, true, true, false, false, false, false, false, false},
+ /* Global */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Region */ {false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Local */ {true, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Constant */ {true, true, false, false, false, false, true, true, true, true, false, false, false, false, false, false},
+ /* Private */ {true, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false},
+ /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true, false, false, false, false, false, false},
+ /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Barrier */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
};
// clang-format on
static_assert(std::size(ASAliasRules) == AMDGPUAS::MAX_AMDGPU_ADDRESS + 1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index dd7b4dee76c45..e3652df0afdb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -18,6 +18,7 @@ def p3 : PtrValueType<i32, 3>;
def p4 : PtrValueType<i64, 4>;
def p5 : PtrValueType<i32, 5>;
def p6 : PtrValueType<i32, 6>;
+def p15 : PtrValueType<i32, 15>;
//===-----------------------------------------------------------------------===//
// AMDGPU Subtarget Feature (device properties)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f0918a86be757..983be7c84ac2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
@@ -1534,25 +1535,42 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
+ const auto TrapAndPoison = [&] {
+ SDLoc DL(Op);
+ SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+ SDValue OutputChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Trap, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ return DAG.getPOISON(Op.getValueType());
+ };
+
+ if (G->getAddressSpace() == AMDGPUAS::BARRIER) {
+ const GlobalVariable *GVar = cast<GlobalVariable>(GV);
+
+ if (!AMDGPU::isNamedBarrier(*GVar)) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ Fn, "Unsupported use of BARRIER address space!",
+ SDLoc(Op).getDebugLoc(), DS_Error));
+ return TrapAndPoison();
+ }
+
+ unsigned Offset = MFI->allocateBarrierGlobal(DL, *cast<GlobalVariable>(GV));
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+ }
+
if (!MFI->isModuleEntryFunction()) {
- auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
if (std::optional<uint32_t> Address =
- AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
- if (IsNamedBarrier) {
- unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
- MFI->recordNumNamedBarriers(Address.value(), BarCnt);
- }
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::LOCAL_ADDRESS)) {
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
- } else if (IsNamedBarrier) {
- llvm_unreachable("named barrier should have an assigned address");
}
}
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds" &&
- !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+ GV->getName() != "llvm.amdgcn.module.lds") {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
@@ -1564,11 +1582,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
// functions that use local objects. However, if these dead functions are
// not eliminated, we don't want a compile time error. Just emit a warning
// and a trap, since there should be no callable path here.
- SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
- SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- Trap, DAG.getRoot());
- DAG.setRoot(OutputChain);
- return DAG.getPOISON(Op.getValueType());
+ return TrapAndPoison();
}
// XXX: What does the value of G->getOffset() mean?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c157a88694c39..f21774de24516 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -7161,7 +7161,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
std::optional<int64_t> BarValImm =
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
if (BarValImm) {
- auto BarID = ((*BarValImm) >> 4) & 0x3F;
+ uint32_t BarID = *BarValImm & 0x3F;
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
.addImm(BarID);
I.eraseFromParent();
@@ -7170,16 +7170,10 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
}
}
- // BarID = (BarOp >> 4) & 0x3F
- Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
- .add(BarOp)
- .addImm(4u)
- .setOperandDead(3); // Dead scc
-
+ // BarID = BarOp & 0x3F
Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
- .addReg(TmpReg0)
+ .add(BarOp)
.addImm(0x3F)
.setOperandDead(3); // Dead scc
@@ -7228,16 +7222,10 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
if (!BarValImm) {
- // BarID = (BarOp >> 4) & 0x3F
- Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
- .addReg(BarOp.getReg())
- .addImm(4u)
- .setOperandDead(3); // Dead scc;
-
+ // BarID = BarOp & 0x3F
Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
- .addReg(TmpReg0)
+ .addReg(BarOp.getReg())
.addImm(0x3F)
.setOperandDead(3); // Dead scc;
@@ -7260,7 +7248,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
}
if (BarValImm) {
- auto BarId = ((*BarValImm) >> 4) & 0x3F;
+ uint32_t BarId = *BarValImm & 0x3F;
MIB.addImm(BarId);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 43db1ead84c80..51243cc493b6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
@@ -2352,15 +2353,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
+ bool IsLDS = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER);
+ assert(IsLDS || AS == AMDGPUAS::PRIVATE_ADDRESS);
if (ST.hasApertureRegs()) {
// Note: this register is somewhat broken. When used as a 32-bit operand,
// it only returns zeroes. The real value is in the upper 32 bits.
// Thus, we must emit extract the high 32 bits.
- const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
- ? AMDGPU::SRC_SHARED_BASE
- : AMDGPU::SRC_PRIVATE_BASE;
+ const unsigned ApertureRegNo =
+ IsLDS ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!ST.hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
@@ -2415,7 +2416,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = IsLDS ? 0x40 : 0x44;
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo,
@@ -2483,7 +2484,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
}
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
- (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+ (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::BARRIER ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -2501,7 +2502,17 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
return B.buildIntToPtr(Dst, Sub).getReg(0);
}
- // Extract low 32-bits of the pointer.
+ if (DestAS == AMDGPUAS::BARRIER) {
+ // flat -> barrier: extract the low 32 bits, then sub the barrier AS
+ // offset.
+ Register LoBits = B.buildExtract(S32, Src, 0).getReg(0);
+ Register Sub =
+ B.buildSub(S32, LoBits,
+ B.buildConstant(S32, AMDGPUAS::BarrierAddrLDSOffset))
+ .getReg(0);
+ return B.buildIntToPtr(Dst, Sub).getReg(0);
+ }
+
return B.buildExtract(Dst, Src, 0).getReg(0);
};
@@ -2530,7 +2541,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
}
if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
- (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::BARRIER ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
// Coerce the type of the low half of the result so we can use
@@ -2572,6 +2583,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (!ApertureReg.isValid())
return false;
+ if (SrcAS == AMDGPUAS::BARRIER) {
+ // barrier -> flat: add the barrier AS offset
+ SrcAsInt =
+ B.buildAdd(S32, SrcAsInt,
+ B.buildConstant(S32, AMDGPUAS::BarrierAddrLDSOffset))
+ .getReg(0);
+ }
+
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -3267,10 +3286,32 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
MachineFunction &MF = B.getMF();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const auto TrapAndPoison = [&] {
+ B.buildTrap();
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
+ };
+
+ if (AS == AMDGPUAS::BARRIER) {
+ const GlobalVariable *GVar = cast<GlobalVariable>(GV);
+ if (!AMDGPU::isNamedBarrier(*GVar)) {
+ const Function &Fn = MF.getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoUnsupported(
+ Fn, "Unsupported use of BARRIER address space!", MI.getDebugLoc(),
+ DS_Error));
+ return TrapAndPoison();
+ }
+
+ B.buildConstant(DstReg,
+ MFI->allocateBarrierGlobal(B.getDataLayout(), *GVar));
+ MI.eraseFromParent();
+ return true;
+ }
+
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds" &&
- !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+ GV->getName() != "llvm.amdgcn.module.lds") {
const Function &Fn = MF.getFunction();
Fn.getContext().diagnose(DiagnosticInfoUnsupported(
Fn, "local memory global used by non-kernel function",
@@ -3281,10 +3322,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
// functions that use local objects. However, if these dead functions are
// not eliminated, we don't want a compile time error. Just emit a warning
// and a trap, since there should be no callable path here.
- B.buildTrap();
- B.buildUndef(DstReg);
- MI.eraseFromParent();
- return true;
+ return TrapAndPoison();
}
// TODO: We could emit code to handle the initialization somewhere.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
index 0fcda203d3810..8d04538599a16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -6,12 +6,10 @@
//
//===----------------------------------------------------------------------===//
//
-// Lower LDS global variables with target extension type "amdgpu.named.barrier"
+// Lower global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. It assigns a unique
-// barrier identifier to each named-barrier LDS variable and encodes
+// barrier identifier to each named-barrier variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
-// This encoding ensures that subsequent LDS lowering passes can process these
-// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
@@ -37,6 +35,10 @@ using namespace AMDGPU;
namespace {
+static bool isNamedBarrierToLower(const GlobalVariable &GV) {
+ return isNamedBarrier(GV) && !GV.isAbsoluteSymbolRef();
+}
+
// If GV is also used directly by other kernels, create a new GV
// used only by this kernel and its function.
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
@@ -72,8 +74,8 @@ static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half open range, [Address Address+1)
-static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
- uint32_t Address) {
+static void recordAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
LLVMContext &Ctx = M->getContext();
auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
@@ -89,42 +91,36 @@ template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
return {std::move(V)};
}
-// Main utility function for special LDS variables lowering.
static bool lowerExecSyncGlobalVariables(
- Module &M, GVUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ Module &M, GVUsesInfoTy &UsesInfo,
+ VariableFunctionMap &GVsToKernelsThatNeedToAccessItIndirectly) {
bool Changed = false;
const DataLayout &DL = M.getDataLayout();
// The 1st round: give module-absolute assignments
int NumAbsolutes = 0;
SmallVector<GlobalVariable *> OrderedGVs;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ for (auto &K : GVsToKernelsThatNeedToAccessItIndirectly) {
GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
- continue;
+ assert(isNamedBarrierToLower(*GV));
+
// give a module-absolute assignment if it is indirectly accessed by
// multiple kernels. This is not precise, but we don't want to duplicate
// a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ if (GVsToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
OrderedGVs.push_back(GV);
} else {
// leave it to the 2nd round, which will give a kernel-relative
// assignment if it is only indirectly accessed by one kernel
- LDSUsesInfo.DirectAccess[*K.second.begin()].insert(GV);
+ UsesInfo.DirectAccess[*K.second.begin()].insert(GV);
}
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ GVsToKernelsThatNeedToAccessItIndirectly.erase(GV);
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
- unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = NumAbsolutes + 1;
unsigned BarCnt = GV->getGlobalSize(DL) / 16;
NumAbsolutes += BarCnt;
-
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
+ recordAbsoluteAddress(&M, GV, BarId);
}
OrderedGVs.clear();
@@ -132,7 +128,7 @@ static bool lowerExecSyncGlobalVariables(
// either only indirectly accessed by single kernel or only directly
// accessed by multiple kernels.
SmallVector<Function *> OrderedKernels;
- for (auto &K : LDSUsesInfo.DirectAccess) {
+ for (auto &K : UsesInfo.DirectAccess) {
Function *F = K.first;
assert(isKernel(*F));
OrderedKernels.push_back(F);
@@ -141,11 +137,10 @@ static bool lowerExecSyncGlobalVariables(
DenseMap<Function *, uint32_t> Kernel2BarId;
for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.DirectAccess[F]) {
- if (!isNamedBarrier(*GV))
- continue;
+ for (GlobalVariable *GV : UsesInfo.DirectAccess[F]) {
+ assert(isNamedBarrierToLower(*GV));
- LDSUsesInfo.DirectAccess[F].erase(GV);
+ UsesInfo.DirectAccess[F].erase(GV);
if (GV->isAbsoluteSymbolRef()) {
// already assigned
continue;
@@ -156,20 +151,19 @@ static bool lowerExecSyncGlobalVariables(
for (GlobalVariable *GV : OrderedGVs) {
// GV could also be used directly by other kernels. If so, we need to
// create a new GV used only by this kernel and its function.
- auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ auto *NewGV = uniquifyGVPerKernel(M, GV, F);
Changed |= (NewGV != GV);
- unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = Kernel2BarId[F];
BarId += NumAbsolutes + 1;
unsigned BarCnt = GV->getGlobalSize(DL) / 16;
Kernel2BarId[F] += BarCnt;
- unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ recordAbsoluteAddress(&M, NewGV, BarId);
}
OrderedGVs.clear();
}
- // Also erase those special LDS variables from indirect_access.
- for (auto &K : LDSUsesInfo.IndirectAccess) {
+ // TODO: is this even necessary?
+ // Also erase those special variables from indirect_access.
+ for (auto &K : UsesInfo.IndirectAccess) {
assert(isKernel(*K.first));
for (GlobalVariable *GV : K.second) {
if (isNamedBarrier(*GV))
@@ -179,18 +173,6 @@ static bool lowerExecSyncGlobalVariables(
return Changed;
}
-static bool hasBarrierToLower(const GVUsesInfoTy &LDSUsesInfo) {
- for (auto &Map : {LDSUsesInfo.DirectAccess, LDSUsesInfo.IndirectAccess}) {
- for (auto &[Fn, GVs] : Map) {
- for (auto &GV : GVs) {
- if (AMDGPU::isNamedBarrier(*GV))
- return true;
- }
- }
- }
- return false;
-}
-
// With object linking, barrier ID assignment is deferred to the linker.
// Externalize named barrier globals and emit self-contained metadata so the
// AsmPrinter can generate the callgraph entries the linker needs.
@@ -238,27 +220,25 @@ static bool runLowerExecSyncGlobals(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = false;
Changed |=
- eliminateGVConstantExprUsesFromAllInstructions(M, isLDSVariableToLower);
+ eliminateGVConstantExprUsesFromAllInstructions(M, isNamedBarrierToLower);
// For each kernel, what variables does it access directly or through
// callees
- GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
+ GVUsesInfoTy BarrierUsesInfo =
+ getTransitiveUsesOfGV(CG, M, isNamedBarrierToLower);
// For each variable accessed through callees, which kernels access it
- VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
- for (auto &K : LDSUsesInfo.IndirectAccess) {
+ VariableFunctionMap BarriersToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : BarrierUsesInfo.IndirectAccess) {
Function *F = K.first;
assert(isKernel(*F));
for (GlobalVariable *GV : K.second) {
- LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ BarriersToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
}
}
- if (hasBarrierToLower(LDSUsesInfo)) {
- // Special LDS variables need special address assignment
- Changed |= lowerExecSyncGlobalVariables(
- M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
- }
+ Changed |= lowerExecSyncGlobalVariables(
+ M, BarrierUsesInfo, BarriersToKernelsThatNeedToAccessItIndirectly);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index ef86d279d193b..12e2478e055b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -937,16 +937,6 @@ class AMDGPULowerModuleLDS {
for (auto &[F, Vars] : FunctionLDSUses)
AllLDSUses[F].insert(Vars.begin(), Vars.end());
- // Named barriers are handled by AMDGPULowerExecSync; filter them out.
- for (auto &[F, Vars] : AllLDSUses) {
- SmallVector<GlobalVariable *> Barriers;
- for (GlobalVariable *V : Vars)
- if (AMDGPU::isNamedBarrier(*V))
- Barriers.push_back(V);
- for (GlobalVariable *V : Barriers)
- Vars.erase(V);
- }
-
// Build reverse map: LDS variable -> functions that use it.
DenseMap<GlobalVariable *, SmallVector<Function *, 4>> VarToFuncs;
for (auto &[F, Vars] : AllLDSUses) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2863f263fcf94..0ab3ca0687608 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
@@ -277,7 +278,8 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
// Intercept LDS variables with known addresses
if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) {
if (std::optional<uint32_t> Address =
- AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::LOCAL_ADDRESS)) {
auto *IntTy = Type::getInt32Ty(CV->getContext());
return AsmPrinter::lowerConstant(ConstantInt::get(IntTy, *Address),
BaseCV, Offset);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
index 3e8a75a7eb840..540e0981a95a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
@@ -15,6 +15,7 @@
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -97,17 +98,8 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- if (AMDGPU::isNamedBarrier(GV)) {
- std::optional<unsigned> BarAddr = getLDSAbsoluteAddress(GV);
- if (!BarAddr)
- llvm_unreachable("named barrier should have an assigned address");
- Entry.first->second = BarAddr.value();
- unsigned BarCnt = GV.getGlobalSize(DL) / 16;
- recordNumNamedBarriers(BarAddr.value(), BarCnt);
- return BarAddr.value();
- }
-
- std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
+ std::optional<uint32_t> MaybeAbs =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
if (MaybeAbs) {
// Absolute address LDS variables that exist prior to the LDS lowering
// pass raise a fatal error in that pass. These failure modes are only
@@ -165,6 +157,26 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
+unsigned
+AMDGPUMachineFunctionInfo::allocateBarrierGlobal(const DataLayout &DL,
+ const GlobalVariable &GV) {
+ assert(AMDGPU::isNamedBarrier(GV));
+ std::optional<unsigned> BarAddr =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::BARRIER);
+ if (!BarAddr)
+ llvm_unreachable("named barrier should have an assigned address");
+ if (*BarAddr == 0) {
+ // We cannot allow this because some places in CodeGen (rightfully) assume a
+ // GV address is never null. For example, there are no null checks on
+ // addrspacecast if the pointer is a GV pointer.
+ report_fatal_error(
+ "named barrier GV cannot be used to represent the NULL named barrier");
+ }
+ unsigned BarCnt = GV.getGlobalSize(DL) / 16;
+ recordNumNamedBarriers(BarAddr.value(), BarCnt);
+ return BarAddr.value();
+}
+
std::optional<uint32_t>
AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
@@ -182,8 +194,9 @@ AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
}
std::optional<uint32_t>
-AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(const GlobalValue &GV) {
- if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(const GlobalValue &GV,
+ unsigned AS) {
+ if (GV.getAddressSpace() != AS)
return {};
std::optional<ConstantRange> AbsSymRange = GV.getAbsoluteSymbolRange();
@@ -221,7 +234,8 @@ void AMDGPUMachineFunctionInfo::setDynLDSAlign(const Function &F,
const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
if (Dyn) {
unsigned Offset = LDSSize; // return this?
- std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+ std::optional<uint32_t> Expect =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
if (!Expect || (Offset != *Expect)) {
report_fatal_error("Inconsistent metadata on dynamic LDS variable");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
index 36db6c2dd0d12..c65592bd965ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
@@ -82,7 +82,7 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
NumNamedBarriers =
- std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+ std::max(NumNamedBarriers, (GVAddr & 0x1ff) + BarCnt - 1);
}
uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
@@ -109,8 +109,13 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
Align Trailing);
+ unsigned allocateBarrierGlobal(const DataLayout &DL,
+ const GlobalVariable &GV);
+
static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
- static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
+
+ static std::optional<uint32_t> get32BitAbsoluteAddress(const GlobalValue &GV,
+ unsigned AS);
Align getDynLDSAlign() const { return DynLDSAlign; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index 5c2172ff26cef..33e598d781eca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-memory-utils"
@@ -59,6 +60,8 @@ static TargetExtType *getTargetExtType(const GlobalVariable &GV) {
}
TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
+ if (GV.getAddressSpace() != AMDGPUAS::BARRIER)
+ return nullptr;
if (TargetExtType *Ty = getTargetExtType(GV))
return Ty->getName() == "amdgcn.named.barrier" ? Ty : nullptr;
return nullptr;
@@ -275,15 +278,6 @@ GVUsesInfoTy getTransitiveUsesOfLDSForLowering(const CallGraph &CG, Module &M) {
if (IsDirectMapDynLDSGV)
continue;
- // TODO: Remove once barriers are no longer in the LDS AS.
- if (isNamedBarrier(*GV)) {
- if (IsAbsolute) {
- UsesInfo.DirectAccess[Fn].erase(GV);
- UsesInfo.IndirectAccess[Fn].erase(GV);
- }
- continue;
- }
-
if (HasAbsoluteGVs.has_value()) {
if (*HasAbsoluteGVs != IsAbsolute) {
reportFatalUsageError(
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 9867f5b02d7a7..6b0b360ebbd2e 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1132,10 +1132,6 @@ enum Type {
NAMED_BARRIER_LAST = 16,
};
-enum {
- BARRIER_SCOPE_WORKGROUP = 0,
-};
-
} // namespace Barrier
} // namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8b89366f89c5a..527e4057a94f3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -8377,9 +8378,11 @@ bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
// linker can assign their offsets.
if (AMDGPUTargetMachine::EnableObjectLinking) {
if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
- if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
- "when object linking is enabled");
+ if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ GVar->getAddressSpace() == AMDGPUAS::BARRIER) {
+ assert(GVar->isDeclaration() &&
+ "AS 3 & 13 GVs should be declaration here "
+ "when object linking is enabled");
return false;
}
}
@@ -9096,10 +9099,11 @@ SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
+ const bool IsLDS = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER);
+
if (Subtarget->hasApertureRegs()) {
- const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
- ? AMDGPU::SRC_SHARED_BASE
- : AMDGPU::SRC_PRIVATE_BASE;
+ const unsigned ApertureRegNo =
+ IsLDS ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!Subtarget->hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
@@ -9121,8 +9125,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// implicit kernargs.
const Module *M = DAG.getMachineFunction().getFunction().getParent();
if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
- ImplicitParameter Param =
- (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+ ImplicitParameter Param = IsLDS ? SHARED_BASE : PRIVATE_BASE;
return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
}
@@ -9140,7 +9143,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = IsLDS ? 0x40 : 0x44;
SDValue Ptr =
DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
@@ -9196,10 +9199,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
- // flat -> local/private
+ // flat -> local/private/barrier
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
- DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS || DestAS == AMDGPUAS::BARRIER) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -9212,6 +9215,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
0);
Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+ } else if (DestAS == AMDGPUAS::BARRIER) {
+ // flat -> barrier: sub the barrier AS offset.
+ Ptr = DAG.getNode(
+ ISD::SUB, SL, MVT::i32, Ptr,
+ DAG.getConstant(AMDGPUAS::BarrierAddrLDSOffset, SL, MVT::i32));
}
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
@@ -9226,10 +9234,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
}
- // local/private -> flat
+ // local/private/barrier -> flat
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS || SrcAS == AMDGPUAS::BARRIER) {
SDValue CvtPtr;
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
@@ -9261,7 +9269,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
} else {
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
- CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+ if (SrcAS == AMDGPUAS::BARRIER) {
+ // barrier -> flat: add the barrier AS offset.
+ SDValue SrcOffset = DAG.getNode(
+ ISD::ADD, SL, MVT::i32, Src,
+ DAG.getConstant(AMDGPUAS::BarrierAddrLDSOffset, SL, MVT::i32));
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, SrcOffset,
+ Aperture);
+ } else {
+ CvtPtr =
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ }
+
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
}
@@ -9808,12 +9828,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
EVT PtrVT = Op.getValueType();
const GlobalValue *GV = GSD->getGlobal();
- if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ const unsigned AS = GSD->getAddressSpace();
+ if (((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER) &&
shouldUseLDSConstAddress(GV)) ||
- GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
- GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- GV->hasExternalLinkage()) {
+ AS == AMDGPUAS::REGION_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
// HIP uses an unsized array `extern __shared__ T s[]` or similar
// zero-sized type in other languages to declare the dynamic shared
@@ -9833,7 +9852,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
}
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (AS == AMDGPUAS::BARRIER) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+ SIInstrInfo::MO_ABS32_LO);
+ return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0);
+ }
+
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
SIInstrInfo::MO_ABS32_LO);
return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
@@ -11787,7 +11812,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (isa<ConstantSDNode>(Op->getOperand(2))) {
uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
- BarID = (BarID >> 4) & 0x3F;
+ BarID = BarID & 0x3F;
Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
@@ -11795,13 +11820,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
} else {
Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
- SDValue M0Val;
- M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
- DAG.getShiftAmountConstant(4, MVT::i32, DL));
- M0Val = SDValue(
- DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
- DAG.getTargetConstant(0x3F, DL, MVT::i32)),
- 0);
+ SDValue M0Val =
+ SDValue(DAG.getMachineNode(
+ AMDGPU::S_AND_B32, DL, MVT::i32, Op->getOperand(2),
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
} else
Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
@@ -12390,12 +12413,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
BarVal = C->getZExtValue();
else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
- if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(
- *GA->getGlobal()))
+ if (auto Addr = AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GA->getGlobal(), AMDGPUAS::BARRIER))
BarVal = *Addr + GA->getOffset();
if (BarVal) {
- unsigned BarID = (*BarVal >> 4) & 0x3F;
+ unsigned BarID = *BarVal & 0x3F;
Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
Ops.push_back(Chain);
auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
@@ -12415,12 +12438,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
? AMDGPU::S_BARRIER_INIT_M0
: AMDGPU::S_BARRIER_SIGNAL_M0;
- // extract the BarrierID from bits 4-9 of BarOp
- SDValue BarID;
- BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
- DAG.getShiftAmountConstant(4, MVT::i32, DL));
- BarID =
- SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
+ // extract the BarrierID from bits 0-5 of BarOp
+ SDValue BarID =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarOp,
DAG.getTargetConstant(0x3F, DL, MVT::i32)),
0);
// Member count should be put into M0[ShAmt:+6]
@@ -12465,8 +12485,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
break;
}
- // extract the BarrierID from bits 4-9 of the immediate
- unsigned BarID = (BarVal >> 4) & 0x3F;
+ // extract the BarrierID from bits 0-5 of the immediate
+ unsigned BarID = BarVal & 0x3F;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
Ops.push_back(Chain);
@@ -12481,12 +12501,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
break;
}
- // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
- SDValue M0Val;
- M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
- DAG.getShiftAmountConstant(4, MVT::i32, DL));
- M0Val =
- SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+ // extract the BarrierID from bits 0-5 of BarOp, copy to M0[5:0]
+ SDValue M0Val =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarOp,
DAG.getTargetConstant(0x3F, DL, MVT::i32)),
0);
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 768d9c1f91470..d33874686a7ab 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -569,7 +569,7 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32DataTypes: RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16]>;
-def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6]>;
+def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6, p15]>;
def Reg32Types : RegisterTypes<!listconcat(Reg32DataTypes.types, Reg32PtrTypes.types)>;
def Reg64DataTypes: RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16]>;
def Reg64PtrTypes: RegisterTypes<[p0, p1, p4]>;
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..a846ad17ccf3c 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -274,8 +274,9 @@ static std::string computeAMDDataLayout(const Triple &TT) {
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
- "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
+ "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:"
+ "512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 0fab1395ffc6e..953959ad6a0c6 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -128,10 +128,10 @@ define i32 @s_get_barrier_state(i32 %bar) {
}
; CHECK-LABEL: for function 's_get_named_barrier_state':
-; CHECK: DIVERGENT: ptr addrspace(3) %bar
+; CHECK: DIVERGENT: ptr addrspace(15) %bar
; CHECK-NOT: DIVERGENT
-define i32 @s_get_named_barrier_state(ptr addrspace(3) %bar) {
- %result = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %bar)
+define i32 @s_get_named_barrier_state(ptr addrspace(15) %bar) {
+ %result = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %bar)
ret i32 %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
new file mode 100644
index 0000000000000..34f9a51a0455c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
@@ -0,0 +1,442 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030,GFX1030-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030,GFX1030-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+define amdgpu_kernel void @barrier_to_generic(ptr addrspace(15) %bar, ptr %out) {
+; GFX942-SDAG-LABEL: barrier_to_generic:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_add_i32 s4, s0, 0x802000
+; GFX942-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: barrier_to_generic:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_add_u32 s0, s6, 0x802000
+; GFX942-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX942-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT: s_clause 0x1
+; GFX1030-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: s_add_i32 s4, s0, 0x802000
+; GFX1030-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1030-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT: s_clause 0x1
+; GFX1030-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX1030-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8
+; GFX1030-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT: s_add_u32 s0, s4, 0x802000
+; GFX1030-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_to_generic:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-SDAG-NEXT: s_clause 0x1
+; GFX1200-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: s_add_co_i32 s4, s0, 0x802000
+; GFX1200-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1200-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_to_generic:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_clause 0x1
+; GFX1200-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0
+; GFX1200-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
+; GFX1200-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: s_add_co_u32 s0, s6, 0x802000
+; GFX1200-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: barrier_to_generic:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s0, 0x802000
+; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-SDAG-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: barrier_to_generic:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s6, 0x802000
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) %bar to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @barrier_gv_to_generic(ptr %out) {
+; GFX942-SDAG-LABEL: barrier_gv_to_generic:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x802001
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: barrier_gv_to_generic:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, 0x802001
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, 0x802001 :: v_dual_mov_b32 v1, s1
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1200-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0x802001
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) @bar to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+
+define amdgpu_kernel void @barrier_null_to_generic(ptr %out) {
+; GFX942-LABEL: barrier_null_to_generic:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_null_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_null_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1030-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_null_to_generic:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s0
+; GFX1200-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_null_to_generic:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1200-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-LABEL: barrier_null_to_generic:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) null to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @generic_to_barrier(ptr %generic, ptr %out) {
+; GFX942-SDAG-LABEL: generic_to_barrier:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT: s_add_i32 s2, s0, 0xff7fe000
+; GFX942-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s0, s2, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: generic_to_barrier:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_add_i32 s4, s0, 0xff7fe000
+; GFX942-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-GISEL-NEXT: s_cselect_b32 s0, s4, 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-LABEL: generic_to_barrier:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: s_add_i32 s4, s0, 0xff7fe000
+; GFX1030-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-NEXT: flat_store_dword v[0:1], v2
+; GFX1030-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: generic_to_barrier:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1200-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1200-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1200-SDAG-NEXT: flat_store_b32 v[0:1], v2
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: generic_to_barrier:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1200-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1200-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1200-GISEL-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s0
+; GFX1200-GISEL-NEXT: flat_store_b32 v[0:1], v2
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: generic_to_barrier:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-SDAG-NEXT: flat_store_b32 v0, v1, s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: generic_to_barrier:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-GISEL-NEXT: flat_store_b32 v1, v0, s[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr %generic to ptr addrspace(15)
+ store ptr addrspace(15) %res, ptr %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1200: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
index 8a3fa1288ca82..b75bb69fe5836 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
@@ -6,29 +6,29 @@
; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
-; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]]
; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
;.
define void @func1() #0 {
; CHECK-LABEL: define void @func1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -36,14 +36,14 @@ define void @func1() #0 {
define void @func2() #0 {
; CHECK-LABEL: define void @func2(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
store i8 7, ptr addrspace(3) @lds1, align 4
ret void
@@ -53,20 +53,20 @@ define amdgpu_kernel void @kernel1() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1.kernel1, i32 11)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1.kernel1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -78,15 +78,15 @@ define amdgpu_kernel void @kernel2() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
; CHECK-SAME: ) #[[ATTR1]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
store i8 10, ptr addrspace(3) @lds1, align 4
@@ -96,13 +96,13 @@ define amdgpu_kernel void @kernel2() #0 {
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
@@ -114,8 +114,8 @@ attributes #2 = { nounwind readnone }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
;.
-; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
-; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
-; CHECK: [[META2]] = !{i32 8396848, i32 8396849}
+; CHECK: [[META0]] = !{i32 1, i32 2}
+; CHECK: [[META1]] = !{i32 7, i32 8}
+; CHECK: [[META2]] = !{i32 3, i32 4}
; CHECK: [[META3]] = !{i32 0, i32 1}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
index 16533e0a204d4..905b1174db711 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
@@ -4,24 +4,24 @@
; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario,
; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
;
define void @bar() #0 {
; CHECK-LABEL: define void @bar(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK: store i8 7, ptr addrspace(1) {{.*}}, align 4
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
store i8 7, ptr addrspace(3) @lds1, align 4
ret void
@@ -32,32 +32,21 @@ define amdgpu_kernel void @barkernel() #0 {
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
; CHECK: {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}})
; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK: call void @bar()
; CHECK: store i8 10, ptr addrspace(1) {{.*}}, align 4
; CHECK: call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}})
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @bar()
store i8 10, ptr addrspace(3) @lds1, align 4
ret void
}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
-
attributes #0 = { nounwind sanitize_address }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
@@ -68,6 +57,6 @@ attributes #2 = { nounwind readnone }
; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address }
; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" }
;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 1, i32 2}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
index b76a80811b2d5..b6c2133d81d05 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
@@ -4,38 +4,38 @@
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
-; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
;.
define void @func1() {
; CHECK-LABEL: define void @func1() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
define void @func2() {
; CHECK-LABEL: define void @func2() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -43,19 +43,19 @@ define void @func2() {
define amdgpu_kernel void @kernel1() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1.kernel1, i32 11)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1.kernel1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -65,14 +65,14 @@ define amdgpu_kernel void @kernel1() #0 {
define amdgpu_kernel void @kernel2() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -82,13 +82,13 @@ define amdgpu_kernel void @kernel2() #0 {
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
@@ -97,7 +97,7 @@ attributes #2 = { nounwind readnone }
; CHECK: attributes #[[ATTR0]] = { nounwind }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
;.
-; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
-; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
-; CHECK: [[META2]] = !{i32 8396848, i32 8396849}
+; CHECK: [[META0]] = !{i32 1, i32 2}
+; CHECK: [[META1]] = !{i32 7, i32 8}
+; CHECK: [[META2]] = !{i32 3, i32 4}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index b86f621b9e0cf..489061d329320 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -489,8 +489,8 @@ attributes #1 = { nounwind }
; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
; HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
;.
-; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10}
-; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10}
-; HSA: [[META2]] = !{i32 2, i32 10}
-; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
+; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 16}
+; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 16}
+; HSA: [[META2]] = !{i32 2, i32 16}
+; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
index ccc434842e498..ccb34288d5845 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
@@ -153,7 +153,7 @@ attributes #0 = { "amdgpu-no-flat-scratch-init" }
; GFX10: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
;.
-; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
-; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 1e8a33781ee8d..bf2e44c641664 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -877,15 +877,15 @@ define amdgpu_kernel void @with_inline_asm() {
; GFX10: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR4]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
;.
-; GFX9: [[META0]] = !{i32 2, i32 10}
-; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 2, i32 16}
+; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
;.
-; GFX10: [[META0]] = !{i32 2, i32 10}
-; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 2, i32 16}
+; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
index d91b2117c7ad9..f699ba70bb935 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
@@ -633,7 +633,7 @@ define amdgpu_kernel void @no_alias_addr_space_has_meta(ptr addrspace(3) %sptr,
!0 = !{i32 2, i32 3, i32 4, i32 10}
;.
-; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 10}
-; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 10}
+; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 16}
+; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 16}
; CHECK: [[META2]] = !{i32 2, i32 3, i32 4, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
index f573a3180c067..1974e48a1b7cd 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
@@ -7,10 +7,10 @@
; 3. group_segment_fixed_size = 0 (linker patches it)
; 4. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds)
- at bar = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
; CHECK-LABEL: kernel:
-; CHECK: s_lshr_b32 s{{[0-9]+}}, __amdgpu_named_barrier.bar{{[^ @]*}}@abs32 at lo, 4
+; CHECK: s_barrier_signal m0
; CHECK: s_barrier_join m0
; CHECK: s_barrier_signal m0
; CHECK: s_barrier_wait 1
@@ -39,16 +39,16 @@
; ELF-DAG: R_AMDGPU_ABS64 helper
define amdgpu_kernel void @kernel() {
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @helper()
ret void
}
declare void @helper()
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #0
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #0
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #0
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #0
declare void @llvm.amdgcn.s.barrier.wait(i16) #0
attributes #0 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
index 62b32901d281d..9ec6fc523dc63 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
@@ -6,21 +6,21 @@
; 2. AMDGPULowerModuleLDS does not handle named barriers at all
; 3. amdgpu.lds.uses does NOT contain barrier entries
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
@lds = internal addrspace(3) global [4 x i32] poison, align 4
; Internal named barrier becomes external with a module-unique hash suffix.
-; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(3) global target("amdgcn.named.barrier", 0)
+; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(15) global target("amdgcn.named.barrier", 0)
; CHECK-NOT: !absolute_symbol
; Regular LDS is packed into the per-function struct (external, for linker).
; CHECK: @__amdgpu_lds.kernel = external dso_local addrspace(3) global %__amdgpu_lds.kernel.t, align 16
define amdgpu_kernel void @kernel(i32 %idx) {
; CHECK-LABEL: define amdgpu_kernel void @kernel(
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @[[BAR]])
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @[[BAR]], i32 3)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @[[BAR]])
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @[[BAR]], i32 3)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
%gep = getelementptr [4 x i32], ptr addrspace(3) @lds, i32 0, i32 %idx
store i32 42, ptr addrspace(3) %gep, align 4
@@ -29,7 +29,7 @@ define amdgpu_kernel void @kernel(i32 %idx) {
; Named barrier metadata: (barrier_sym, func1, ...) -- emitted by ExecSync.
; CHECK-DAG: !amdgpu.named_barrier.uses = !{[[BAR_MD:![0-9]+]]}
-; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(3) @[[BAR]], ptr @kernel}
+; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(15) @[[BAR]], ptr @kernel}
; LDS metadata must have exactly one entry (the LDS struct), no barrier entries.
; CHECK-DAG: !amdgpu.lds.uses = !{[[LDS_MD:![0-9]+]]}
; CHECK-DAG: [[LDS_MD]] = !{ptr @kernel, ptr addrspace(3) @__amdgpu_lds.kernel}
diff --git a/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll b/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
new file mode 100644
index 0000000000000..12be207e7e884
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
@@ -0,0 +1,31 @@
+; RUN: split-file %s %t
+
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-kernel.ll 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-kernel.ll 2>&1 | FileCheck %s
+
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-func.ll 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-func.ll 2>&1 | FileCheck %s
+
+; CHECK: named barrier GV cannot be used to represent the NULL named barrier
+
+;--- null-named-barrier-kernel.ll
+
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+
+define amdgpu_kernel void @func1() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ ret void
+}
+
+!0 = !{ i32 0, i32 1 }
+
+;--- null-named-barrier-func.ll
+
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+
+define void @func1() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ ret void
+}
+
+!0 = !{ i32 0, i32 1 }
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
new file mode 100644
index 0000000000000..ae18ab914cc49
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+
+; The absolute_address of the GV can never be null.
+
+; CHECK: LLVM ERROR: named barrier GV cannot be used to represent the NULL named barrie
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+
+define void @func() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ ret void
+}
+
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
new file mode 100644
index 0000000000000..ae5731b875f37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+
+; Check what happens when the type or the AS of the barrier GV is wrong.
+; Using such a GV in a barrier intrinsic would be UB of course, but we should not crash.
+
+; @addrspacecasted doesn't trip up on this because we are not doing any unsupported
+; operation.
+;
+; CHECK: in function wrong_type void (): Unsupported use of BARRIER address space!
+
+ at bar = internal global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global i32 poison
+
+define void @addrspacecasted() {
+ %bar.ascast = addrspacecast ptr @bar to ptr addrspace(15)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %bar.ascast)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar.ascast, i32 7)
+ ret void
+}
+
+define void @wrong_type() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index be1cb6c91f1c3..27ad636dc244e 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -3,72 +3,77 @@
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
-; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
-; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
-; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; Test using the workgroup barrier with the GV.
+ at wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK-NEXT: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK-NEXT: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK-NEXT: @wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META3:![0-9]+]]
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
; SOUT: .set .Lfunc1.num_named_barrier, 7
define void @func1() {
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
; SOUT: .set .Lfunc2.num_named_barrier, 2
define void @func2() {
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
-; SOUT: .amdhsa_named_barrier_count 2
+; SOUT: .amdhsa_kernel kernel1
+; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel1.num_named_barrier, max(6, .Lfunc1.num_named_barrier, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel1() #0 {
; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
ret void
}
-; SOUT: .amdhsa_named_barrier_count 2
+; SOUT: .amdhsa_kernel kernel2
+; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel2.num_named_barrier, max(6, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel2() #0 {
; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
ret void
}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+; SOUT: .amdhsa_kernel wgbarr_as_gv
+; SOUT: .amdhsa_named_barrier_count 0
+define amdgpu_kernel void @wgbarr_as_gv() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @wgbarr, i32 7)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
-; CHECK: !0 = !{i32 8396816, i32 8396817}
-; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
-; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}
+!0 = !{i32 -1, i32 0}
+
+; CHECK: [[META0:![0-9]+]] = !{i32 1, i32 2}
+; CHECK-NEXT: [[META1:![0-9]+]] = !{i32 7, i32 8}
+; CHECK-NEXT: [[META2:![0-9]+]] = !{i32 3, i32 4}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 248d6a7a5f1d2..9415d839eeb57 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -2,9 +2,12 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+; Test using the workgroup barrier with the GV.
+ at wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
define void @func1() {
; GFX12-SDAG-LABEL: func1:
@@ -33,8 +36,8 @@ define void @func1() {
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -66,13 +69,13 @@ define void @func2() {
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX12-SDAG-LABEL: kernel1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -85,9 +88,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
@@ -141,9 +143,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
@@ -188,17 +189,17 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
; GFX12-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
- call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %in, i32 9)
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %in)
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @llvm.amdgcn.s.barrier.leave(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
- %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar)
+ %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %in)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -206,7 +207,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
-define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX12-SDAG-LABEL: kernel2:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -250,8 +251,8 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -268,11 +269,11 @@ define void @signal_var_cnt0_const_bar() {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_barrier_signal 2
; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 0)
ret void
}
-define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
+define void @signal_var_cnt0_dynamic_bar(ptr addrspace(15) inreg %bar) {
; GFX12-LABEL: signal_var_cnt0_dynamic_bar:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -280,12 +281,10 @@ define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s0, s0, 4
-; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_b32 m0, s0, 63
; GFX12-NEXT: s_barrier_signal m0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %bar, i32 0)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar, i32 0)
ret void
}
@@ -308,17 +307,41 @@ define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrs
ret void
}
+
+define amdgpu_kernel void @wgbarr_as_gv() {
+; GFX12-LABEL: wgbarr_as_gv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 m0, 0x7003f
+; GFX12-NEXT: s_barrier_signal m0
+; GFX12-NEXT: s_barrier_wait -1
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @wgbarr, i32 7)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
+
+define amdgpu_kernel void @null_barrier() {
+; GFX12-LABEL: null_barrier:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_barrier_join 0
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) null)
+ ret void
+}
+
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
+
+!0 = !{i32 -1, i32 0}
diff --git a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
index 2ecb380e6c37e..5d81f6b649de9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
@@ -2,9 +2,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-GISEL %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX1250-SDAG-LABEL: kernel1:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -12,8 +12,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX1250-SDAG-NEXT: s_mov_b32 m0, 1
; GFX1250-SDAG-NEXT: s_wakeup_barrier m0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_lshr_b32 s0, s0, 4
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_and_b32 m0, s0, 63
; GFX1250-SDAG-NEXT: s_wakeup_barrier m0
; GFX1250-SDAG-NEXT: s_endpgm
@@ -24,18 +22,16 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c nv
; GFX1250-GISEL-NEXT: s_wakeup_barrier 1
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: s_lshr_b32 s0, s0, 4
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_and_b32 m0, s0, 63
; GFX1250-GISEL-NEXT: s_wakeup_barrier m0
; GFX1250-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %in)
ret void
}
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index cb52a639d12bd..b65f35394125d 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -59,5 +59,5 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
;.
-; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 14e3a359cab3a..ded2b14e0169d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -551,16 +551,17 @@ def ROCDL_WaveBarrierOp : ROCDL_ConcreteNonMemIntrOp<"wave.barrier", [], 0> {
def ROCDLGlobalBuffer : LLVM_PointerInAddressSpace<1>;
def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
+def ROCDLExecSync : LLVM_PointerInAddressSpace<15>;
def ROCDL_BarrierInitOp : ROCDL_IntrOp<"s.barrier.init", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$memberCnt)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
let description = [{
Available on gfx1250+.
Example:
```mlir
// Initialize a named barrier with member count.
- rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+ rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
```
}];
let results = (outs);
@@ -583,7 +584,7 @@ def ROCDL_BarrierSignalOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal", [], 0
}
def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$memberCnt)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
let description = [{
Available on gfx1250+.
@@ -592,7 +593,7 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
Example:
```mlir
// Signal a named barrier with variable ID.
- rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+ rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
```
}];
let results = (outs);
@@ -600,14 +601,14 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
}
def ROCDL_BarrierJoinOp : ROCDL_IntrOp<"s.barrier.join", [], [], [], 0>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
let description = [{
Available on gfx1250+.
Example:
```mlir
// Join a named barrier.
- rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+ rocdl.s.barrier.join %ptr : !llvm.ptr<15>
```
}];
let results = (outs);
@@ -675,14 +676,14 @@ def ROCDL_GetBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.barrier.state",
}
def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barrier.state", [], 1, [], []>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
let description = [{
Available on gfx1250+.
Example:
```mlir
// Query named barrier state by pointer.
- %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+ %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
```
}];
let results = (outs I32:$res);
@@ -690,7 +691,7 @@ def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barri
}
def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0, [], []>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
let description = [{
Wakes up waves associated with a given named barrier. Note, This op does not release waves waiting
at the barrier. It just signal other waves in the same work-group waiting on the indicated named barrier
@@ -700,7 +701,7 @@ def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0
Example:
```mlir
// Wake up waves waiting on a named barrier.
- rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+ rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
```
}];
let assemblyFormat = "$ptr attr-dict `:` qualified(type($ptr))";
@@ -1507,7 +1508,7 @@ class ROCDL_Sudot_IntrOp<string mnemonic> :
the same for `b`. `clamp` controls result clamping.
These ops correspond to RDNA's unified mixed-sign `v_dot4_i32_iu8`
- and `v_dot8_i32_iu4` instructions (gfx11+).
+ and `v_dot8_i32_iu4` instructions (gfx11+).
Example:
```mlir
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index db1cefe86dfc6..05d9c78dcb423 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1163,10 +1163,10 @@ llvm.func @rocdl.s.barrier() {
llvm.return
}
-llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.init
- // CHECK: rocdl.s.barrier.init %{{.*}} member_cnt = 1 : !llvm.ptr<3>
- rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: rocdl.s.barrier.init %{{.*}} member_cnt = 1 : !llvm.ptr<15>
+ rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
@@ -1177,17 +1177,17 @@ llvm.func @rocdl.s.barrier.signal() {
llvm.return
}
-llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.signal.var
- // CHECK: rocdl.s.barrier.signal.var %{{.*}} member_cnt = 1 : !llvm.ptr<3>
- rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: rocdl.s.barrier.signal.var %{{.*}} member_cnt = 1 : !llvm.ptr<15>
+ rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
-llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.join
- // CHECK: rocdl.s.barrier.join %{{.*}} : !llvm.ptr<3>
- rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+ // CHECK: rocdl.s.barrier.join %{{.*}} : !llvm.ptr<15>
+ rocdl.s.barrier.join %ptr : !llvm.ptr<15>
llvm.return
}
@@ -1219,17 +1219,17 @@ llvm.func @rocdl.s.get.barrier.state() {
llvm.return
}
-llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.get.named.barrier.state
- // CHECK: rocdl.s.get.named.barrier.state %{{.*}} : !llvm.ptr<3> -> i32
- %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+ // CHECK: rocdl.s.get.named.barrier.state %{{.*}} : !llvm.ptr<15> -> i32
+ %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
llvm.return
}
-llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.wakeup.barrier
- // CHECK: rocdl.s.wakeup.barrier %{{.*}} : !llvm.ptr<3>
- rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+ // CHECK: rocdl.s.wakeup.barrier %{{.*}} : !llvm.ptr<15>
+ rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 9e1d98b8422da..dd2e98a830993 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -269,10 +269,10 @@ llvm.func @rocdl.wave_barrier() {
llvm.return
}
-llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.init
- // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %{{.*}}, i32 1)
- rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %{{.*}}, i32 1)
+ rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
@@ -283,17 +283,17 @@ llvm.func @rocdl.s.barrier.signal() {
llvm.return
}
-llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.signal.var
- // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %{{.*}}, i32 1)
- rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %{{.*}}, i32 1)
+ rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
-llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.join
- // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %{{.*}})
- rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %{{.*}})
+ rocdl.s.barrier.join %ptr : !llvm.ptr<15>
llvm.return
}
@@ -325,17 +325,17 @@ llvm.func @rocdl.s.get.barrier.state() {
llvm.return
}
-llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.get.named.barrier.state
- // CHECK: %{{.*}} = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %{{.*}})
- %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+ // CHECK: %{{.*}} = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %{{.*}})
+ %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
llvm.return
}
-llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.wakeup.barrier
- // CHECK: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %{{.*}})
- rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %{{.*}})
+ rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
llvm.return
}
>From 3b9a1b879ac6739075c5829c5c6db4cd0d63c59d Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 12 May 2026 14:07:11 +0200
Subject: [PATCH 2/2] Address comments
---
clang/test/CodeGenHIP/amdgpu-barrier-type.hip | 34 ++++++++++++++-----
llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +--
.../AMDGPU/barrier-addrspace-dereference.ll | 16 +++++++++
.../lds-link-time-codegen-named-barrier.ll | 3 --
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 4 +--
5 files changed, 45 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
diff --git a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
index 947ceb56d279e..684ceb85f9185 100644
--- a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
+++ b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
@@ -1,6 +1,6 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
- // REQUIRES: amdgpu-registered-target
- // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
#define __shared__ __attribute__((shared))
@@ -11,8 +11,15 @@ __shared__ struct {
__amdgpu_named_workgroup_barrier_t y;
} str;
-__amdgpu_named_workgroup_barrier_t *getBar();
-void useBar(__amdgpu_named_workgroup_barrier_t *);
+//.
+// CHECK: @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef, align 4
+// CHECK: @arr = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef, align 4
+// CHECK: @str = internal addrspace(3) global %struct.anon undef, align 4
+// CHECK: @__hip_cuid_ = addrspace(1) global i8 0
+// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
+//.
+__attribute__((device)) __amdgpu_named_workgroup_barrier_t *getBar();
+__attribute__((device)) void useBar(__amdgpu_named_workgroup_barrier_t *);
// CHECK-LABEL: define {{[^@]+}}@_Z7testSemPu34__amdgpu_named_workgroup_barrier_t
// CHECK-SAME: (ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -22,15 +29,15 @@ void useBar(__amdgpu_named_workgroup_barrier_t *);
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[TMP0]]) #[[ATTR2:[0-9]+]]
-// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(1) @bar to ptr)) #[[ATTR2]]
-// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(1) @arr to ptr), i64 16)) #[[ATTR2]]
-// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(1) @str to ptr), i64 16)) #[[ATTR2]]
+// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(15) @bar to ptr)) #[[ATTR2]]
+// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(15) @arr to ptr), i64 16)) #[[ATTR2]]
+// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @str to ptr), i64 16)) #[[ATTR2]]
// CHECK-NEXT: [[CALL:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[CALL]]) #[[ATTR2]]
// CHECK-NEXT: [[CALL1:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
// CHECK-NEXT: ret ptr [[CALL1]]
//
-__amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
+__attribute__((device)) __amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
useBar(p);
useBar(&bar);
useBar(&arr[1]);
@@ -38,3 +45,12 @@ __amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *
useBar(getBar());
return getBar();
}
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "uniform-work-group-size" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "uniform-work-group-size" }
+// CHECK: attributes #[[ATTR2]] = { convergent nounwind "uniform-work-group-size" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index e71cbf3921de1..e6eb95247d09b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -622,7 +622,7 @@ static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
// clang-format off
static const bool ASAliasRules[][AMDGPUAS::MAX_AMDGPU_ADDRESS + 1] = {
- /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr Reserved Reserved Reserved Reserved Reserved ExecSync */
+ /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr Reserved Reserved Reserved Reserved Reserved Barrier */
/* Flat */ {true, true, false, true, true, true, true, true, true, true, false, false, false, false, false, false},
/* Global */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
/* Region */ {false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false},
@@ -638,7 +638,7 @@ static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
/* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
/* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
/* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
- /* Barrier */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Barrier */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true},
};
// clang-format on
static_assert(std::size(ASAliasRules) == AMDGPUAS::MAX_AMDGPU_ADDRESS + 1);
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll b/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
new file mode 100644
index 0000000000000..fa8e079e55b7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
@@ -0,0 +1,16 @@
+; Check we cannot dereference a barrier GV.
+
+; RUN: not --crash llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefixes=DAGISEL %s
+; RUN: not llc -O0 -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefixes=GISEL %s
+
+; TODO: It'd be nicer to have a Verifier diagnostic for this.
+
+; DAGISEL: LLVM ERROR: {{.*}} store<(store (s32) into @bar, addrspace 15)>
+; GISEL: LLVM ERROR: {{.*}} G_LOAD %6:sgpr(p15) :: (load (s32) from @bar, addrspace 15) (in function: func1)
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+define amdgpu_kernel void @func1() {
+ %val = load i32, ptr addrspace(15) @bar
+ store i32 %val, ptr addrspace(15) @bar
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
index 1974e48a1b7cd..5f21822874683 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
@@ -10,7 +10,6 @@
@bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
; CHECK-LABEL: kernel:
-; CHECK: s_barrier_signal m0
; CHECK: s_barrier_join m0
; CHECK: s_barrier_signal m0
; CHECK: s_barrier_wait 1
@@ -26,8 +25,6 @@
; CHECK: .amdgpu_call helper
; CHECK: .end_amdgpu_info
-; CHECK: .amdgpu_lds __amdgpu_named_barrier.bar{{[^ ,]*}}, 32, 4
-
; ELF: Section {
; ELF: Name: .amdgpu.info
; ELF: Type: SHT_PROGBITS
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 27ad636dc244e..489ef57dc7b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -36,7 +36,7 @@ define void @func2() {
; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel1.num_named_barrier, max(6, .Lfunc1.num_named_barrier, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel1() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1.kernel1, i32 11)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
@@ -51,7 +51,7 @@ define amdgpu_kernel void @kernel1() #0 {
; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel2.num_named_barrier, max(6, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel2() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
More information about the llvm-branch-commits
mailing list