[llvm-branch-commits] [clang] [llvm] [RFC][AMDGPU] Add EXECSYNC address space (PR #195613)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 4 01:51:50 PDT 2026
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/195613
>From 204c929601eba171fa2f7c6b3f5e040b63cdceaf Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 22 Apr 2026 11:35:22 +0200
Subject: [PATCH 1/2] [RFC][AMDGPU] Add EXECSYNC address space
Add a new EXECSYNC address space that is used for global variables that are used to represent the barrier IDs in GFX12.5.
This patch only aims to add the new AS, and plumb it through the LLVM machinery. It does not change the pointer layout.
With this patch, EXECSYNC pointers are just local pointers disguised as something else. Depending on how the discussion evolves, we can consider changing the pointer layout subsequently.
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 15 +-
clang/test/CodeGen/target-data.c | 4 +-
clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl | 2 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 16 +--
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 14 +-
llvm/docs/AMDGPUUsage.rst | 40 ++++--
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +-
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 14 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 28 ++--
llvm/lib/Target/AMDGPU/AMDGPU.td | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 34 +++--
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 +-
.../lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 81 +++++------
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 10 --
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4 +-
.../AMDGPU/AMDGPUMachineFunctionInfo.cpp | 35 +++--
.../Target/AMDGPU/AMDGPUMachineFunctionInfo.h | 7 +-
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 12 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40 +++---
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +-
llvm/lib/TargetParser/TargetDataLayout.cpp | 5 +-
.../CodeGen/AMDGPU/addrspacecast-execsync.ll | 132 ++++++++++++++++++
.../amdgpu-lower-exec-sync-and-module-lds.ll | 60 ++++----
.../amdgpu-lower-exec-sync-and-sw-lds.ll | 35 ++---
.../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 60 ++++----
.../AMDGPU/annotate-kernel-features-hsa.ll | 8 +-
...utor-flatscratchinit-undefined-behavior.ll | 4 +-
.../AMDGPU/attributor-flatscratchinit.ll | 20 +--
.../AMDGPU/attributor-noalias-addrspace.ll | 4 +-
.../lds-link-time-codegen-named-barrier.ll | 13 +-
.../AMDGPU/lds-link-time-named-barrier.ll | 14 +-
llvm/test/CodeGen/AMDGPU/nullptr.ll | 2 +-
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 47 +++----
llvm/test/CodeGen/AMDGPU/s-barrier.ll | 50 +++----
llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll | 10 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 2 +-
36 files changed, 514 insertions(+), 336 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/addrspacecast-execsync.ll
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index be7c2f9c89d97..c72e34a9adc8b 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -555,8 +555,21 @@ AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
LangAS AMDGPUTargetCodeGenInfo::adjustGlobalVarAddressSpace(
CodeGenModule &CGM, const VarDecl *D, std::optional<LangAS> AS) const {
- if (AS)
+ if (AS) {
+ // NamedWorkgroupBarrier GVs are declared as __shared__, but the back-end
+ // models them as a separate address space.
+ const LangOptions &LangOpts = CGM.getLangOpts();
+ if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+ AS == LangAS::cuda_shared) {
+ const Type *Ty = D->getType().getCanonicalType().getTypePtr();
+ if (Ty->isArrayType())
+ Ty = Ty->getBaseElementTypeUnsafe();
+ const BuiltinType *BTy = dyn_cast<BuiltinType>(Ty);
+ if (BTy && BTy->getKind() == BuiltinType::AMDGPUNamedWorkgroupBarrier)
+ return getLangASFromTargetAS(llvm::AMDGPUAS::EXECSYNC);
+ }
return *AS;
+ }
LangAS DefaultGlobalAS = getLangASFromTargetAS(
CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 6df20c728f71b..033ec3907ca6f 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..fcee9b3b20813 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
void foo(void) {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 14d7e7a365989..aa0fb93db3cc1 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -94,9 +94,9 @@ void test_s_barrier_signal()
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
void test_s_barrier_signal_var(void *bar, int a)
@@ -148,9 +148,9 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
void test_s_barrier_init(void *bar, int a)
@@ -164,8 +164,8 @@ void test_s_barrier_init(void *bar, int a)
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_barrier_join(void *bar)
@@ -208,8 +208,8 @@ unsigned test_s_get_barrier_state(int a)
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT: ret i32 [[TMP3]]
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 0b4cdd0c2c28f..9435644f8f530 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -1495,8 +1495,8 @@ void test_s_cluster_barrier()
// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_wakeup_barrier(void *bar)
@@ -1514,7 +1514,7 @@ void test_s_wakeup_barrier(void *bar)
// CHECK-NEXT: store float [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]]
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], float [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]]
// CHECK-NEXT: ret float [[TMP2]]
//
float test_global_add_f32(global float *addr, float x) {
@@ -1531,7 +1531,7 @@ float test_global_add_f32(global float *addr, float x) {
// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: ret <2 x half> [[TMP2]]
//
half2 test_global_add_half2(global half2 *addr, half2 x) {
@@ -1548,7 +1548,7 @@ half2 test_global_add_half2(global half2 *addr, half2 x) {
// CHECK-NEXT: store <2 x half> [[X:%.*]], ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[X_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x half> [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: ret <2 x half> [[TMP2]]
//
half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
@@ -1566,7 +1566,7 @@ half2 test_flat_add_2f16(generic half2 *addr, half2 x) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
-// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
// CHECK-NEXT: ret <2 x i16> [[TMP4]]
//
@@ -1585,7 +1585,7 @@ short2 test_flat_add_2bf16(generic short2 *addr, short2 x) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[X_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i16> [[TMP1]] to <2 x bfloat>
-// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP0]], <2 x bfloat> [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]]
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[TMP3]] to <2 x i16>
// CHECK-NEXT: ret <2 x i16> [[TMP4]]
//
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index fbc05c1732d90..b359daf8bcf45 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -927,6 +927,7 @@ supported for the ``amdgcn`` target.
*reserved for downstream use (LLPC)* 12
*reserved for future use* 13
*reserved for future use* 14
+ Execution Synchronization 15 N/A N/A 32 0xFFFFFFFF
Streamout Registers 128 N/A GS_REGS
===================================== =============== =========== ================ ======= ============================
@@ -1139,6 +1140,29 @@ supported for the ``amdgcn`` target.
a buffer strided pointer, this means that the base pointer is ``align(4)``, that
the offset is a multiple of 4 bytes, and that the stride is a multiple of 4.
+**Execution Synchronization (execsync)**
+ This address space represents hardware resources used to synchronize the execution
+ of wavefronts. It does not map directly to any addressable memory, thus pointers
+ into this address space:
+
+ * Never alias with any other pointers outside this address space.
+ * Cannot be dereferenced.
+ * Can only be consumed by intrinsics.
+ * Are always uniform.
+
+ Pointer are 32 bits. Due to these pointers being a compiler abstraction without
+ a corresponding hardware aperture, the back-end handles them as-if they were
+ local pointers; this is why the NULL pointer for the execsync address space is the
+ same as for local memory. The layout of pointer values into the execsync address space
+ is also designed so that no valid execsync address can conflict with valid local addresses.
+
+ The pointer layout for ``s_barrier`` IDs (``Bar#``) is:
+
+ * Offset: ``0x802000u``.
+ * Bits ``[9:11]``: Barrier scope.
+ * Bits ``[4:8]``: Barrier ID.
+ * Bits ``[0:3]`` are zeroes.
+
**Streamout Registers**
Dedicated registers used by the GS NGG Streamout Instructions. The register
file is modelled as a memory in a distinct address space because it is indexed
@@ -1291,10 +1315,8 @@ Named barriers are fixed function hardware barrier objects that are available
in gfx12.5+ in addition to the traditional default barriers.
In LLVM IR, named barriers are represented by global variables of type
-``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier
-global variables do not occupy actual LDS memory, but their lifetime and
-allocation scope matches that of global variables in LDS. Programs in LLVM IR
-refer to named barriers using pointers.
+``target("amdgcn.named.barrier", 0)`` in the Execution Synchronization Resources
+address space. Programs in LLVM IR refer to named barriers using pointers.
The following named barrier types are supported in global variables, defined
recursively:
@@ -1305,14 +1327,14 @@ recursively:
.. code-block:: llvm
- @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef
- @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef
- @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef
+ @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef
+ @foo = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef
+ @baz = addrspace(15) global { target("amdgcn.named.barrier", 0) } undef
...
- %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0)
+ %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(15) @foo, i32 0, i32 %i
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %foo.i, i32 0)
Named barrier types may not be used in ``alloca``.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8631985de9a0a..908d520e0dc69 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -13,6 +13,7 @@
def flat_ptr_ty : LLVMQualPointerType<0>;
def global_ptr_ty : LLVMQualPointerType<1>;
def local_ptr_ty : LLVMQualPointerType<3>;
+def execsync_ptr_ty : LLVMQualPointerType<15>;
// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
// by the backend cause whole-program undefined behavior when violated, such as
@@ -295,7 +296,7 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa
// If %memberCnt is 0, the member count is retained from the previous
// s_barrier_init or s_barrier_signal operation.
def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
- Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [execsync_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType)
@@ -307,19 +308,19 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri
// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
- Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ Intrinsic<[], [execsync_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
- Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [execsync_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
- Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [execsync_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
@@ -342,7 +343,7 @@ def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrie
// uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">,
- Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[llvm_i32_ty], [execsync_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 3fe6492584d84..d2c82a0d09778 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -26,8 +26,7 @@ namespace llvm {
/// memory locations.
namespace AMDGPUAS {
enum : unsigned {
- // The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 9,
+ MAX_AMDGPU_ADDRESS = 15,
FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
@@ -47,6 +46,16 @@ enum : unsigned {
BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
///< pointers with an additional index.
+ RESERVED_0 = 10,
+ RESERVED_1 = 11,
+ RESERVED_2 = 12,
+ RESERVED_3 = 13,
+ RESERVED_4 = 14,
+
+ EXECSYNC = 15, ///< Address space that models non-addressable
+ ///< execution synchronization resources such
+ ///< as barrier IDs.
+
/// Internal address spaces. Can be freely renumbered.
STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
/// end Internal address spaces.
@@ -175,6 +184,7 @@ constexpr int64_t getNullPointerValue(unsigned AS) {
case PRIVATE_ADDRESS:
case LOCAL_ADDRESS:
case REGION_ADDRESS:
+ case EXECSYNC:
return -1;
default:
return 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5c7259b7600a9..bbcb299abc254 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -614,17 +614,23 @@ static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
// clang-format off
static const bool ASAliasRules[][AMDGPUAS::MAX_AMDGPU_ADDRESS + 1] = {
- /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
- /* Flat */ {true, true, false, true, true, true, true, true, true, true},
- /* Global */ {true, true, false, false, true, false, true, true, true, true},
- /* Region */ {false, false, true, false, false, false, false, false, false, false},
- /* Local */ {true, false, false, true, false, false, false, false, false, false},
- /* Constant */ {true, true, false, false, false, false, true, true, true, true},
- /* Private */ {true, false, false, false, false, true, false, false, false, false},
- /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true},
- /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true},
- /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true},
- /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true},
+ /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr Reserved Reserved Reserved Reserved Reserved ExecSync */
+ /* Flat */ {true, true, false, true, true, true, true, true, true, true, false, false, false, false, false, false},
+ /* Global */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Region */ {false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Local */ {true, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Constant */ {true, true, false, false, false, false, true, true, true, true, false, false, false, false, false, false},
+ /* Private */ {true, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false},
+ /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true, false, false, false, false, false, false},
+ /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false},
+ /* ExecSync */ {false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false},
};
// clang-format on
static_assert(std::size(ASAliasRules) == AMDGPUAS::MAX_AMDGPU_ADDRESS + 1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 25fc64d178858..545024857bcb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -18,6 +18,7 @@ def p3 : PtrValueType<i32, 3>;
def p4 : PtrValueType<i64, 4>;
def p5 : PtrValueType<i32, 5>;
def p6 : PtrValueType<i32, 6>;
+def p15 : PtrValueType<i32, 15>;
//===-----------------------------------------------------------------------===//
// AMDGPU Subtarget Feature (device properties)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e05213b2aaf93..152702d8de225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
@@ -1529,24 +1530,37 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
const GlobalValue *GV = G->getGlobal();
if (!MFI->isModuleEntryFunction()) {
- auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
+ auto *IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
+ if (IsNamedBarrier) {
+ std::optional<uint32_t> Address =
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::EXECSYNC);
+ if (!Address)
+ llvm_unreachable("named barrier should have an assigned address");
+ unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
+ MFI->recordNumNamedBarriers(Address.value(), BarCnt);
+ return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
+ }
+
if (std::optional<uint32_t> Address =
- AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
- if (IsNamedBarrier) {
- unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
- MFI->recordNumNamedBarriers(Address.value(), BarCnt);
- }
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::LOCAL_ADDRESS)) {
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
- } else if (IsNamedBarrier) {
- llvm_unreachable("named barrier should have an assigned address");
}
}
+ if (G->getAddressSpace() == AMDGPUAS::EXECSYNC) {
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ assert(GVar && AMDGPU::isNamedBarrier(*GVar) &&
+ "Unsupport use for EXECSYNC address space!");
+ unsigned Offset = MFI->allocateBarrierGlobal(DL, *cast<GlobalVariable>(GV));
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+ }
+
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds" &&
- !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+ GV->getName() != "llvm.amdgcn.module.lds") {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 5fc7e22b5feee..b566f6a440c15 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
@@ -3262,10 +3263,19 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
MachineFunction &MF = B.getMF();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (AS == AMDGPUAS::EXECSYNC) {
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ assert(GVar && AMDGPU::isNamedBarrier(*GVar) &&
+ "Unsupported use of EXECSYNC address space!");
+ B.buildConstant(DstReg,
+ MFI->allocateBarrierGlobal(B.getDataLayout(), *GVar));
+ MI.eraseFromParent();
+ return true;
+ }
+
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds" &&
- !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+ GV->getName() != "llvm.amdgcn.module.lds") {
const Function &Fn = MF.getFunction();
Fn.getContext().diagnose(DiagnosticInfoUnsupported(
Fn, "local memory global used by non-kernel function",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
index 0fcda203d3810..6cf9fb74dd762 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -6,12 +6,10 @@
//
//===----------------------------------------------------------------------===//
//
-// Lower LDS global variables with target extension type "amdgpu.named.barrier"
+// Lower global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. It assigns a unique
-// barrier identifier to each named-barrier LDS variable and encodes
+// barrier identifier to each named-barrier variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
-// This encoding ensures that subsequent LDS lowering passes can process these
-// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
@@ -37,6 +35,10 @@ using namespace AMDGPU;
namespace {
+static bool isNamedBarrierToLower(const GlobalVariable &GV) {
+ return isNamedBarrier(GV) && !GV.isAbsoluteSymbolRef();
+}
+
// If GV is also used directly by other kernels, create a new GV
// used only by this kernel and its function.
static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
@@ -72,8 +74,8 @@ static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half open range, [Address Address+1)
-static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
- uint32_t Address) {
+static void recordAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
LLVMContext &Ctx = M->getContext();
auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
@@ -89,30 +91,29 @@ template <typename T> SmallVector<T> sortByName(SmallVector<T> &&V) {
return {std::move(V)};
}
-// Main utility function for special LDS variables lowering.
static bool lowerExecSyncGlobalVariables(
- Module &M, GVUsesInfoTy &LDSUsesInfo,
- VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ Module &M, GVUsesInfoTy &UsesInfo,
+ VariableFunctionMap &GVsToKernelsThatNeedToAccessItIndirectly) {
bool Changed = false;
const DataLayout &DL = M.getDataLayout();
// The 1st round: give module-absolute assignments
int NumAbsolutes = 0;
SmallVector<GlobalVariable *> OrderedGVs;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ for (auto &K : GVsToKernelsThatNeedToAccessItIndirectly) {
GlobalVariable *GV = K.first;
- if (!isNamedBarrier(*GV))
- continue;
+ assert(isNamedBarrierToLower(*GV));
+
// give a module-absolute assignment if it is indirectly accessed by
// multiple kernels. This is not precise, but we don't want to duplicate
// a function when it is called by multiple kernels.
- if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ if (GVsToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
OrderedGVs.push_back(GV);
} else {
// leave it to the 2nd round, which will give a kernel-relative
// assignment if it is only indirectly accessed by one kernel
- LDSUsesInfo.DirectAccess[*K.second.begin()].insert(GV);
+ UsesInfo.DirectAccess[*K.second.begin()].insert(GV);
}
- LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ GVsToKernelsThatNeedToAccessItIndirectly.erase(GV);
}
OrderedGVs = sortByName(std::move(OrderedGVs));
for (GlobalVariable *GV : OrderedGVs) {
@@ -124,7 +125,7 @@ static bool lowerExecSyncGlobalVariables(
// 4 bits for alignment, 5 bits for the barrier num,
// 3 bits for the barrier scope
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, GV, Offset);
+ recordAbsoluteAddress(&M, GV, Offset);
}
OrderedGVs.clear();
@@ -132,7 +133,7 @@ static bool lowerExecSyncGlobalVariables(
// either only indirectly accessed by single kernel or only directly
// accessed by multiple kernels.
SmallVector<Function *> OrderedKernels;
- for (auto &K : LDSUsesInfo.DirectAccess) {
+ for (auto &K : UsesInfo.DirectAccess) {
Function *F = K.first;
assert(isKernel(*F));
OrderedKernels.push_back(F);
@@ -141,11 +142,10 @@ static bool lowerExecSyncGlobalVariables(
DenseMap<Function *, uint32_t> Kernel2BarId;
for (Function *F : OrderedKernels) {
- for (GlobalVariable *GV : LDSUsesInfo.DirectAccess[F]) {
- if (!isNamedBarrier(*GV))
- continue;
+ for (GlobalVariable *GV : UsesInfo.DirectAccess[F]) {
+ assert(isNamedBarrierToLower(*GV));
- LDSUsesInfo.DirectAccess[F].erase(GV);
+ UsesInfo.DirectAccess[F].erase(GV);
if (GV->isAbsoluteSymbolRef()) {
// already assigned
continue;
@@ -156,7 +156,7 @@ static bool lowerExecSyncGlobalVariables(
for (GlobalVariable *GV : OrderedGVs) {
// GV could also be used directly by other kernels. If so, we need to
// create a new GV used only by this kernel and its function.
- auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ auto *NewGV = uniquifyGVPerKernel(M, GV, F);
Changed |= (NewGV != GV);
unsigned BarrierScope = AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
unsigned BarId = Kernel2BarId[F];
@@ -164,12 +164,13 @@ static bool lowerExecSyncGlobalVariables(
unsigned BarCnt = GV->getGlobalSize(DL) / 16;
Kernel2BarId[F] += BarCnt;
unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
- recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ recordAbsoluteAddress(&M, NewGV, Offset);
}
OrderedGVs.clear();
}
- // Also erase those special LDS variables from indirect_access.
- for (auto &K : LDSUsesInfo.IndirectAccess) {
+ // TODO: is this even necessary?
+ // Also erase those special variables from indirect_access.
+ for (auto &K : UsesInfo.IndirectAccess) {
assert(isKernel(*K.first));
for (GlobalVariable *GV : K.second) {
if (isNamedBarrier(*GV))
@@ -179,18 +180,6 @@ static bool lowerExecSyncGlobalVariables(
return Changed;
}
-static bool hasBarrierToLower(const GVUsesInfoTy &LDSUsesInfo) {
- for (auto &Map : {LDSUsesInfo.DirectAccess, LDSUsesInfo.IndirectAccess}) {
- for (auto &[Fn, GVs] : Map) {
- for (auto &GV : GVs) {
- if (AMDGPU::isNamedBarrier(*GV))
- return true;
- }
- }
- }
- return false;
-}
-
// With object linking, barrier ID assignment is deferred to the linker.
// Externalize named barrier globals and emit self-contained metadata so the
// AsmPrinter can generate the callgraph entries the linker needs.
@@ -238,27 +227,25 @@ static bool runLowerExecSyncGlobals(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = false;
Changed |=
- eliminateGVConstantExprUsesFromAllInstructions(M, isLDSVariableToLower);
+ eliminateGVConstantExprUsesFromAllInstructions(M, isNamedBarrierToLower);
// For each kernel, what variables does it access directly or through
// callees
- GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
+ GVUsesInfoTy BarrierUsesInfo =
+ getTransitiveUsesOfGV(CG, M, isNamedBarrierToLower);
// For each variable accessed through callees, which kernels access it
- VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
- for (auto &K : LDSUsesInfo.IndirectAccess) {
+ VariableFunctionMap BarriersToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : BarrierUsesInfo.IndirectAccess) {
Function *F = K.first;
assert(isKernel(*F));
for (GlobalVariable *GV : K.second) {
- LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ BarriersToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
}
}
- if (hasBarrierToLower(LDSUsesInfo)) {
- // Special LDS variables need special address assignment
- Changed |= lowerExecSyncGlobalVariables(
- M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
- }
+ Changed |= lowerExecSyncGlobalVariables(
+ M, BarrierUsesInfo, BarriersToKernelsThatNeedToAccessItIndirectly);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index ef86d279d193b..12e2478e055b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -937,16 +937,6 @@ class AMDGPULowerModuleLDS {
for (auto &[F, Vars] : FunctionLDSUses)
AllLDSUses[F].insert(Vars.begin(), Vars.end());
- // Named barriers are handled by AMDGPULowerExecSync; filter them out.
- for (auto &[F, Vars] : AllLDSUses) {
- SmallVector<GlobalVariable *> Barriers;
- for (GlobalVariable *V : Vars)
- if (AMDGPU::isNamedBarrier(*V))
- Barriers.push_back(V);
- for (GlobalVariable *V : Barriers)
- Vars.erase(V);
- }
-
// Build reverse map: LDS variable -> functions that use it.
DenseMap<GlobalVariable *, SmallVector<Function *, 4>> VarToFuncs;
for (auto &[F, Vars] : AllLDSUses) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index cdc0b0a371e45..1cd6a13ec4d0e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
@@ -277,7 +278,8 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
// Intercept LDS variables with known addresses
if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) {
if (std::optional<uint32_t> Address =
- AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::LOCAL_ADDRESS)) {
auto *IntTy = Type::getInt32Ty(CV->getContext());
return AsmPrinter::lowerConstant(ConstantInt::get(IntTy, *Address),
BaseCV, Offset);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
index 3e8a75a7eb840..3c917d850a240 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
@@ -15,6 +15,7 @@
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -97,17 +98,8 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- if (AMDGPU::isNamedBarrier(GV)) {
- std::optional<unsigned> BarAddr = getLDSAbsoluteAddress(GV);
- if (!BarAddr)
- llvm_unreachable("named barrier should have an assigned address");
- Entry.first->second = BarAddr.value();
- unsigned BarCnt = GV.getGlobalSize(DL) / 16;
- recordNumNamedBarriers(BarAddr.value(), BarCnt);
- return BarAddr.value();
- }
-
- std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
+ std::optional<uint32_t> MaybeAbs =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
if (MaybeAbs) {
// Absolute address LDS variables that exist prior to the LDS lowering
// pass raise a fatal error in that pass. These failure modes are only
@@ -165,6 +157,19 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
+unsigned
+AMDGPUMachineFunctionInfo::allocateBarrierGlobal(const DataLayout &DL,
+ const GlobalVariable &GV) {
+ assert(AMDGPU::isNamedBarrier(GV));
+ std::optional<unsigned> BarAddr =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::EXECSYNC);
+ if (!BarAddr)
+ llvm_unreachable("named barrier should have an assigned address");
+ unsigned BarCnt = GV.getGlobalSize(DL) / 16;
+ recordNumNamedBarriers(BarAddr.value(), BarCnt);
+ return BarAddr.value();
+}
+
std::optional<uint32_t>
AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
@@ -182,8 +187,9 @@ AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
}
std::optional<uint32_t>
-AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(const GlobalValue &GV) {
- if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(const GlobalValue &GV,
+ unsigned AS) {
+ if (GV.getAddressSpace() != AS)
return {};
std::optional<ConstantRange> AbsSymRange = GV.getAbsoluteSymbolRange();
@@ -221,7 +227,8 @@ void AMDGPUMachineFunctionInfo::setDynLDSAlign(const Function &F,
const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
if (Dyn) {
unsigned Offset = LDSSize; // return this?
- std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+ std::optional<uint32_t> Expect =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
if (!Expect || (Offset != *Expect)) {
report_fatal_error("Inconsistent metadata on dynamic LDS variable");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
index 36db6c2dd0d12..6cd683361a50f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
@@ -109,8 +109,13 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
Align Trailing);
+ unsigned allocateBarrierGlobal(const DataLayout &DL,
+ const GlobalVariable &GV);
+
static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
- static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
+
+ static std::optional<uint32_t> get32BitAbsoluteAddress(const GlobalValue &GV,
+ unsigned AS);
Align getDynLDSAlign() const { return DynLDSAlign; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index 5c2172ff26cef..b8154e0d48505 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-memory-utils"
@@ -59,6 +60,8 @@ static TargetExtType *getTargetExtType(const GlobalVariable &GV) {
}
TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
+ if (GV.getAddressSpace() != AMDGPUAS::EXECSYNC)
+ return nullptr;
if (TargetExtType *Ty = getTargetExtType(GV))
return Ty->getName() == "amdgcn.named.barrier" ? Ty : nullptr;
return nullptr;
@@ -275,15 +278,6 @@ GVUsesInfoTy getTransitiveUsesOfLDSForLowering(const CallGraph &CG, Module &M) {
if (IsDirectMapDynLDSGV)
continue;
- // TODO: Remove once barriers are no longer in the LDS AS.
- if (isNamedBarrier(*GV)) {
- if (IsAbsolute) {
- UsesInfo.DirectAccess[Fn].erase(GV);
- UsesInfo.IndirectAccess[Fn].erase(GV);
- }
- continue;
- }
-
if (HasAbsoluteGVs.has_value()) {
if (*HasAbsoluteGVs != IsAbsolute) {
reportFatalUsageError(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 64d7a9b299a0b..c8a4290442904 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -8353,9 +8354,11 @@ bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
// linker can assign their offsets.
if (AMDGPUTargetMachine::EnableObjectLinking) {
if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
- if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
- "when object linking is enabled");
+ if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ GVar->getAddressSpace() == AMDGPUAS::EXECSYNC) {
+ assert(GVar->isDeclaration() &&
+ "AS 3 & 13 GVs should be declaration here "
+ "when object linking is enabled");
return false;
}
}
@@ -9000,7 +9003,7 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
if (Subtarget->hasApertureRegs()) {
- const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
+ const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::EXECSYNC)
? AMDGPU::SRC_SHARED_BASE
: AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
@@ -9099,10 +9102,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
- // flat -> local/private
+ // flat -> local/private/execsync
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
- DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS || DestAS == AMDGPUAS::EXECSYNC) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -9129,10 +9132,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
}
- // local/private -> flat
+ // local/private/execsync -> flat
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS || SrcAS == AMDGPUAS::EXECSYNC) {
SDValue CvtPtr;
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
@@ -9711,12 +9714,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
EVT PtrVT = Op.getValueType();
const GlobalValue *GV = GSD->getGlobal();
- if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ const unsigned AS = GSD->getAddressSpace();
+ if (((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::EXECSYNC) &&
shouldUseLDSConstAddress(GV)) ||
- GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
- GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- GV->hasExternalLinkage()) {
+ AS == AMDGPUAS::REGION_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
// HIP uses an unsized array `extern __shared__ T s[]` or similar
// zero-sized type in other languages to declare the dynamic shared
@@ -9736,7 +9738,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
}
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (AS == AMDGPUAS::EXECSYNC) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+ SIInstrInfo::MO_ABS32_LO);
+ return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0);
+ }
+
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
SIInstrInfo::MO_ABS32_LO);
return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
@@ -12293,8 +12301,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
BarVal = C->getZExtValue();
else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
- if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(
- *GA->getGlobal()))
+ if (auto Addr = AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GA->getGlobal(), AMDGPUAS::EXECSYNC))
BarVal = *Addr + GA->getOffset();
if (BarVal) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 768d9c1f91470..d33874686a7ab 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -569,7 +569,7 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32DataTypes: RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16]>;
-def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6]>;
+def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6, p15]>;
def Reg32Types : RegisterTypes<!listconcat(Reg32DataTypes.types, Reg32PtrTypes.types)>;
def Reg64DataTypes: RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16]>;
def Reg64PtrTypes: RegisterTypes<[p0, p1, p4]>;
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index ed04ccb53d8e6..4f93a069d20c1 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -274,8 +274,9 @@ static std::string computeAMDDataLayout(const Triple &TT) {
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
- "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
+ "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:"
+ "512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-execsync.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-execsync.ll
new file mode 100644
index 0000000000000..a4a02691d316b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-execsync.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030,GFX1030-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030,GFX1030-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+; TODO: GlobalISel addrspacecast lowering
+
+define amdgpu_kernel void @execsync_to_generic(ptr %out) {
+; GFX942-SDAG-LABEL: execsync_to_generic:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x802010
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: execsync_to_generic:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: execsync_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, 0x802010
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: execsync_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: execsync_to_generic:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0x802010
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: execsync_to_generic:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) @bar to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @generic_to_execsync(ptr %generic, ptr %out) {
+; GFX942-SDAG-LABEL: generic_to_execsync:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: generic_to_execsync:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: generic_to_execsync:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: generic_to_execsync:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: generic_to_execsync:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-SDAG-NEXT: flat_store_b32 v0, v1, s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: generic_to_execsync:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr %generic to ptr addrspace(15)
+ store ptr addrspace(15) %res, ptr %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1030: {{.*}}
+; GFX1250: {{.*}}
+; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
index 215fb06106e11..fb92804b205c9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
@@ -6,29 +6,29 @@
; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
-; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]]
; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
;.
define void @func1() #0 {
; CHECK-LABEL: define void @func1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -36,14 +36,14 @@ define void @func1() #0 {
define void @func2() #0 {
; CHECK-LABEL: define void @func2(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
store i8 7, ptr addrspace(3) @lds1, align 4
ret void
@@ -53,20 +53,20 @@ define amdgpu_kernel void @kernel1() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1.kernel1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1.kernel1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1.kernel1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -78,15 +78,15 @@ define amdgpu_kernel void @kernel2() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
; CHECK-SAME: ) #[[ATTR1]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
store i8 10, ptr addrspace(3) @lds1, align 4
@@ -96,13 +96,13 @@ define amdgpu_kernel void @kernel2() #0 {
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
index 05f2f07c84503..75a81939a4f0b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
@@ -4,24 +4,24 @@
; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario,
; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
;
define void @bar() #0 {
; CHECK-LABEL: define void @bar(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK: store i8 7, ptr addrspace(1) {{.*}}, align 4
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
store i8 7, ptr addrspace(3) @lds1, align 4
ret void
@@ -32,32 +32,21 @@ define amdgpu_kernel void @barkernel() #0 {
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
; CHECK: {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}})
; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK: call void @bar()
; CHECK: store i8 10, ptr addrspace(1) {{.*}}, align 4
; CHECK: call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}})
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @bar()
store i8 10, ptr addrspace(3) @lds1, align 4
ret void
}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
-
attributes #0 = { nounwind sanitize_address }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
index 74e6d83ed2d94..357b4c618ca11 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
@@ -4,38 +4,38 @@
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
-; CHECK: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar1.kernel1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2]]
;.
define void @func1() {
; CHECK-LABEL: define void @func1() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
define void @func2() {
; CHECK-LABEL: define void @func2() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -43,19 +43,19 @@ define void @func2() {
define amdgpu_kernel void @kernel1() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1.kernel1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1.kernel1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1.kernel1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1.kernel1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -65,14 +65,14 @@ define amdgpu_kernel void @kernel1() #0 {
define amdgpu_kernel void @kernel2() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -82,13 +82,13 @@ define amdgpu_kernel void @kernel2() #0 {
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index b86f621b9e0cf..489061d329320 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -489,8 +489,8 @@ attributes #1 = { nounwind }
; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
; HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
;.
-; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10}
-; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10}
-; HSA: [[META2]] = !{i32 2, i32 10}
-; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
+; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 16}
+; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 16}
+; HSA: [[META2]] = !{i32 2, i32 16}
+; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
index ccc434842e498..ccb34288d5845 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
@@ -153,7 +153,7 @@ attributes #0 = { "amdgpu-no-flat-scratch-init" }
; GFX10: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
;.
-; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
-; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 1e8a33781ee8d..bf2e44c641664 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -877,15 +877,15 @@ define amdgpu_kernel void @with_inline_asm() {
; GFX10: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR4]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
;.
-; GFX9: [[META0]] = !{i32 2, i32 10}
-; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 2, i32 16}
+; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
;.
-; GFX10: [[META0]] = !{i32 2, i32 10}
-; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 2, i32 16}
+; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
index d91b2117c7ad9..f699ba70bb935 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
@@ -633,7 +633,7 @@ define amdgpu_kernel void @no_alias_addr_space_has_meta(ptr addrspace(3) %sptr,
!0 = !{i32 2, i32 3, i32 4, i32 10}
;.
-; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 10}
-; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 10}
+; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 16}
+; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 16}
; CHECK: [[META2]] = !{i32 2, i32 3, i32 4, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
index 46d4c8db00f06..2f85330655780 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
@@ -5,7 +5,7 @@
; 2. group_segment_fixed_size = 0 (linker patches it)
; 3. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds)
- at bar = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
; CHECK-LABEL: kernel:
; CHECK: s_lshr_b32 s{{[0-9]+}}, __amdgpu_named_barrier.bar{{[^ @]*}}@abs32 at lo, 4
@@ -16,20 +16,17 @@
; KD: group_segment_fixed_size = 0 (linker will patch).
; CHECK: .amdhsa_group_segment_fixed_size 0
-; LDS symbol declaration
-; CHECK: .amdgpu_lds __amdgpu_named_barrier.bar{{[^ ,]*}}, 32, 4
-
define amdgpu_kernel void @kernel() {
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @helper()
ret void
}
declare void @helper()
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #0
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #0
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #0
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #0
declare void @llvm.amdgcn.s.barrier.wait(i16) #0
attributes #0 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
index e1ad6dcd0b6a3..d0157f8aab9ab 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
@@ -6,21 +6,21 @@
; 2. AMDGPULowerModuleLDS does not handle named barriers at all
; 3. amdgpu.lds.uses does NOT contain barrier entries
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
@lds = internal addrspace(3) global [4 x i32] poison, align 4
; Internal named barrier becomes external with a module-unique hash suffix.
-; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(3) global target("amdgcn.named.barrier", 0)
+; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(15) global target("amdgcn.named.barrier", 0)
; CHECK-NOT: !absolute_symbol
; Regular LDS is packed into the per-function struct (external, for linker).
; CHECK: @__amdgpu_lds.kernel = external dso_local addrspace(3) global %__amdgpu_lds.kernel.t, align 16
define amdgpu_kernel void @kernel(i32 %idx) {
; CHECK-LABEL: define amdgpu_kernel void @kernel(
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @[[BAR]], i32 3)
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @[[BAR]])
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @[[BAR]], i32 3)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @[[BAR]])
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
%gep = getelementptr [4 x i32], ptr addrspace(3) @lds, i32 0, i32 %idx
store i32 42, ptr addrspace(3) %gep, align 4
@@ -29,7 +29,7 @@ define amdgpu_kernel void @kernel(i32 %idx) {
; Named barrier metadata: (barrier_sym, func1, ...) -- emitted by ExecSync.
; CHECK-DAG: !amdgpu.named_barrier.uses = !{[[BAR_MD:![0-9]+]]}
-; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(3) @[[BAR]], ptr @kernel}
+; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(15) @[[BAR]], ptr @kernel}
; LDS metadata must have exactly one entry (the LDS struct), no barrier entries.
; CHECK-DAG: !amdgpu.lds.uses = !{[[LDS_MD:![0-9]+]]}
; CHECK-DAG: [[LDS_MD]] = !{ptr @kernel, ptr addrspace(3) @__amdgpu_lds.kernel}
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 1552014dc24e0..3bc163e4d21e6 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -63,7 +63,7 @@
@nullptr14 = global ptr addrspace(14) addrspacecast (ptr null to ptr addrspace(14))
; CHECK-LABEL: nullptr15:
-; R600-NEXT: .long 0
+; R600-NEXT: .long -1
@nullptr15 = global ptr addrspace(15) addrspacecast (ptr null to ptr addrspace(15))
; CHECK-LABEL: nullptr16:
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 49916ba1ce160..292ae9601e71c 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -3,27 +3,27 @@
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
-; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
-; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
-; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+; CHECK-NEXT: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
+; CHECK-NEXT: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
; SOUT: .set .Lfunc1.num_named_barrier, 7
define void @func1() {
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
; SOUT: .set .Lfunc2.num_named_barrier, 2
define void @func2() {
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -31,11 +31,11 @@ define void @func2() {
; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel1.num_named_barrier, max(6, .Lfunc1.num_named_barrier, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel1() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1.kernel1, i32 11)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -45,26 +45,15 @@ define amdgpu_kernel void @kernel1() #0 {
; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel2.num_named_barrier, max(6, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel2() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
ret void
}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
-
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 248d6a7a5f1d2..5356de057d66b 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -2,9 +2,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
define void @func1() {
; GFX12-SDAG-LABEL: func1:
@@ -33,8 +33,8 @@ define void @func1() {
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -66,13 +66,13 @@ define void @func2() {
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX12-SDAG-LABEL: kernel1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -188,17 +188,17 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
; GFX12-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
- call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %in, i32 9)
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %in)
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @llvm.amdgcn.s.barrier.leave(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
- %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar)
+ %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %in)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -206,7 +206,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
-define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX12-SDAG-LABEL: kernel2:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -250,8 +250,8 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -268,11 +268,11 @@ define void @signal_var_cnt0_const_bar() {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_barrier_signal 2
; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 0)
ret void
}
-define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
+define void @signal_var_cnt0_dynamic_bar(ptr addrspace(15) inreg %bar) {
; GFX12-LABEL: signal_var_cnt0_dynamic_bar:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -285,7 +285,7 @@ define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
; GFX12-NEXT: s_and_b32 m0, s0, 63
; GFX12-NEXT: s_barrier_signal m0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %bar, i32 0)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar, i32 0)
ret void
}
@@ -311,13 +311,13 @@ define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrs
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
index 2ecb380e6c37e..ae339ea51770d 100644
--- a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
@@ -2,9 +2,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-GISEL %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX1250-SDAG-LABEL: kernel1:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -29,13 +29,13 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX1250-GISEL-NEXT: s_and_b32 m0, s0, 63
; GFX1250-GISEL-NEXT: s_wakeup_barrier m0
; GFX1250-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %in)
ret void
}
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index cb52a639d12bd..b65f35394125d 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -59,5 +59,5 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
;.
-; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
>From 4bb101e9d77b08784eb0c4a6c9568183c4eb851c Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 4 May 2026 10:51:23 +0200
Subject: [PATCH 2/2] fix formatting
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c8a4290442904..3b9d2dba16635 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9003,9 +9003,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
if (Subtarget->hasApertureRegs()) {
- const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::EXECSYNC)
- ? AMDGPU::SRC_SHARED_BASE
- : AMDGPU::SRC_PRIVATE_BASE;
+ const unsigned ApertureRegNo =
+ (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::EXECSYNC)
+ ? AMDGPU::SRC_SHARED_BASE
+ : AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!Subtarget->hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
More information about the llvm-branch-commits
mailing list