[llvm-branch-commits] [clang] [lld] [llvm] [mlir] [RFC][AMDGPU] Add BARRIER address space (PR #195613)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jun 1 01:53:48 PDT 2026
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/195613
>From 005e56497833a6d0193dd27eb993b4c11dfe4157 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 22 Apr 2026 11:35:22 +0200
Subject: [PATCH 1/2] [RFC][AMDGPU] Add BARRIER address space
Add a new BARRIER address space that is used for global variables that are used to represent the barrier IDs in GFX12.5.
These barrier addresses just have values corresponding 1-1 to barrier IDs. They are still implemented on top of LDS, but the offsetting happens during an addrspacecast to generic, not whenever the barrier GV is used.
The motivation for this is to make the relation between LDS and barrier GVs explicit in the compiler. It does add a bit more complexity, but that complexity was already there, just hidden by pretending barrier GVs were actual LDS.
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 15 +-
clang/test/CodeGen/target-data.c | 4 +-
clang/test/CodeGenHIP/amdgpu-barrier-type.hip | 34 +-
clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl | 2 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 16 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 4 +-
lld/test/ELF/lto/amdgcn-oses.ll | 6 +-
lld/test/ELF/lto/amdgcn.ll | 2 +-
llvm/docs/AMDGPUUsage.rst | 34 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +-
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 16 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 28 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 44 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 24 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 66 ++-
.../lib/Target/AMDGPU/AMDGPULowerExecSync.cpp | 29 +-
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 10 -
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4 +-
.../AMDGPU/AMDGPUMachineFunctionInfo.cpp | 42 +-
.../Target/AMDGPU/AMDGPUMachineFunctionInfo.h | 9 +-
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 12 +-
llvm/lib/Target/AMDGPU/SIDefines.h | 4 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 107 +++--
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +-
llvm/lib/TargetParser/TargetDataLayout.cpp | 5 +-
.../AMDGPU/always_uniform.ll | 6 +-
.../CodeGen/AMDGPU/addrspacecast-barrier.ll | 442 ++++++++++++++++++
.../amdgpu-lower-exec-sync-and-module-lds.ll | 64 +--
.../amdgpu-lower-exec-sync-and-sw-lds.ll | 39 +-
.../CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll | 64 +--
.../AMDGPU/annotate-kernel-features-hsa.ll | 8 +-
...utor-flatscratchinit-undefined-behavior.ll | 4 +-
.../AMDGPU/attributor-flatscratchinit.ll | 20 +-
.../AMDGPU/attributor-noalias-addrspace.ll | 4 +-
.../AMDGPU/barrier-addrspace-dereference.ll | 16 +
.../lds-link-time-codegen-named-barrier.ll | 13 +-
.../AMDGPU/lds-link-time-named-barrier.ll | 14 +-
.../CodeGen/AMDGPU/null-named-barrier-gv.ll | 31 ++
.../s-barrier-lowering-bad-absolute-symbol.ll | 16 +
.../s-barrier-lowering-wrong-gv-signature.ll | 27 ++
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 66 +--
llvm/test/CodeGen/AMDGPU/s-barrier.ll | 85 ++--
llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll | 14 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 2 +-
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 23 +-
mlir/test/Dialect/LLVMIR/rocdl.mlir | 30 +-
mlir/test/Target/LLVMIR/rocdl.mlir | 30 +-
48 files changed, 1108 insertions(+), 441 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 5a65d23ec4938..4f5f846a603c9 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -478,8 +478,21 @@ AMDGPUTargetCodeGenInfo::getSRetAddrSpace(const CXXRecordDecl *RD) const {
LangAS AMDGPUTargetCodeGenInfo::adjustGlobalVarAddressSpace(
CodeGenModule &CGM, const VarDecl *D, std::optional<LangAS> AS) const {
- if (AS)
+ if (AS) {
+ // NamedWorkgroupBarrier GVs are declared as __shared__, but the back-end
+ // models them as a separate address space.
+ const LangOptions &LangOpts = CGM.getLangOpts();
+ if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+ AS == LangAS::cuda_shared) {
+ const Type *Ty = D->getType().getCanonicalType().getTypePtr();
+ if (Ty->isArrayType())
+ Ty = Ty->getBaseElementTypeUnsafe();
+ const BuiltinType *BTy = dyn_cast<BuiltinType>(Ty);
+ if (BTy && BTy->getKind() == BuiltinType::AMDGPUNamedWorkgroupBarrier)
+ return getLangASFromTargetAS(llvm::AMDGPUAS::BARRIER);
+ }
return *AS;
+ }
LangAS DefaultGlobalAS = getLangASFromTargetAS(
CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 0047e1377a7f8..f9592f99ffc2d 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
index 947ceb56d279e..684ceb85f9185 100644
--- a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
+++ b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
@@ -1,6 +1,6 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
- // REQUIRES: amdgpu-registered-target
- // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
#define __shared__ __attribute__((shared))
@@ -11,8 +11,15 @@ __shared__ struct {
__amdgpu_named_workgroup_barrier_t y;
} str;
-__amdgpu_named_workgroup_barrier_t *getBar();
-void useBar(__amdgpu_named_workgroup_barrier_t *);
+//.
+// CHECK: @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef, align 4
+// CHECK: @arr = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef, align 4
+// CHECK: @str = internal addrspace(3) global %struct.anon undef, align 4
+// CHECK: @__hip_cuid_ = addrspace(1) global i8 0
+// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
+//.
+__attribute__((device)) __amdgpu_named_workgroup_barrier_t *getBar();
+__attribute__((device)) void useBar(__amdgpu_named_workgroup_barrier_t *);
// CHECK-LABEL: define {{[^@]+}}@_Z7testSemPu34__amdgpu_named_workgroup_barrier_t
// CHECK-SAME: (ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -22,15 +29,15 @@ void useBar(__amdgpu_named_workgroup_barrier_t *);
// CHECK-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[TMP0]]) #[[ATTR2:[0-9]+]]
-// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(1) @bar to ptr)) #[[ATTR2]]
-// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(1) @arr to ptr), i64 16)) #[[ATTR2]]
-// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(1) @str to ptr), i64 16)) #[[ATTR2]]
+// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(15) @bar to ptr)) #[[ATTR2]]
+// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(15) @arr to ptr), i64 16)) #[[ATTR2]]
+// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @str to ptr), i64 16)) #[[ATTR2]]
// CHECK-NEXT: [[CALL:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
// CHECK-NEXT: call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[CALL]]) #[[ATTR2]]
// CHECK-NEXT: [[CALL1:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
// CHECK-NEXT: ret ptr [[CALL1]]
//
-__amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
+__attribute__((device)) __amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
useBar(p);
useBar(&bar);
useBar(&arr[1]);
@@ -38,3 +45,12 @@ __amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *
useBar(getBar());
return getBar();
}
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "uniform-work-group-size" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "uniform-work-group-size" }
+// CHECK: attributes #[[ATTR2]] = { convergent nounwind "uniform-work-group-size" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..fcee9b3b20813 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
void foo(void) {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 332a2fa94ee92..8ea4d4e2b32e2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -83,9 +83,9 @@ void test_s_barrier_signal()
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
void test_s_barrier_signal_var(void *bar, int a)
@@ -132,9 +132,9 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
void test_s_barrier_init(void *bar, int a)
@@ -147,8 +147,8 @@ void test_s_barrier_init(void *bar, int a)
// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_barrier_join(void *bar)
@@ -189,8 +189,8 @@ unsigned test_s_get_barrier_state(int a)
// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT: ret i32 [[TMP3]]
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 8b09216057167..9368c2971a643 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -1362,8 +1362,8 @@ void test_s_cluster_barrier()
// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: store ptr [[BAR:%.*]], ptr addrspace(5) [[BAR_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[BAR_ADDR]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
-// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(15)
+// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) [[TMP1]])
// CHECK-NEXT: ret void
//
void test_s_wakeup_barrier(void *bar)
diff --git a/lld/test/ELF/lto/amdgcn-oses.ll b/lld/test/ELF/lto/amdgcn-oses.ll
index b3caf0f0de3b9..7c2101266a85c 100644
--- a/lld/test/ELF/lto/amdgcn-oses.ll
+++ b/lld/test/ELF/lto/amdgcn-oses.ll
@@ -25,7 +25,7 @@
;--- amdhsa.ll
target triple = "amdgcn-amd-amdhsa"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
@@ -36,7 +36,7 @@ define void @_start() {
;--- amdpal.ll
target triple = "amdgcn-amd-amdpal"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
define amdgpu_cs void @_start() {
ret void
@@ -44,7 +44,7 @@ define amdgpu_cs void @_start() {
;--- mesa3d.ll
target triple = "amdgcn-amd-mesa3d"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
define void @_start() {
ret void
diff --git a/lld/test/ELF/lto/amdgcn.ll b/lld/test/ELF/lto/amdgcn.ll
index 186185c44a2c2..1dc2d86c48364 100644
--- a/lld/test/ELF/lto/amdgcn.ll
+++ b/lld/test/ELF/lto/amdgcn.ll
@@ -5,7 +5,7 @@
; Make sure the amdgcn triple is handled
target triple = "amdgcn-amd-amdhsa"
-target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p15:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
define void @_start() {
ret void
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8dd743a995a9d..ef18c5aa7f34a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -930,6 +930,7 @@ supported for the ``amdgcn`` target.
*reserved for downstream use (LLPC)* 12
*reserved for future use* 13
*reserved for future use* 14
+ Barrier 15 N/A N/A 32 0
*reserved for future use* 16
Streamout Registers 128 N/A GS_REGS
===================================== =============== =========== ================ ======= ============================
@@ -1143,6 +1144,23 @@ supported for the ``amdgcn`` target.
a buffer strided pointer, this means that the base pointer is ``align(4)``, that
the offset is a multiple of 4 bytes, and that the stride is a multiple of 4.
+**Barrier**
+ This address space represents barrier IDs (introduced in GFX12) as addresses.
+ It does not map directly to any addressable memory, thus pointers into this address space:
+
+ * Never alias with any other pointers outside this address space.
+ * Cannot be dereferenced.
+ * Can only be consumed by intrinsics.
+ * Are always uniform.
+
+ Pointer are 32 bits and directly correspond to valid barrier IDs. All barrier pointers must,
+ when interpreted as signed 32 bit integers, have a value corresponding to a valid barrier ID
+ on the target. Otherwise, the behavior is undefined.
+
+ Due to these pointers being a compiler abstraction without a corresponding hardware aperture,
+ the back-end handles them as-if they were local pointers with a very large offset as to not
+ overlap with any addressable local memory.
+
**Streamout Registers**
Dedicated registers used by the GS NGG Streamout Instructions. The register
file is modelled as a memory in a distinct address space because it is indexed
@@ -1304,10 +1322,8 @@ Named barriers are fixed function hardware barrier objects that are available
in gfx12.5+ in addition to the traditional default barriers.
In LLVM IR, named barriers are represented by global variables of type
-``target("amdgcn.named.barrier", 0)`` in the LDS address space. Named barrier
-global variables do not occupy actual LDS memory, but their lifetime and
-allocation scope matches that of global variables in LDS. Programs in LLVM IR
-refer to named barriers using pointers.
+``target("amdgcn.named.barrier", 0)`` in the Execution Synchronization Resources
+address space. Programs in LLVM IR refer to named barriers using pointers.
The following named barrier types are supported in global variables, defined
recursively:
@@ -1318,14 +1334,14 @@ recursively:
.. code-block:: llvm
- @bar = addrspace(3) global target("amdgcn.named.barrier", 0) undef
- @foo = addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] undef
- @baz = addrspace(3) global { target("amdgcn.named.barrier", 0) } undef
+ @bar = addrspace(15) global target("amdgcn.named.barrier", 0) undef
+ @foo = addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] undef
+ @baz = addrspace(15) global { target("amdgcn.named.barrier", 0) } undef
...
- %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(3) @foo, i32 0, i32 %i
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %foo.i, i32 0)
+ %foo.i = getelementptr [2 x target("amdgcn.named.barrier", 0)], ptr addrspace(15) @foo, i32 0, i32 %i
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %foo.i, i32 0)
Named barrier types may not be used in ``alloca``.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2fd5285dfc330..62107d541e173 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -13,6 +13,7 @@
def flat_ptr_ty : LLVMQualPointerType<0>;
def global_ptr_ty : LLVMQualPointerType<1>;
def local_ptr_ty : LLVMQualPointerType<3>;
+def barrier_ptr_ty : LLVMQualPointerType<15>;
// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
// by the backend cause whole-program undefined behavior when violated, such as
@@ -295,7 +296,7 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa
// If %memberCnt is 0, the member count is retained from the previous
// s_barrier_init or s_barrier_signal operation.
def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
- Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [barrier_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType)
@@ -307,19 +308,19 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri
// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
- Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ Intrinsic<[], [barrier_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
- Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
- Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
@@ -342,7 +343,7 @@ def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrie
// uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier)
// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">,
- Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[llvm_i32_ty], [barrier_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 5fe52dc1279bd..810a2aff8ef0c 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -26,8 +26,7 @@ namespace llvm {
/// memory locations.
namespace AMDGPUAS {
enum : unsigned {
- // The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 9,
+ MAX_AMDGPU_ADDRESS = 15,
FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
@@ -47,6 +46,14 @@ enum : unsigned {
BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
///< pointers with an additional index.
+ RESERVED_0 = 10,
+ RESERVED_1 = 11,
+ RESERVED_2 = 12,
+ RESERVED_3 = 13,
+ RESERVED_4 = 14,
+
+ BARRIER = 15, ///< Address space for modeling barrier IDs as addresses.
+
RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
/// Internal address spaces. Can be freely renumbered.
@@ -84,6 +91,11 @@ enum : unsigned {
// Some places use this if the address space can't be determined.
UNKNOWN_ADDRESS_SPACE = ~0u,
};
+
+/// The BARRIER AS is does not have an aperture in HW, so when converting
+/// BARRIER addresses from/to generic, we represent them as LDS addresses
+/// offset by a large amount so they can never alias with real LDS memory.
+static constexpr unsigned BarrierAddrLDSOffset = 0x802000u;
} // end namespace AMDGPUAS
namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c6dd1dbb62449..e6eb95247d09b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -622,17 +622,23 @@ static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
// clang-format off
static const bool ASAliasRules[][AMDGPUAS::MAX_AMDGPU_ADDRESS + 1] = {
- /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
- /* Flat */ {true, true, false, true, true, true, true, true, true, true},
- /* Global */ {true, true, false, false, true, false, true, true, true, true},
- /* Region */ {false, false, true, false, false, false, false, false, false, false},
- /* Local */ {true, false, false, true, false, false, false, false, false, false},
- /* Constant */ {true, true, false, false, false, false, true, true, true, true},
- /* Private */ {true, false, false, false, false, true, false, false, false, false},
- /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true},
- /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true},
- /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true},
- /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true},
+ /* Flat Global Region Local Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr Reserved Reserved Reserved Reserved Reserved Barrier */
+ /* Flat */ {true, true, false, true, true, true, true, true, true, true, false, false, false, false, false, false},
+ /* Global */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Region */ {false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Local */ {true, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Constant */ {true, true, false, false, false, false, true, true, true, true, false, false, false, false, false, false},
+ /* Private */ {true, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false},
+ /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true, false, false, false, false, false, false},
+ /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Reserved */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false},
+ /* Barrier */ {false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true},
};
// clang-format on
static_assert(std::size(ASAliasRules) == AMDGPUAS::MAX_AMDGPU_ADDRESS + 1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2cf998b218412..7186bc1d76f40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -18,6 +18,7 @@ def p3 : PtrValueType<i32, 3>;
def p4 : PtrValueType<i64, 4>;
def p5 : PtrValueType<i32, 5>;
def p6 : PtrValueType<i32, 6>;
+def p15 : PtrValueType<i32, 15>;
//===-----------------------------------------------------------------------===//
// AMDGPU Subtarget Feature (device properties)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f0918a86be757..983be7c84ac2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
@@ -1534,25 +1535,42 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
+ const auto TrapAndPoison = [&] {
+ SDLoc DL(Op);
+ SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+ SDValue OutputChain =
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Trap, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ return DAG.getPOISON(Op.getValueType());
+ };
+
+ if (G->getAddressSpace() == AMDGPUAS::BARRIER) {
+ const GlobalVariable *GVar = cast<GlobalVariable>(GV);
+
+ if (!AMDGPU::isNamedBarrier(*GVar)) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ Fn, "Unsupported use of BARRIER address space!",
+ SDLoc(Op).getDebugLoc(), DS_Error));
+ return TrapAndPoison();
+ }
+
+ unsigned Offset = MFI->allocateBarrierGlobal(DL, *cast<GlobalVariable>(GV));
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+ }
+
if (!MFI->isModuleEntryFunction()) {
- auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
if (std::optional<uint32_t> Address =
- AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
- if (IsNamedBarrier) {
- unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
- MFI->recordNumNamedBarriers(Address.value(), BarCnt);
- }
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::LOCAL_ADDRESS)) {
return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
- } else if (IsNamedBarrier) {
- llvm_unreachable("named barrier should have an assigned address");
}
}
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds" &&
- !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+ GV->getName() != "llvm.amdgcn.module.lds") {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
@@ -1564,11 +1582,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
// functions that use local objects. However, if these dead functions are
// not eliminated, we don't want a compile time error. Just emit a warning
// and a trap, since there should be no callable path here.
- SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
- SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- Trap, DAG.getRoot());
- DAG.setRoot(OutputChain);
- return DAG.getPOISON(Op.getValueType());
+ return TrapAndPoison();
}
// XXX: What does the value of G->getOffset() mean?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 463b8c40350b2..9bf098bdc8423 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -7234,7 +7234,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
std::optional<int64_t> BarValImm =
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
if (BarValImm) {
- auto BarID = ((*BarValImm) >> 4) & 0x3F;
+ uint32_t BarID = *BarValImm & 0x3F;
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
.addImm(BarID);
I.eraseFromParent();
@@ -7243,16 +7243,10 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
}
}
- // BarID = (BarOp >> 4) & 0x3F
- Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
- .add(BarOp)
- .addImm(4u)
- .setOperandDead(3); // Dead scc
-
+ // BarID = BarOp & 0x3F
Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
- .addReg(TmpReg0)
+ .add(BarOp)
.addImm(0x3F)
.setOperandDead(3); // Dead scc
@@ -7301,16 +7295,10 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
if (!BarValImm) {
- // BarID = (BarOp >> 4) & 0x3F
- Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
- .addReg(BarOp.getReg())
- .addImm(4u)
- .setOperandDead(3); // Dead scc;
-
+ // BarID = BarOp & 0x3F
Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
- .addReg(TmpReg0)
+ .addReg(BarOp.getReg())
.addImm(0x3F)
.setOperandDead(3); // Dead scc;
@@ -7333,7 +7321,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
}
if (BarValImm) {
- auto BarId = ((*BarValImm) >> 4) & 0x3F;
+ uint32_t BarId = *BarValImm & 0x3F;
MIB.addImm(BarId);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8b5076cdd7712..7d791ad881f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
@@ -2387,15 +2388,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
+ bool IsLDS = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER);
+ assert(IsLDS || AS == AMDGPUAS::PRIVATE_ADDRESS);
if (ST.hasApertureRegs()) {
// Note: this register is somewhat broken. When used as a 32-bit operand,
// it only returns zeroes. The real value is in the upper 32 bits.
// Thus, we must emit extract the high 32 bits.
- const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
- ? AMDGPU::SRC_SHARED_BASE
- : AMDGPU::SRC_PRIVATE_BASE;
+ const unsigned ApertureRegNo =
+ IsLDS ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!ST.hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
@@ -2450,7 +2451,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = IsLDS ? 0x40 : 0x44;
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo,
@@ -2518,7 +2519,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
}
if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
- (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+ (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::BARRIER ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -2536,7 +2537,17 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
return B.buildIntToPtr(Dst, Sub).getReg(0);
}
- // Extract low 32-bits of the pointer.
+ if (DestAS == AMDGPUAS::BARRIER) {
+ // flat -> barrier: extract the low 32 bits, then sub the barrier AS
+ // offset.
+ Register LoBits = B.buildExtract(S32, Src, 0).getReg(0);
+ Register Sub =
+ B.buildSub(S32, LoBits,
+ B.buildConstant(S32, AMDGPUAS::BarrierAddrLDSOffset))
+ .getReg(0);
+ return B.buildIntToPtr(Dst, Sub).getReg(0);
+ }
+
return B.buildExtract(Dst, Src, 0).getReg(0);
};
@@ -2565,7 +2576,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
}
if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
- (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::BARRIER ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
// Coerce the type of the low half of the result so we can use
@@ -2607,6 +2618,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (!ApertureReg.isValid())
return false;
+ if (SrcAS == AMDGPUAS::BARRIER) {
+ // barrier -> flat: add the barrier AS offset
+ SrcAsInt =
+ B.buildAdd(S32, SrcAsInt,
+ B.buildConstant(S32, AMDGPUAS::BarrierAddrLDSOffset))
+ .getReg(0);
+ }
+
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
@@ -3302,10 +3321,32 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
MachineFunction &MF = B.getMF();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const auto TrapAndPoison = [&] {
+ B.buildTrap();
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
+ };
+
+ if (AS == AMDGPUAS::BARRIER) {
+ const GlobalVariable *GVar = cast<GlobalVariable>(GV);
+ if (!AMDGPU::isNamedBarrier(*GVar)) {
+ const Function &Fn = MF.getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoUnsupported(
+ Fn, "Unsupported use of BARRIER address space!", MI.getDebugLoc(),
+ DS_Error));
+ return TrapAndPoison();
+ }
+
+ B.buildConstant(DstReg,
+ MFI->allocateBarrierGlobal(B.getDataLayout(), *GVar));
+ MI.eraseFromParent();
+ return true;
+ }
+
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds" &&
- !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
+ GV->getName() != "llvm.amdgcn.module.lds") {
const Function &Fn = MF.getFunction();
Fn.getContext().diagnose(DiagnosticInfoUnsupported(
Fn, "local memory global used by non-kernel function",
@@ -3316,10 +3357,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
// functions that use local objects. However, if these dead functions are
// not eliminated, we don't want a compile time error. Just emit a warning
// and a trap, since there should be no callable path here.
- B.buildTrap();
- B.buildUndef(DstReg);
- MI.eraseFromParent();
- return true;
+ return TrapAndPoison();
}
// TODO: We could emit code to handle the initialization somewhere.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
index 707c3ec975d73..95ba2586efb09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerExecSync.cpp
@@ -6,12 +6,10 @@
//
//===----------------------------------------------------------------------===//
//
-// Lower LDS global variables with target extension type "amdgpu.named.barrier"
+// Lower global variables with target extension type "amdgpu.named.barrier"
// that require specialized address assignment. It assigns a unique
-// barrier identifier to each named-barrier LDS variable and encodes
+// barrier identifier to each named-barrier variable and encodes
// this identifier within the !absolute_symbol metadata of that global.
-// This encoding ensures that subsequent LDS lowering passes can process these
-// barriers correctly without conflicts.
//
//===----------------------------------------------------------------------===//
@@ -37,10 +35,14 @@ using namespace AMDGPU;
namespace {
+static bool isNamedBarrierToLower(const GlobalVariable &GV) {
+ return isNamedBarrier(GV) && !GV.isAbsoluteSymbolRef();
+}
+
// Write the specified address into metadata where it can be retrieved by
// the assembler. Format is a half open range, [Address Address+1)
-static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
- uint32_t Address) {
+static void recordAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
LLVMContext &Ctx = M->getContext();
auto *IntTy = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
@@ -137,14 +139,12 @@ static bool lowerExecSyncGlobalVariables(Module &M, GVUsesInfoTy &GVUsesInfo) {
LLVM_DEBUG(GV->printAsOperand(dbgs(), false);
dbgs() << " was assigned barrier id: " << BarID
<< " id-count: " << BarCnt << "\n");
- // 4 bits for alignment, 5 bits for the barrier num,
- // 3 bits for the barrier scope
- Offset = 0x802000u | BarrierScope << 9 | BarID << 4;
+ Offset = BarID;
} else {
llvm_unreachable("Unhandled special variable type.");
}
- recordLDSAbsoluteAddress(&M, GV, Offset);
+ recordAbsoluteAddress(&M, GV, Offset);
}
// Also erase those special LDS variables from indirect_access.
@@ -214,15 +214,16 @@ static bool runLowerExecSyncGlobals(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = false;
Changed |=
- eliminateGVConstantExprUsesFromAllInstructions(M, isLDSVariableToLower);
+ eliminateGVConstantExprUsesFromAllInstructions(M, isNamedBarrierToLower);
// For each kernel, what variables does it access directly or through
// callees
- GVUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDSForLowering(CG, M);
+ GVUsesInfoTy BarrierUsesInfo =
+ getTransitiveUsesOfGV(CG, M, isNamedBarrierToLower);
- if (hasBarrierToLower(LDSUsesInfo)) {
+ if (hasBarrierToLower(BarrierUsesInfo)) {
// Special LDS variables need special address assignment
- Changed |= lowerExecSyncGlobalVariables(M, LDSUsesInfo);
+ Changed |= lowerExecSyncGlobalVariables(M, BarrierUsesInfo);
}
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index ef86d279d193b..12e2478e055b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -937,16 +937,6 @@ class AMDGPULowerModuleLDS {
for (auto &[F, Vars] : FunctionLDSUses)
AllLDSUses[F].insert(Vars.begin(), Vars.end());
- // Named barriers are handled by AMDGPULowerExecSync; filter them out.
- for (auto &[F, Vars] : AllLDSUses) {
- SmallVector<GlobalVariable *> Barriers;
- for (GlobalVariable *V : Vars)
- if (AMDGPU::isNamedBarrier(*V))
- Barriers.push_back(V);
- for (GlobalVariable *V : Barriers)
- Vars.erase(V);
- }
-
// Build reverse map: LDS variable -> functions that use it.
DenseMap<GlobalVariable *, SmallVector<Function *, 4>> VarToFuncs;
for (auto &[F, Vars] : AllLDSUses) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2863f263fcf94..0ab3ca0687608 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
@@ -277,7 +278,8 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
// Intercept LDS variables with known addresses
if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) {
if (std::optional<uint32_t> Address =
- AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(*GV)) {
+ AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GV, AMDGPUAS::LOCAL_ADDRESS)) {
auto *IntTy = Type::getInt32Ty(CV->getContext());
return AsmPrinter::lowerConstant(ConstantInt::get(IntTy, *Address),
BaseCV, Offset);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
index 3e8a75a7eb840..540e0981a95a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.cpp
@@ -15,6 +15,7 @@
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -97,17 +98,8 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- if (AMDGPU::isNamedBarrier(GV)) {
- std::optional<unsigned> BarAddr = getLDSAbsoluteAddress(GV);
- if (!BarAddr)
- llvm_unreachable("named barrier should have an assigned address");
- Entry.first->second = BarAddr.value();
- unsigned BarCnt = GV.getGlobalSize(DL) / 16;
- recordNumNamedBarriers(BarAddr.value(), BarCnt);
- return BarAddr.value();
- }
-
- std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
+ std::optional<uint32_t> MaybeAbs =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
if (MaybeAbs) {
// Absolute address LDS variables that exist prior to the LDS lowering
// pass raise a fatal error in that pass. These failure modes are only
@@ -165,6 +157,26 @@ unsigned AMDGPUMachineFunctionInfo::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
+unsigned
+AMDGPUMachineFunctionInfo::allocateBarrierGlobal(const DataLayout &DL,
+ const GlobalVariable &GV) {
+ assert(AMDGPU::isNamedBarrier(GV));
+ std::optional<unsigned> BarAddr =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::BARRIER);
+ if (!BarAddr)
+ llvm_unreachable("named barrier should have an assigned address");
+ if (*BarAddr == 0) {
+ // We cannot allow this because some places in CodeGen (rightfully) assume a
+ // GV address is never null. For example, there are no null checks on
+ // addrspacecast if the pointer is a GV pointer.
+ report_fatal_error(
+ "named barrier GV cannot be used to represent the NULL named barrier");
+ }
+ unsigned BarCnt = GV.getGlobalSize(DL) / 16;
+ recordNumNamedBarriers(BarAddr.value(), BarCnt);
+ return BarAddr.value();
+}
+
std::optional<uint32_t>
AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
@@ -182,8 +194,9 @@ AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(const Function &F) {
}
std::optional<uint32_t>
-AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(const GlobalValue &GV) {
- if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(const GlobalValue &GV,
+ unsigned AS) {
+ if (GV.getAddressSpace() != AS)
return {};
std::optional<ConstantRange> AbsSymRange = GV.getAbsoluteSymbolRange();
@@ -221,7 +234,8 @@ void AMDGPUMachineFunctionInfo::setDynLDSAlign(const Function &F,
const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
if (Dyn) {
unsigned Offset = LDSSize; // return this?
- std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+ std::optional<uint32_t> Expect =
+ get32BitAbsoluteAddress(GV, AMDGPUAS::LOCAL_ADDRESS);
if (!Expect || (Offset != *Expect)) {
report_fatal_error("Inconsistent metadata on dynamic LDS variable");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
index 36db6c2dd0d12..c65592bd965ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunctionInfo.h
@@ -82,7 +82,7 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt) {
NumNamedBarriers =
- std::max(NumNamedBarriers, ((GVAddr & 0x1ff) >> 4) + BarCnt - 1);
+ std::max(NumNamedBarriers, (GVAddr & 0x1ff) + BarCnt - 1);
}
uint32_t getNumNamedBarriers() const { return NumNamedBarriers; }
@@ -109,8 +109,13 @@ class AMDGPUMachineFunctionInfo : public MachineFunctionInfo {
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
Align Trailing);
+ unsigned allocateBarrierGlobal(const DataLayout &DL,
+ const GlobalVariable &GV);
+
static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
- static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
+
+ static std::optional<uint32_t> get32BitAbsoluteAddress(const GlobalValue &GV,
+ unsigned AS);
Align getDynLDSAlign() const { return DynLDSAlign; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index 5c2172ff26cef..33e598d781eca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#define DEBUG_TYPE "amdgpu-memory-utils"
@@ -59,6 +60,8 @@ static TargetExtType *getTargetExtType(const GlobalVariable &GV) {
}
TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
+ if (GV.getAddressSpace() != AMDGPUAS::BARRIER)
+ return nullptr;
if (TargetExtType *Ty = getTargetExtType(GV))
return Ty->getName() == "amdgcn.named.barrier" ? Ty : nullptr;
return nullptr;
@@ -275,15 +278,6 @@ GVUsesInfoTy getTransitiveUsesOfLDSForLowering(const CallGraph &CG, Module &M) {
if (IsDirectMapDynLDSGV)
continue;
- // TODO: Remove once barriers are no longer in the LDS AS.
- if (isNamedBarrier(*GV)) {
- if (IsAbsolute) {
- UsesInfo.DirectAccess[Fn].erase(GV);
- UsesInfo.IndirectAccess[Fn].erase(GV);
- }
- continue;
- }
-
if (HasAbsoluteGVs.has_value()) {
if (*HasAbsoluteGVs != IsAbsolute) {
reportFatalUsageError(
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 9867f5b02d7a7..6b0b360ebbd2e 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1132,10 +1132,6 @@ enum Type {
NAMED_BARRIER_LAST = 16,
};
-enum {
- BARRIER_SCOPE_WORKGROUP = 0,
-};
-
} // namespace Barrier
} // namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f03c5748455ee..087d96714f60d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -8415,9 +8416,11 @@ bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
// linker can assign their offsets.
if (AMDGPUTargetMachine::EnableObjectLinking) {
if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
- if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
- "when object linking is enabled");
+ if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ GVar->getAddressSpace() == AMDGPUAS::BARRIER) {
+ assert(GVar->isDeclaration() &&
+ "AS 3 & 13 GVs should be declaration here "
+ "when object linking is enabled");
return false;
}
}
@@ -9134,10 +9137,11 @@ SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
+ const bool IsLDS = (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER);
+
if (Subtarget->hasApertureRegs()) {
- const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
- ? AMDGPU::SRC_SHARED_BASE
- : AMDGPU::SRC_PRIVATE_BASE;
+ const unsigned ApertureRegNo =
+ IsLDS ? AMDGPU::SRC_SHARED_BASE : AMDGPU::SRC_PRIVATE_BASE;
assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
!Subtarget->hasGloballyAddressableScratch()) &&
"Cannot use src_private_base with globally addressable scratch!");
@@ -9159,8 +9163,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// implicit kernargs.
const Module *M = DAG.getMachineFunction().getFunction().getParent();
if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
- ImplicitParameter Param =
- (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+ ImplicitParameter Param = IsLDS ? SHARED_BASE : PRIVATE_BASE;
return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
}
@@ -9178,7 +9181,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = IsLDS ? 0x40 : 0x44;
SDValue Ptr =
DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
@@ -9234,10 +9237,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
- // flat -> local/private
+ // flat -> local/private/barrier
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
- DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS || DestAS == AMDGPUAS::BARRIER) {
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
@@ -9250,6 +9253,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
0);
Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
+ } else if (DestAS == AMDGPUAS::BARRIER) {
+ // flat -> barrier: sub the barrier AS offset.
+ Ptr = DAG.getNode(
+ ISD::SUB, SL, MVT::i32, Ptr,
+ DAG.getConstant(AMDGPUAS::BarrierAddrLDSOffset, SL, MVT::i32));
}
if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
@@ -9264,10 +9272,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
}
- // local/private -> flat
+ // local/private/barrier -> flat
if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS || SrcAS == AMDGPUAS::BARRIER) {
SDValue CvtPtr;
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
Subtarget->hasGloballyAddressableScratch()) {
@@ -9299,7 +9307,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
} else {
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
- CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+ if (SrcAS == AMDGPUAS::BARRIER) {
+ // barrier -> flat: add the barrier AS offset.
+ SDValue SrcOffset = DAG.getNode(
+ ISD::ADD, SL, MVT::i32, Src,
+ DAG.getConstant(AMDGPUAS::BarrierAddrLDSOffset, SL, MVT::i32));
+ CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, SrcOffset,
+ Aperture);
+ } else {
+ CvtPtr =
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ }
+
CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
}
@@ -9846,12 +9866,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
EVT PtrVT = Op.getValueType();
const GlobalValue *GV = GSD->getGlobal();
- if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ const unsigned AS = GSD->getAddressSpace();
+ if (((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::BARRIER) &&
shouldUseLDSConstAddress(GV)) ||
- GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
- GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- GV->hasExternalLinkage()) {
+ AS == AMDGPUAS::REGION_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
// HIP uses an unsized array `extern __shared__ T s[]` or similar
// zero-sized type in other languages to declare the dynamic shared
@@ -9871,7 +9890,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
}
- if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (AS == AMDGPUAS::BARRIER) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+ SIInstrInfo::MO_ABS32_LO);
+ return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0);
+ }
+
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
SIInstrInfo::MO_ABS32_LO);
return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
@@ -11842,7 +11867,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (isa<ConstantSDNode>(Op->getOperand(2))) {
uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
- BarID = (BarID >> 4) & 0x3F;
+ BarID = BarID & 0x3F;
Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
@@ -11850,13 +11875,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
} else {
Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
- SDValue M0Val;
- M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
- DAG.getShiftAmountConstant(4, MVT::i32, DL));
- M0Val = SDValue(
- DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
- DAG.getTargetConstant(0x3F, DL, MVT::i32)),
- 0);
+ SDValue M0Val =
+ SDValue(DAG.getMachineNode(
+ AMDGPU::S_AND_B32, DL, MVT::i32, Op->getOperand(2),
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
} else
Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
@@ -12464,12 +12487,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
BarVal = C->getZExtValue();
else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
- if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(
- *GA->getGlobal()))
+ if (auto Addr = AMDGPUMachineFunctionInfo::get32BitAbsoluteAddress(
+ *GA->getGlobal(), AMDGPUAS::BARRIER))
BarVal = *Addr + GA->getOffset();
if (BarVal) {
- unsigned BarID = (*BarVal >> 4) & 0x3F;
+ unsigned BarID = *BarVal & 0x3F;
Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
Ops.push_back(Chain);
auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
@@ -12489,12 +12512,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
? AMDGPU::S_BARRIER_INIT_M0
: AMDGPU::S_BARRIER_SIGNAL_M0;
- // extract the BarrierID from bits 4-9 of BarOp
- SDValue BarID;
- BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
- DAG.getShiftAmountConstant(4, MVT::i32, DL));
- BarID =
- SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
+ // extract the BarrierID from bits 0-5 of BarOp
+ SDValue BarID =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarOp,
DAG.getTargetConstant(0x3F, DL, MVT::i32)),
0);
// Member count should be put into M0[ShAmt:+6]
@@ -12539,8 +12559,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
break;
}
- // extract the BarrierID from bits 4-9 of the immediate
- unsigned BarID = (BarVal >> 4) & 0x3F;
+ // extract the BarrierID from bits 0-5 of the immediate
+ unsigned BarID = BarVal & 0x3F;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
Ops.push_back(Chain);
@@ -12555,12 +12575,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
break;
}
- // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
- SDValue M0Val;
- M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
- DAG.getShiftAmountConstant(4, MVT::i32, DL));
- M0Val =
- SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+ // extract the BarrierID from bits 0-5 of BarOp, copy to M0[5:0]
+ SDValue M0Val =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarOp,
DAG.getTargetConstant(0x3F, DL, MVT::i32)),
0);
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index a0a7fb62f9159..dfdea342aa8b1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -595,7 +595,7 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32DataTypes: RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16]>;
-def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6]>;
+def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6, p15]>;
def Reg32Types : RegisterTypes<!listconcat(Reg32DataTypes.types, Reg32PtrTypes.types)>;
def Reg64DataTypes: RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16]>;
def Reg64PtrTypes: RegisterTypes<[p0, p1, p4]>;
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..a846ad17ccf3c 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -274,8 +274,9 @@ static std::string computeAMDDataLayout(const Triple &TT) {
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
- "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
+ "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p15:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:"
+ "512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 0fab1395ffc6e..953959ad6a0c6 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -128,10 +128,10 @@ define i32 @s_get_barrier_state(i32 %bar) {
}
; CHECK-LABEL: for function 's_get_named_barrier_state':
-; CHECK: DIVERGENT: ptr addrspace(3) %bar
+; CHECK: DIVERGENT: ptr addrspace(15) %bar
; CHECK-NOT: DIVERGENT
-define i32 @s_get_named_barrier_state(ptr addrspace(3) %bar) {
- %result = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %bar)
+define i32 @s_get_named_barrier_state(ptr addrspace(15) %bar) {
+ %result = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %bar)
ret i32 %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
new file mode 100644
index 0000000000000..34f9a51a0455c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-barrier.ll
@@ -0,0 +1,442 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030,GFX1030-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030,GFX1030-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+define amdgpu_kernel void @barrier_to_generic(ptr addrspace(15) %bar, ptr %out) {
+; GFX942-SDAG-LABEL: barrier_to_generic:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_add_i32 s4, s0, 0x802000
+; GFX942-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: barrier_to_generic:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_add_u32 s0, s6, 0x802000
+; GFX942-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX942-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT: s_clause 0x1
+; GFX1030-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: s_add_i32 s4, s0, 0x802000
+; GFX1030-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1030-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT: s_clause 0x1
+; GFX1030-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX1030-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8
+; GFX1030-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT: s_add_u32 s0, s4, 0x802000
+; GFX1030-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_to_generic:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-SDAG-NEXT: s_clause 0x1
+; GFX1200-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: s_add_co_i32 s4, s0, 0x802000
+; GFX1200-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1200-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_to_generic:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_clause 0x1
+; GFX1200-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0
+; GFX1200-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
+; GFX1200-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: s_add_co_u32 s0, s6, 0x802000
+; GFX1200-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: barrier_to_generic:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s0, 0x802000
+; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s1, s1, 0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-SDAG-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: barrier_to_generic:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s6, 0x802000
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) %bar to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @barrier_gv_to_generic(ptr %out) {
+; GFX942-SDAG-LABEL: barrier_gv_to_generic:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x802001
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: barrier_gv_to_generic:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX942-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, 0x802001
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GFX1030-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1030-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, 0x802001 :: v_dual_mov_b32 v1, s1
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1200-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1200-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1200-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: barrier_gv_to_generic:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0x802001
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: barrier_gv_to_generic:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, 0x802001
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: flat_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) @bar to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+
+define amdgpu_kernel void @barrier_null_to_generic(ptr %out) {
+; GFX942-LABEL: barrier_null_to_generic:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX1030-SDAG-LABEL: barrier_null_to_generic:
+; GFX1030-SDAG: ; %bb.0:
+; GFX1030-SDAG-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-SDAG-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-SDAG-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX1030-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX1030-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-SDAG-NEXT: s_endpgm
+;
+; GFX1030-GISEL-LABEL: barrier_null_to_generic:
+; GFX1030-GISEL: ; %bb.0:
+; GFX1030-GISEL-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-GISEL-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-GISEL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1030-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX1030-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX1030-GISEL-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: barrier_null_to_generic:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s0
+; GFX1200-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: barrier_null_to_generic:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1200-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-LABEL: barrier_null_to_generic:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
+ %res = addrspacecast ptr addrspace(15) null to ptr
+ store ptr %res, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @generic_to_barrier(ptr %generic, ptr %out) {
+; GFX942-SDAG-LABEL: generic_to_barrier:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT: s_add_i32 s2, s0, 0xff7fe000
+; GFX942-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-SDAG-NEXT: s_cselect_b32 s0, s2, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-SDAG-NEXT: flat_store_dword v[0:1], v2
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: generic_to_barrier:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_add_i32 s4, s0, 0xff7fe000
+; GFX942-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX942-GISEL-NEXT: s_cselect_b32 s0, s4, 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX1030-LABEL: generic_to_barrier:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_add_u32 s12, s12, s17
+; GFX1030-NEXT: s_addc_u32 s13, s13, 0
+; GFX1030-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX1030-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: s_add_i32 s4, s0, 0xff7fe000
+; GFX1030-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s0
+; GFX1030-NEXT: flat_store_dword v[0:1], v2
+; GFX1030-NEXT: s_endpgm
+;
+; GFX1200-SDAG-LABEL: generic_to_barrier:
+; GFX1200-SDAG: ; %bb.0:
+; GFX1200-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1200-SDAG-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1200-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1200-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1200-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1200-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1200-SDAG-NEXT: flat_store_b32 v[0:1], v2
+; GFX1200-SDAG-NEXT: s_endpgm
+;
+; GFX1200-GISEL-LABEL: generic_to_barrier:
+; GFX1200-GISEL: ; %bb.0:
+; GFX1200-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1200-GISEL-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1200-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1200-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1200-GISEL-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s0
+; GFX1200-GISEL-NEXT: flat_store_b32 v[0:1], v2
+; GFX1200-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: generic_to_barrier:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-SDAG-NEXT: flat_store_b32 v0, v1, s[2:3]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: generic_to_barrier:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_i32 s4, s0, 0xff7fe000
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s4, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-GISEL-NEXT: flat_store_b32 v1, v0, s[2:3]
+; GFX1250-GISEL-NEXT: s_endpgm
+ %res = addrspacecast ptr %generic to ptr addrspace(15)
+ store ptr addrspace(15) %res, ptr %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1200: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
index 0f9cfb48b577d..4a7d7330200b7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-module-lds.ll
@@ -6,28 +6,28 @@
; where amdgpu-lower-module-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META3:![0-9]+]]
; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
;.
define void @func1() #0 {
; CHECK-LABEL: define void @func1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -35,14 +35,14 @@ define void @func1() #0 {
define void @func2() #0 {
; CHECK-LABEL: define void @func2(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: store i8 7, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
store i8 7, ptr addrspace(3) @lds1, align 4
ret void
@@ -52,20 +52,20 @@ define amdgpu_kernel void @kernel1() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: store i8 9, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -77,15 +77,15 @@ define amdgpu_kernel void @kernel2() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
; CHECK-SAME: ) #[[ATTR1]] {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: store i8 10, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
store i8 10, ptr addrspace(3) @lds1, align 4
@@ -95,13 +95,13 @@ define amdgpu_kernel void @kernel2() #0 {
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
@@ -113,8 +113,8 @@ attributes #2 = { nounwind readnone }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
-; CHECK: [[META2]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 7, i32 8}
+; CHECK: [[META2]] = !{i32 1, i32 2}
; CHECK: [[META3]] = !{i32 0, i32 1}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
index 16533e0a204d4..905b1174db711 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync-and-sw-lds.ll
@@ -4,24 +4,24 @@
; Test to ensure that LDS variables like named barriers are lowered correctly in asan scenario,
; where amdgpu-sw-lower-lds pass runs in pipeline after amdgpu-lower-exec-sync pass.
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
@lds1 = internal addrspace(3) global [1 x i8] poison, align 4
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META1:![0-9]+]]
;
define void @bar() #0 {
; CHECK-LABEL: define void @bar(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK: store i8 7, ptr addrspace(1) {{.*}}, align 4
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
store i8 7, ptr addrspace(3) @lds1, align 4
ret void
@@ -32,32 +32,21 @@ define amdgpu_kernel void @barkernel() #0 {
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] {
; CHECK: {{.*}} = call i64 @__asan_malloc_impl(i64 {{.*}}, i64 {{.*}})
; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
; CHECK: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK: call void @bar()
; CHECK: store i8 10, ptr addrspace(1) {{.*}}, align 4
; CHECK: call void @__asan_free_impl(i64 {{.*}}, i64 {{.*}})
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @bar()
store i8 10, ptr addrspace(3) @lds1, align 4
ret void
}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
-
attributes #0 = { nounwind sanitize_address }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
@@ -68,6 +57,6 @@ attributes #2 = { nounwind readnone }
; CHECK: attributes #[[ATTR0]] = { nounwind sanitize_address }
; CHECK: attributes #[[ATTR1]] = { nounwind sanitize_address "amdgpu-lds-size"="8" }
;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 1, i32 2}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
index ddf58900518b5..fd11c076eebcb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-lower-exec-sync.ll
@@ -4,37 +4,37 @@
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
;.
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
-; CHECK: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
;.
define void @func1() {
; CHECK-LABEL: define void @func1() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
define void @func2() {
; CHECK-LABEL: define void @func2() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -42,19 +42,19 @@ define void @func2() {
define amdgpu_kernel void @kernel1() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+; CHECK-NEXT: [[STATE:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -64,14 +64,14 @@ define amdgpu_kernel void @kernel1() #0 {
define amdgpu_kernel void @kernel2() #0 {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -81,13 +81,13 @@ define amdgpu_kernel void @kernel2() #0 {
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
@@ -96,7 +96,7 @@ attributes #2 = { nounwind readnone }
; CHECK: attributes #[[ATTR0]] = { nounwind }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
;.
-; CHECK: [[META0]] = !{i32 8396880, i32 8396881}
-; CHECK: [[META1]] = !{i32 8396912, i32 8396913}
-; CHECK: [[META2]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 5, i32 6}
+; CHECK: [[META1]] = !{i32 7, i32 8}
+; CHECK: [[META2]] = !{i32 1, i32 2}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index b86f621b9e0cf..489061d329320 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -489,8 +489,8 @@ attributes #1 = { nounwind }
; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
; HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
;.
-; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10}
-; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10}
-; HSA: [[META2]] = !{i32 2, i32 10}
-; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
+; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 16}
+; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 16}
+; HSA: [[META2]] = !{i32 2, i32 16}
+; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
index ccc434842e498..ccb34288d5845 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
@@ -153,7 +153,7 @@ attributes #0 = { "amdgpu-no-flat-scratch-init" }
; GFX10: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
;.
-; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
-; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 1e8a33781ee8d..bf2e44c641664 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -877,15 +877,15 @@ define amdgpu_kernel void @with_inline_asm() {
; GFX10: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
; GFX10: attributes #[[ATTR4]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" "target-cpu"="gfx1010" }
;.
-; GFX9: [[META0]] = !{i32 2, i32 10}
-; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX9: [[META0]] = !{i32 2, i32 16}
+; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX9: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX9: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX9: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
;.
-; GFX10: [[META0]] = !{i32 2, i32 10}
-; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
-; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 10}
-; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
-; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 10}
+; GFX10: [[META0]] = !{i32 2, i32 16}
+; GFX10: [[META1]] = !{i32 1, i32 2, i32 3, i32 16}
+; GFX10: [[META2]] = !{i32 1, i32 3, i32 4, i32 16}
+; GFX10: [[META3]] = !{i32 1, i32 4, i32 5, i32 16}
+; GFX10: [[META4]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
index d91b2117c7ad9..f699ba70bb935 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll
@@ -633,7 +633,7 @@ define amdgpu_kernel void @no_alias_addr_space_has_meta(ptr addrspace(3) %sptr,
!0 = !{i32 2, i32 3, i32 4, i32 10}
;.
-; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 10}
-; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 10}
+; CHECK: [[META0]] = !{i32 2, i32 3, i32 4, i32 5, i32 6, i32 16}
+; CHECK: [[META1]] = !{i32 2, i32 3, i32 5, i32 16}
; CHECK: [[META2]] = !{i32 2, i32 3, i32 4, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll b/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
new file mode 100644
index 0000000000000..fa8e079e55b7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/barrier-addrspace-dereference.ll
@@ -0,0 +1,16 @@
+; Check we cannot dereference a barrier GV.
+
+; RUN: not --crash llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefixes=DAGISEL %s
+; RUN: not llc -O0 -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefixes=GISEL %s
+
+; TODO: It'd be nicer to have a Verifier diagnostic for this.
+
+; DAGISEL: LLVM ERROR: {{.*}} store<(store (s32) into @bar, addrspace 15)>
+; GISEL: LLVM ERROR: {{.*}} G_LOAD %6:sgpr(p15) :: (load (s32) from @bar, addrspace 15) (in function: func1)
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+define amdgpu_kernel void @func1() {
+ %val = load i32, ptr addrspace(15) @bar
+ store i32 %val, ptr addrspace(15) @bar
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
index f573a3180c067..5f21822874683 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-codegen-named-barrier.ll
@@ -7,10 +7,9 @@
; 3. group_segment_fixed_size = 0 (linker patches it)
; 4. Named barrier is emitted as an SHN_AMDGPU_LDS symbol (.amdgpu_lds)
- at bar = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
; CHECK-LABEL: kernel:
-; CHECK: s_lshr_b32 s{{[0-9]+}}, __amdgpu_named_barrier.bar{{[^ @]*}}@abs32 at lo, 4
; CHECK: s_barrier_join m0
; CHECK: s_barrier_signal m0
; CHECK: s_barrier_wait 1
@@ -26,8 +25,6 @@
; CHECK: .amdgpu_call helper
; CHECK: .end_amdgpu_info
-; CHECK: .amdgpu_lds __amdgpu_named_barrier.bar{{[^ ,]*}}, 32, 4
-
; ELF: Section {
; ELF: Name: .amdgpu.info
; ELF: Type: SHT_PROGBITS
@@ -39,16 +36,16 @@
; ELF-DAG: R_AMDGPU_ABS64 helper
define amdgpu_kernel void @kernel() {
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @helper()
ret void
}
declare void @helper()
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #0
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #0
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #0
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #0
declare void @llvm.amdgcn.s.barrier.wait(i16) #0
attributes #0 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
index 62b32901d281d..9ec6fc523dc63 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-link-time-named-barrier.ll
@@ -6,21 +6,21 @@
; 2. AMDGPULowerModuleLDS does not handle named barriers at all
; 3. amdgpu.lds.uses does NOT contain barrier entries
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
@lds = internal addrspace(3) global [4 x i32] poison, align 4
; Internal named barrier becomes external with a module-unique hash suffix.
-; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(3) global target("amdgcn.named.barrier", 0)
+; CHECK: @[[BAR:__amdgpu_named_barrier\.bar\.[a-f0-9]+]] = external dso_local addrspace(15) global target("amdgcn.named.barrier", 0)
; CHECK-NOT: !absolute_symbol
; Regular LDS is packed into the per-function struct (external, for linker).
; CHECK: @__amdgpu_lds.kernel = external dso_local addrspace(3) global %__amdgpu_lds.kernel.t, align 16
define amdgpu_kernel void @kernel(i32 %idx) {
; CHECK-LABEL: define amdgpu_kernel void @kernel(
-; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @[[BAR]])
-; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @[[BAR]], i32 3)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 3)
+; CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @[[BAR]])
+; CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @[[BAR]], i32 3)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
%gep = getelementptr [4 x i32], ptr addrspace(3) @lds, i32 0, i32 %idx
store i32 42, ptr addrspace(3) %gep, align 4
@@ -29,7 +29,7 @@ define amdgpu_kernel void @kernel(i32 %idx) {
; Named barrier metadata: (barrier_sym, func1, ...) -- emitted by ExecSync.
; CHECK-DAG: !amdgpu.named_barrier.uses = !{[[BAR_MD:![0-9]+]]}
-; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(3) @[[BAR]], ptr @kernel}
+; CHECK-DAG: [[BAR_MD]] = !{ptr addrspace(15) @[[BAR]], ptr @kernel}
; LDS metadata must have exactly one entry (the LDS struct), no barrier entries.
; CHECK-DAG: !amdgpu.lds.uses = !{[[LDS_MD:![0-9]+]]}
; CHECK-DAG: [[LDS_MD]] = !{ptr @kernel, ptr addrspace(3) @__amdgpu_lds.kernel}
diff --git a/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll b/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
new file mode 100644
index 0000000000000..12be207e7e884
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/null-named-barrier-gv.ll
@@ -0,0 +1,31 @@
+; RUN: split-file %s %t
+
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-kernel.ll 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-kernel.ll 2>&1 | FileCheck %s
+
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-func.ll 2>&1 | FileCheck %s
+; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %t/null-named-barrier-func.ll 2>&1 | FileCheck %s
+
+; CHECK: named barrier GV cannot be used to represent the NULL named barrier
+
+;--- null-named-barrier-kernel.ll
+
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+
+define amdgpu_kernel void @func1() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ ret void
+}
+
+!0 = !{ i32 0, i32 1 }
+
+;--- null-named-barrier-func.ll
+
+ at bar = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
+
+define void @func1() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ ret void
+}
+
+!0 = !{ i32 0, i32 1 }
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
new file mode 100644
index 0000000000000..ae18ab914cc49
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-bad-absolute-symbol.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+
+; The absolute_address of the GV can never be null.
+
+; CHECK: LLVM ERROR: named barrier GV cannot be used to represent the NULL named barrie
+
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+
+define void @func() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ ret void
+}
+
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
new file mode 100644
index 0000000000000..ae5731b875f37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering-wrong-gv-signature.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+; RUN: not llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
+
+; Check what happens when the type or the AS of the barrier GV is wrong.
+; Using such a GV in a barrier intrinsic would be UB of course, but we should not crash.
+
+; @addrspacecasted doesn't trip up on this because we are not doing any unsupported
+; operation.
+;
+; CHECK: in function wrong_type void (): Unsupported use of BARRIER address space!
+
+ at bar = internal global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global i32 poison
+
+define void @addrspacecasted() {
+ %bar.ascast = addrspacecast ptr @bar to ptr addrspace(15)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %bar.ascast)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar.ascast, i32 7)
+ ret void
+}
+
+define void @wrong_type() {
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 78c134827be4d..d3b5dcaebfb68 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -3,26 +3,30 @@
%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
- at bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
+ at bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
-; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
-; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
-; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; Test using the workgroup barrier with the GV.
+ at wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+
+; CHECK: @bar2 = internal addrspace(15) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK-NEXT: @bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META1:![0-9]+]]
+; CHECK-NEXT: @bar1 = internal addrspace(15) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol [[META2:![0-9]+]]
+; CHECK-NEXT: @wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META3:![0-9]+]]
; SOUT: .set .Lfunc1.num_named_barrier, 7
define void @func1() {
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
; SOUT: .set .Lfunc2.num_named_barrier, 6
define void @func2() {
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -30,44 +34,44 @@ define void @func2() {
; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel1.num_named_barrier, max(4, .Lfunc1.num_named_barrier, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel1() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 11)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
ret void
}
-; SOUT: .amdhsa_named_barrier_count 2
+; SOUT: .amdhsa_kernel kernel2
+; SOUT: .amdhsa_named_barrier_count 2
; SOUT: .set .Lkernel2.num_named_barrier, max(4, .Lfunc2.num_named_barrier)
define amdgpu_kernel void @kernel2() #0 {
-; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar1)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
ret void
}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
-declare void @llvm.amdgcn.s.barrier.leave(i16) #1
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+; SOUT: .amdhsa_kernel wgbarr_as_gv
+; SOUT: .amdhsa_named_barrier_count 0
+define amdgpu_kernel void @wgbarr_as_gv() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @wgbarr, i32 7)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
-; CHECK: !0 = !{i32 8396880, i32 8396881}
-; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
-; CHECK-NEXT: !2 = !{i32 8396816, i32 8396817}
+!0 = !{i32 -1, i32 0}
+
+; CHECK: [[META0:![0-9]+]] = !{i32 5, i32 6}
+; CHECK-NEXT: [[META1:![0-9]+]] = !{i32 7, i32 8}
+; CHECK-NEXT: [[META2:![0-9]+]] = !{i32 1, i32 2}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 495b7b02223e5..3dafedaf22813 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -2,9 +2,12 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar3 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+
+; Test using the workgroup barrier with the GV.
+ at wgbarr = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
define void @func1() {
; GFX12-SDAG-LABEL: func1:
@@ -33,8 +36,8 @@ define void @func1() {
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar3, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
@@ -66,13 +69,13 @@ define void @func2() {
; GFX12-GISEL-NEXT: s_barrier_signal m0
; GFX12-GISEL-NEXT: s_barrier_wait 1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX12-SDAG-LABEL: kernel1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -85,9 +88,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
@@ -141,9 +143,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
@@ -188,17 +189,17 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
; GFX12-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
- call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %in, i32 9)
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %in)
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @llvm.amdgcn.s.barrier.leave(i16 1)
- %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
- %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) @bar)
+ %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %in)
call void @llvm.amdgcn.s.barrier()
call void @func1()
call void @func2()
@@ -206,7 +207,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
-define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX12-SDAG-LABEL: kernel2:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
@@ -250,8 +251,8 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
; GFX12-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -268,11 +269,11 @@ define void @signal_var_cnt0_const_bar() {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_barrier_signal 1
; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 0)
ret void
}
-define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
+define void @signal_var_cnt0_dynamic_bar(ptr addrspace(15) inreg %bar) {
; GFX12-LABEL: signal_var_cnt0_dynamic_bar:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -280,12 +281,10 @@ define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s0, s0, 4
-; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_b32 m0, s0, 63
; GFX12-NEXT: s_barrier_signal m0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %bar, i32 0)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %bar, i32 0)
ret void
}
@@ -308,17 +307,41 @@ define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrs
ret void
}
+
+define amdgpu_kernel void @wgbarr_as_gv() {
+; GFX12-LABEL: wgbarr_as_gv:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_mov_b32 m0, 0x7003f
+; GFX12-NEXT: s_barrier_signal m0
+; GFX12-NEXT: s_barrier_wait -1
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @wgbarr, i32 7)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ ret void
+}
+
+define amdgpu_kernel void @null_barrier() {
+; GFX12-LABEL: null_barrier:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_barrier_join 0
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) null)
+ ret void
+}
+
declare void @llvm.amdgcn.s.barrier() #1
declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
-declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(15), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(15)) #1
declare void @llvm.amdgcn.s.barrier.leave(i16) #1
declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
-declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
+
+!0 = !{i32 -1, i32 0}
diff --git a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
index 2ecb380e6c37e..5d81f6b649de9 100644
--- a/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-wakeup-barrier.ll
@@ -2,9 +2,9 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-SDAG %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-GISEL %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) #0 {
; GFX1250-SDAG-LABEL: kernel1:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -12,8 +12,6 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX1250-SDAG-NEXT: s_mov_b32 m0, 1
; GFX1250-SDAG-NEXT: s_wakeup_barrier m0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_lshr_b32 s0, s0, 4
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_and_b32 m0, s0, 63
; GFX1250-SDAG-NEXT: s_wakeup_barrier m0
; GFX1250-SDAG-NEXT: s_endpgm
@@ -24,18 +22,16 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c nv
; GFX1250-GISEL-NEXT: s_wakeup_barrier 1
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: s_lshr_b32 s0, s0, 4
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_and_b32 m0, s0, 63
; GFX1250-GISEL-NEXT: s_wakeup_barrier m0
; GFX1250-GISEL-NEXT: s_endpgm
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %in)
ret void
}
-declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15)) #1
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index cb52a639d12bd..b65f35394125d 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -59,5 +59,5 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-no-wwm" }
;.
-; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 16}
;.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 1149725a24c50..d1960c77b27c8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -551,16 +551,17 @@ def ROCDL_WaveBarrierOp : ROCDL_ConcreteNonMemIntrOp<"wave.barrier", [], 0> {
def ROCDLGlobalBuffer : LLVM_PointerInAddressSpace<1>;
def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
+def ROCDLExecSync : LLVM_PointerInAddressSpace<15>;
def ROCDL_BarrierInitOp : ROCDL_IntrOp<"s.barrier.init", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$memberCnt)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
let description = [{
Available on gfx1250+.
Example:
```mlir
// Initialize a named barrier with member count.
- rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+ rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
```
}];
let results = (outs);
@@ -583,7 +584,7 @@ def ROCDL_BarrierSignalOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal", [], 0
}
def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$memberCnt)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
let description = [{
Available on gfx1250+.
@@ -592,7 +593,7 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
Example:
```mlir
// Signal a named barrier with variable ID.
- rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+ rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
```
}];
let results = (outs);
@@ -600,14 +601,14 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
}
def ROCDL_BarrierJoinOp : ROCDL_IntrOp<"s.barrier.join", [], [], [], 0>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
let description = [{
Available on gfx1250+.
Example:
```mlir
// Join a named barrier.
- rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+ rocdl.s.barrier.join %ptr : !llvm.ptr<15>
```
}];
let results = (outs);
@@ -675,14 +676,14 @@ def ROCDL_GetBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.barrier.state",
}
def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barrier.state", [], 1, [], []>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
let description = [{
Available on gfx1250+.
Example:
```mlir
// Query named barrier state by pointer.
- %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+ %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
```
}];
let results = (outs I32:$res);
@@ -690,7 +691,7 @@ def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barri
}
def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0, [], []>,
- Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
let description = [{
Wakes up waves associated with a given named barrier. Note, This op does not release waves waiting
at the barrier. It just signal other waves in the same work-group waiting on the indicated named barrier
@@ -700,7 +701,7 @@ def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0
Example:
```mlir
// Wake up waves waiting on a named barrier.
- rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+ rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
```
}];
let assemblyFormat = "$ptr attr-dict `:` qualified(type($ptr))";
@@ -1507,7 +1508,7 @@ class ROCDL_Sudot_IntrOp<string mnemonic> :
the same for `b`. `clamp` controls result clamping.
These ops correspond to RDNA's unified mixed-sign `v_dot4_i32_iu8`
- and `v_dot8_i32_iu4` instructions (gfx11+).
+ and `v_dot8_i32_iu4` instructions (gfx11+).
Example:
```mlir
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 5273c955c0121..44c57ec3bf606 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -1163,10 +1163,10 @@ llvm.func @rocdl.s.barrier() {
llvm.return
}
-llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.init
- // CHECK: rocdl.s.barrier.init %{{.*}} member_cnt = 1 : !llvm.ptr<3>
- rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: rocdl.s.barrier.init %{{.*}} member_cnt = 1 : !llvm.ptr<15>
+ rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
@@ -1177,17 +1177,17 @@ llvm.func @rocdl.s.barrier.signal() {
llvm.return
}
-llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.signal.var
- // CHECK: rocdl.s.barrier.signal.var %{{.*}} member_cnt = 1 : !llvm.ptr<3>
- rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: rocdl.s.barrier.signal.var %{{.*}} member_cnt = 1 : !llvm.ptr<15>
+ rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
-llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.join
- // CHECK: rocdl.s.barrier.join %{{.*}} : !llvm.ptr<3>
- rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+ // CHECK: rocdl.s.barrier.join %{{.*}} : !llvm.ptr<15>
+ rocdl.s.barrier.join %ptr : !llvm.ptr<15>
llvm.return
}
@@ -1219,17 +1219,17 @@ llvm.func @rocdl.s.get.barrier.state() {
llvm.return
}
-llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.get.named.barrier.state
- // CHECK: rocdl.s.get.named.barrier.state %{{.*}} : !llvm.ptr<3> -> i32
- %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+ // CHECK: rocdl.s.get.named.barrier.state %{{.*}} : !llvm.ptr<15> -> i32
+ %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
llvm.return
}
-llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.wakeup.barrier
- // CHECK: rocdl.s.wakeup.barrier %{{.*}} : !llvm.ptr<3>
- rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+ // CHECK: rocdl.s.wakeup.barrier %{{.*}} : !llvm.ptr<15>
+ rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 29e81c8208243..b0d3c0d551eb9 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -269,10 +269,10 @@ llvm.func @rocdl.wave_barrier() {
llvm.return
}
-llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.init
- // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %{{.*}}, i32 1)
- rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(15) %{{.*}}, i32 1)
+ rocdl.s.barrier.init %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
@@ -283,17 +283,17 @@ llvm.func @rocdl.s.barrier.signal() {
llvm.return
}
-llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.signal.var
- // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %{{.*}}, i32 1)
- rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) %{{.*}}, i32 1)
+ rocdl.s.barrier.signal.var %ptr member_cnt = 1 : !llvm.ptr<15>
llvm.return
}
-llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.barrier.join
- // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %{{.*}})
- rocdl.s.barrier.join %ptr : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) %{{.*}})
+ rocdl.s.barrier.join %ptr : !llvm.ptr<15>
llvm.return
}
@@ -325,17 +325,17 @@ llvm.func @rocdl.s.get.barrier.state() {
llvm.return
}
-llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.get.named.barrier.state(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.get.named.barrier.state
- // CHECK: %{{.*}} = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %{{.*}})
- %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<3> -> i32
+ // CHECK: %{{.*}} = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(15) %{{.*}})
+ %0 = rocdl.s.get.named.barrier.state %ptr : !llvm.ptr<15> -> i32
llvm.return
}
-llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<3>) {
+llvm.func @rocdl.s.wakeup.barrier(%ptr : !llvm.ptr<15>) {
// CHECK-LABEL: rocdl.s.wakeup.barrier
- // CHECK: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %{{.*}})
- rocdl.s.wakeup.barrier %ptr : !llvm.ptr<3>
+ // CHECK: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(15) %{{.*}})
+ rocdl.s.wakeup.barrier %ptr : !llvm.ptr<15>
llvm.return
}
>From c9f6a05b51855457243bcb150be012f5dcf8e9ad Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 1 Jun 2026 10:49:30 +0200
Subject: [PATCH 2/2] Fix MLIR
---
.../CodeGen/AMDGPU/s-barrier-id-allocation.ll | 42 +++++++++----------
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 14 ++++---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 +-
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 4 +-
.../gpu-to-rocdl-barriers-gfx12.mlir | 8 ++--
5 files changed, 36 insertions(+), 34 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll
index 6df651b033a5f..892be0e7a3a32 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-id-allocation.ll
@@ -1,42 +1,42 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-exec-sync < %s 2>&1 | FileCheck %s
- at bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
- at bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+ at bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
+ at bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison
;.
-; CHECK: @bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
-; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0]]
+; CHECK: @bar = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @bar2 = internal addrspace(15) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol [[META0]]
;.
define void @func1() {
; CHECK-LABEL: define void @func1() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
define void @func2() {
; CHECK-LABEL: define void @func2() {
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
-; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
; CHECK-NEXT: ret void
;
- call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
- call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(15) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(15) @bar2, i32 7)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
-define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(15) %in) {
; CHECK-LABEL: define amdgpu_kernel void @kernel1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: [[STATE3:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
; CHECK-NEXT: ret void
@@ -46,9 +46,9 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
-define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(15) %in) {
; CHECK-LABEL: define amdgpu_kernel void @kernel2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
; CHECK-NEXT: call void @func1()
; CHECK-NEXT: ret void
;
@@ -56,9 +56,9 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
-define amdgpu_kernel void @kernel3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel3(ptr addrspace(1) %out, ptr addrspace(15) %in) {
; CHECK-LABEL: define amdgpu_kernel void @kernel3(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: [[STATE3:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
; CHECK-NEXT: ret void
@@ -68,9 +68,9 @@ define amdgpu_kernel void @kernel3(ptr addrspace(1) %out, ptr addrspace(3) %in)
ret void
}
-define amdgpu_kernel void @kernel4(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+define amdgpu_kernel void @kernel4(ptr addrspace(1) %out, ptr addrspace(15) %in) {
; CHECK-LABEL: define amdgpu_kernel void @kernel4(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(15) [[IN:%.*]]) {
; CHECK-NEXT: call void @func2()
; CHECK-NEXT: ret void
;
@@ -82,5 +82,5 @@ define amdgpu_kernel void @kernel4(ptr addrspace(1) %out, ptr addrspace(3) %in)
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
;.
-; CHECK: [[META0]] = !{i32 8396816, i32 8396817}
+; CHECK: [[META0]] = !{i32 1, i32 2}
;.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index d1960c77b27c8..4eaa7e953d4ef 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -135,6 +135,8 @@ def ROCDL_Dialect : Dialect {
static constexpr unsigned kConstantMemoryAddressSpace = 4;
/// The address space value that represents private memory.
static constexpr unsigned kPrivateMemoryAddressSpace = 5;
+ /// The address space value that represents barriers.
+ static constexpr unsigned kBarrierAddressSpace = 15;
}];
let discardableAttrs = (ins
@@ -551,10 +553,10 @@ def ROCDL_WaveBarrierOp : ROCDL_ConcreteNonMemIntrOp<"wave.barrier", [], 0> {
def ROCDLGlobalBuffer : LLVM_PointerInAddressSpace<1>;
def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
-def ROCDLExecSync : LLVM_PointerInAddressSpace<15>;
+def ROCDLBarrier : LLVM_PointerInAddressSpace<15>;
def ROCDL_BarrierInitOp : ROCDL_IntrOp<"s.barrier.init", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
- Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
+ Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr, I32Attr:$memberCnt)> {
let description = [{
Available on gfx1250+.
@@ -584,7 +586,7 @@ def ROCDL_BarrierSignalOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal", [], 0
}
def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], 0, 0, 0, 0, [1], ["memberCnt"]>,
- Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr, I32Attr:$memberCnt)> {
+ Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr, I32Attr:$memberCnt)> {
let description = [{
Available on gfx1250+.
@@ -601,7 +603,7 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [],
}
def ROCDL_BarrierJoinOp : ROCDL_IntrOp<"s.barrier.join", [], [], [], 0>,
- Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr)> {
let description = [{
Available on gfx1250+.
@@ -676,7 +678,7 @@ def ROCDL_GetBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.barrier.state",
}
def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barrier.state", [], 1, [], []>,
- Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr)> {
let description = [{
Available on gfx1250+.
@@ -691,7 +693,7 @@ def ROCDL_GetNamedBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.named.barri
}
def ROCDL_WakeupBarrierOp : ROCDL_ConcreteNonMemIntrOp<"s.wakeup.barrier", [], 0, [], []>,
- Arguments<(ins Arg<ROCDLExecSync, "", []>:$ptr)> {
+ Arguments<(ins Arg<ROCDLBarrier, "", []>:$ptr)> {
let description = [{
Wakes up waves associated with a given named barrier. Note, This op does not release waves waiting
at the barrier. It just signal other waves in the same work-group waiting on the indicated named barrier
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 48bd89b3a2fb7..00f33faa343c7 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -4516,7 +4516,7 @@ void mlir::amdgpu::populateCommonGPUTypeAndAttributeConversions(
});
typeConverter.addConversion([](gpu::NamedBarrierType type) {
return LLVM::LLVMPointerType::get(
- type.getContext(), ROCDL::ROCDLDialect::kSharedMemoryAddressSpace);
+ type.getContext(), ROCDL::ROCDLDialect::kBarrierAddressSpace);
});
}
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index a3819df4f8a84..7865a2a10aa0e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -683,7 +683,7 @@ struct GPUInitializeNamedBarrierOpLowering final
auto targetTy = LLVM::LLVMTargetExtType::get(
rewriter.getContext(), "amdgcn.named.barrier", {}, {0});
- auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 3);
+ auto ptrTy = LLVM::LLVMPointerType::get(rewriter.getContext(), 15);
// Build the global detached so SymbolTable::insert can both place it and
// rename it as needed without creating a transient name conflict in IR.
@@ -691,7 +691,7 @@ struct GPUInitializeNamedBarrierOpLowering final
auto globalOp = LLVM::GlobalOp::create(
detachedBuilder, loc, targetTy, /*isConstant=*/false,
LLVM::Linkage::Internal, "__named_barrier", /*value=*/Attribute(),
- /*alignment=*/0, /*addrSpace=*/3);
+ /*alignment=*/0, /*addrSpace=*/15);
// Initialize with poison.
{
Region ®ion = globalOp.getInitializerRegion();
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir
index c6a9574ca43c1..402dcae5e9832 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-barriers-gfx12.mlir
@@ -5,7 +5,7 @@ gpu.module @test_module {
// CHECK-LABEL: func @named_barrier
func.func @named_barrier() {
%member_count = arith.constant 4 : i32
- // CHECK: %[[ADDR:.*]] = llvm.mlir.addressof @[[NB:__named_barrier[_0-9]*]] : !llvm.ptr<3>
+ // CHECK: %[[ADDR:.*]] = llvm.mlir.addressof @[[NB:__named_barrier[_0-9]*]] : !llvm.ptr<15>
// CHECK: rocdl.s.barrier.init %[[ADDR]] member_cnt = 4
%nb = gpu.initialize_named_barrier %member_count : i32 -> !gpu.named_barrier
// CHECK: llvm.fence syncscope("workgroup") release
@@ -21,10 +21,10 @@ func.func @named_barrier() {
func.func @two_named_barriers() {
%c4 = arith.constant 4 : i32
%c8 = arith.constant 8 : i32
- // CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @[[NB0:__named_barrier[_0-9]*]] : !llvm.ptr<3>
+ // CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @[[NB0:__named_barrier[_0-9]*]] : !llvm.ptr<15>
// CHECK: rocdl.s.barrier.init %[[ADDR0]] member_cnt = 4
%nb0 = gpu.initialize_named_barrier %c4 : i32 -> !gpu.named_barrier
- // CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @[[NB1:__named_barrier[_0-9]*]] : !llvm.ptr<3>
+ // CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @[[NB1:__named_barrier[_0-9]*]] : !llvm.ptr<15>
// CHECK: rocdl.s.barrier.init %[[ADDR1]] member_cnt = 8
%nb1 = gpu.initialize_named_barrier %c8 : i32 -> !gpu.named_barrier
// CHECK: rocdl.s.barrier.join %[[ADDR0]]
@@ -49,6 +49,6 @@ func.func @cluster_scope() {
}
// One LDS global per gpu.initialize_named_barrier.
-// CHECK-COUNT-3: llvm.mlir.global internal @__named_barrier{{[_0-9]*}}() {addr_space = 3 : i32} : !llvm.target<"amdgcn.named.barrier", 0>
+// CHECK-COUNT-3: llvm.mlir.global internal @__named_barrier{{[_0-9]*}}() {addr_space = 15 : i32} : !llvm.target<"amdgcn.named.barrier", 0>
}
More information about the llvm-branch-commits
mailing list