[clang] [llvm] [AMDGPU] Introduce asyncmark/wait intrinsics (PR #173259)
Sameer Sahasrabuddhe via cfe-commits
cfe-commits at lists.llvm.org
Tue Jan 20 02:26:20 PST 2026
https://github.com/ssahasra updated https://github.com/llvm/llvm-project/pull/173259
>From 876159572be266ecd3eab7a8820d8dfa41ae1b65 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 20 Jan 2026 12:11:37 +0530
Subject: [PATCH] [AMDGPU] Introduce asyncmark/wait intrinsics
Asynchronous operations are memory transfers (usually between the global memory
and LDS) that are completed independently at an unspecified scope. A thread that
requests one or more asynchronous transfers can use *async marks* to track their
completion. The thread waits for each mark to be *completed*, which indicates
that requests initiated in program order before this mark have also completed.
For now, we implement asyncmark/wait operations on pre-GFX12 architectures that
support "LDS DMA" operations. Future work will extend support to GFX12Plus
architectures that support "true" async operations.
Co-authored-by: Ryan Mitchell <ryan.mitchell at amd.com>
Co-authored-by: Nicolai Haehnle <nicolai.haehnle at amd.com>
Fixes: SWDEV-521121
---
clang/include/clang/Basic/BuiltinsAMDGPU.td | 12 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 5 +
clang/lib/Sema/SemaAMDGPU.cpp | 6 +-
.../builtins-amdgcn-asyncmark-errs.cl | 7 +
.../builtins-amdgcn-asyncmark.cl | 16 +
.../builtins-amdgcn-global-load-lds.cl | 12 +
.../builtins-amdgcn-load-to-lds.cl | 12 +
.../builtins-amdgcn-raw-buffer-load-lds.cl | 6 +-
llvm/docs/AMDGPUAsyncOperations.rst | 236 ++++++
llvm/docs/UserGuides.rst | 4 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 32 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 9 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 29 +
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 16 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 20 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 +
llvm/lib/Target/AMDGPU/BUFInstructions.td | 20 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 25 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 43 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 280 ++++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +
llvm/lib/Target/AMDGPU/SOPInstructions.td | 12 +-
.../test/CodeGen/AMDGPU/async-buffer-loads.ll | 58 ++
llvm/test/CodeGen/AMDGPU/asyncmark-err.ll | 19 +
.../CodeGen/AMDGPU/asyncmark-max-pregfx12.ll | 285 +++++++
.../test/CodeGen/AMDGPU/asyncmark-pregfx12.ll | 751 ++++++++++++++++++
.../hazard-flat-instruction-valu-check.mir | 4 +-
.../AMDGPU/insert-waitcnts-fence-soft.mir | 20 +-
llvm/test/CodeGen/AMDGPU/lds-dma-hazards.mir | 4 +-
llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir | 14 +-
.../AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll | 24 +
.../AMDGPU/sched.group.classification.mir | 8 +-
33 files changed, 1913 insertions(+), 84 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl
create mode 100644 llvm/docs/AMDGPUAsyncOperations.rst
create mode 100644 llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/asyncmark-err.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 12ffad305e7c0..16dd7f7fad696 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -190,7 +190,17 @@ def __builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f64 : AMDGPUBuiltin<"double(doub
def __builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64 : AMDGPUBuiltin<"double(double, __amdgpu_buffer_rsrc_t, int, int, _Constant int)", [], "atomic-fmin-fmax-global-f64">;
def __builtin_amdgcn_raw_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
+def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
+def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">;
+
+//===----------------------------------------------------------------------===//
+// Async mark builtins.
+//===----------------------------------------------------------------------===//
+
+// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
+def __builtin_amdgcn_asyncmark : AMDGPUBuiltin<"void()", [], "vmem-to-lds-load-insts">;
+def __builtin_amdgcn_wait_asyncmark : AMDGPUBuiltin<"void(_Constant unsigned short)", [], "vmem-to-lds-load-insts">;
//===----------------------------------------------------------------------===//
// Ballot builtins.
@@ -288,7 +298,9 @@ def __builtin_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_ExtVector<2, sh
def __builtin_amdgcn_ds_atomic_fadd_v2bf16 : AMDGPUBuiltin<"_ExtVector<2, short>(_ExtVector<2, short address_space<3> *>, _ExtVector<2, short>)", [CustomTypeChecking], "atomic-ds-pk-add-16-insts">;
def __builtin_amdgcn_ds_atomic_fadd_v2f16 : AMDGPUBuiltin<"_ExtVector<2, _Float16>(_ExtVector<2, _Float16 address_space<3> *>, _ExtVector<2, _Float16>)", [CustomTypeChecking], "atomic-ds-pk-add-16-insts">;
def __builtin_amdgcn_load_to_lds : AMDGPUBuiltin<"void(void *, void address_space<3> *, _Constant unsigned int, _Constant int, _Constant unsigned int)", [], "vmem-to-lds-load-insts">;
+def __builtin_amdgcn_load_async_to_lds : AMDGPUBuiltin<"void(void *, void address_space<3> *, _Constant unsigned int, _Constant int, _Constant unsigned int)", [], "vmem-to-lds-load-insts">;
def __builtin_amdgcn_global_load_lds : AMDGPUBuiltin<"void(void address_space<1> *, void address_space<3> *, _Constant unsigned int, _Constant int, _Constant unsigned int)", [], "vmem-to-lds-load-insts">;
+def __builtin_amdgcn_global_load_async_lds : AMDGPUBuiltin<"void(void address_space<1> *, void address_space<3> *, _Constant unsigned int, _Constant int, _Constant unsigned int)", [], "vmem-to-lds-load-insts">;
//===----------------------------------------------------------------------===//
// Deep learning builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index a8a5bc348f00c..5162e084013b3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -842,6 +842,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return emitBuiltinWithOneOverloadedType<5>(*this, E,
Intrinsic::amdgcn_load_to_lds);
}
+ case AMDGPU::BI__builtin_amdgcn_load_async_to_lds: {
+ // FIXME: Decide whether this needs asan instrumentation, as done for the non-async load_to_lds path.
+ return emitBuiltinWithOneOverloadedType<5>(
+ *this, E, Intrinsic::amdgcn_load_async_to_lds);
+ }
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index b6eebf35296ef..58b34a41cb459 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -38,9 +38,13 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_async_lds:
case AMDGPU::BI__builtin_amdgcn_struct_ptr_buffer_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_struct_ptr_buffer_load_async_lds:
case AMDGPU::BI__builtin_amdgcn_load_to_lds:
- case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
+ case AMDGPU::BI__builtin_amdgcn_load_async_to_lds:
+ case AMDGPU::BI__builtin_amdgcn_global_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_global_load_async_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
Expr *ArgExpr = TheCall->getArg(SizeIdx);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl
new file mode 100644
index 0000000000000..7d4a141fbde6e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl
@@ -0,0 +1,7 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -verify -S -o - %s
+
+void test_feature() {
+ __builtin_amdgcn_asyncmark(); // expected-error{{'__builtin_amdgcn_asyncmark' needs target feature vmem-to-lds-load-insts}}
+ __builtin_amdgcn_wait_asyncmark(0); // expected-error{{'__builtin_amdgcn_wait_asyncmark' needs target feature vmem-to-lds-load-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl
new file mode 100644
index 0000000000000..976ae3cea5d6d
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl
@@ -0,0 +1,16 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+// CHECK-LABEL: @test_invocation(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.asyncmark()
+// CHECK-NEXT: call void @llvm.amdgcn.wait.asyncmark(i16 0)
+// CHECK-NEXT: ret void
+//
+void test_invocation() {
+ __builtin_amdgcn_asyncmark();
+ __builtin_amdgcn_wait_asyncmark(0);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl
index 62c8deb6e4a89..e7c81b000a8f0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl
@@ -19,10 +19,14 @@ typedef unsigned char u8;
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 4, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_global_load_lds_u32(global u32* src, local u32 *dst) {
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_global_load_async_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
}
// CHECK-LABEL: @test_global_load_lds_u16(
@@ -36,10 +40,14 @@ void test_global_load_lds_u32(global u32* src, local u32 *dst) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 2, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_global_load_lds_u16(global u16* src, local u16 *dst) {
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_global_load_async_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
}
// CHECK-LABEL: @test_global_load_lds_u8(
@@ -53,8 +61,12 @@ void test_global_load_lds_u16(global u16* src, local u16 *dst) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 1, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_global_load_lds_u8(global u8* src, local u8 *dst) {
__builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_global_load_async_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
index 6cdedb33bdd80..cc944204446ae 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -19,10 +19,14 @@ typedef unsigned char u8;
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.async.to.lds.p1(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 4, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_u32(global u32* src, local u32 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_load_async_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
}
// CHECK-LABEL: @test_load_to_lds_u16(
@@ -36,10 +40,14 @@ void test_load_to_lds_u32(global u32* src, local u32 *dst) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.async.to.lds.p1(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 2, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_u16(global u16* src, local u16 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_load_async_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
}
// CHECK-LABEL: @test_load_to_lds_u8(
@@ -53,8 +61,12 @@ void test_load_to_lds_u16(global u16* src, local u16 *dst) {
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.async.to.lds.p1(ptr addrspace(1) [[TMP2]], ptr addrspace(3) [[TMP3]], i32 1, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_load_to_lds_u8(global u8* src, local u8 *dst) {
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_load_async_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl
index 177165972b7a9..144cc7599bb5e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl
@@ -5,17 +5,19 @@
// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_lds(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 1, i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
+// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) [[RSRC]], ptr addrspace(3) [[LDS]], i32 1, i32 [[OFFSET]], i32 [[SOFFSET]], i32 2, i32 3)
// CHECK-NEXT: ret void
//
void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) {
__builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3);
+ __builtin_amdgcn_raw_ptr_buffer_load_async_lds(rsrc, lds, 1, offset, soffset, 2, 3);
}
// CHECK-LABEL: @test_amdgcn_struct_ptr_buffer_load_lds(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
+// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
// CHECK-NEXT: ret void
//
void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset) {
- __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3);
+ __builtin_amdgcn_struct_ptr_buffer_load_async_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3);
}
diff --git a/llvm/docs/AMDGPUAsyncOperations.rst b/llvm/docs/AMDGPUAsyncOperations.rst
new file mode 100644
index 0000000000000..c82c85f2ad194
--- /dev/null
+++ b/llvm/docs/AMDGPUAsyncOperations.rst
@@ -0,0 +1,236 @@
+===============================
+ AMDGPU Asynchronous Operations
+===============================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Asynchronous operations are memory transfers (usually between the global memory
+and LDS) that are completed independently at an unspecified scope. A thread that
+requests one or more asynchronous transfers can use *async marks* to track
+their completion. The thread waits for each mark to be *completed*, which
+indicates that requests initiated in program order before this mark have also
+completed.
+
+Operations
+==========
+
+Memory Accesses
+---------------
+
+LDS DMA Operations
+^^^^^^^^^^^^^^^^^^
+
+.. code-block:: llvm
+
+ ; "Legacy" LDS DMA operations
+ void @llvm.amdgcn.load.async.to.lds(ptr %src, ptr %dst)
+ void @llvm.amdgcn.global.load.async.lds(ptr %src, ptr %dst)
+ void @llvm.amdgcn.raw.buffer.load.async.lds(ptr %src, ptr %dst)
+ void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr %src, ptr %dst)
+ void @llvm.amdgcn.struct.buffer.load.async.lds(ptr %src, ptr %dst)
+ void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr %src, ptr %dst)
+
+Request an async operation that copies the specified number of bytes from the
+global/buffer pointer ``%src`` to the LDS pointer ``%dst``.
+
+.. note::
+
+ The above listing is *merely representative*. The actual function signatures
+ are identical to their non-async variants, and supported only on the
+ corresponding architectures (GFX9 and GFX10).
+
+Async Mark Operations
+---------------------
+
+An *async mark* in the abstract machine tracks all the async operations that
+are program ordered before that mark. A mark M is said to be *completed*
+only when all async operations program ordered before M are reported by the
+implementation as having finished, and it is said to be *outstanding* otherwise.
+
+Thus we have the following sufficient condition:
+
+ An async operation X is *completed* at a program point P if there exists a
+ mark M such that X is program ordered before M, M is program ordered before
+ P, and M is completed. X is said to be *outstanding* at P otherwise.
+
+The abstract machine maintains a sequence of *async marks* during the
+execution of a function body, which excludes any marks produced by calls to
+other functions encountered in the currently executing function.
+
+
+``@llvm.amdgcn.asyncmark()``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When executed, inserts an async mark in the sequence associated with the
+currently executing function body.
+
+``@llvm.amdgcn.wait.asyncmark(i16 %N)``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Waits until there are at most N outstanding marks in the sequence associated
+with the currently executing function body.
+
+Memory Consistency Model
+========================
+
+Each asynchronous operation consists of a non-atomic read on the source and a
+non-atomic write on the destination. Async "LDS DMA" intrinsics result in async
+accesses that guarantee visibility relative to other memory operations as
+follows:
+
+ An asynchronous operation ``A`` program ordered before an overlapping memory
+ operation ``X`` happens-before ``X`` only if ``A`` is completed before ``X``.
+
+ A memory operation ``X`` program ordered before an overlapping asynchronous
+ operation ``A`` happens-before ``A``.
+
+.. note::
+
+ The *only if* in the above wording implies that unlike the default LLVM
+ memory model, certain program order edges are not automatically included in
+ ``happens-before``.
+
+Examples
+========
+
+Uneven blocks of async transfers
+--------------------------------
+
+.. code-block:: c++
+
+ void foo(global int *g, local int *l) {
+ // first block
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ asyncmark();
+
+ // second block; longer
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ asyncmark();
+
+ // third block; shorter
+ async_load_to_lds(l, g);
+ async_load_to_lds(l, g);
+ asyncmark();
+
+ // Wait for first block
+ wait.asyncmark(2);
+ }
+
+Software pipeline
+-----------------
+
+.. code-block:: c++
+
+ void foo(global int *g, local int *l) {
+ // first block
+ asyncmark();
+
+ // second block
+ asyncmark();
+
+ // third block
+ asyncmark();
+
+ for (;;) {
+ wait.asyncmark(2);
+ // use data
+
+ // next block
+ asyncmark();
+ }
+
+ // flush one block
+ wait.asyncmark(2);
+
+ // flush one more block
+ wait.asyncmark(1);
+
+ // flush last block
+ wait.asyncmark(0);
+ }
+
+Ordinary function call
+----------------------
+
+.. code-block:: c++
+
+ extern void bar(); // may or may not make async calls
+
+ void foo(global int *g, local int *l) {
+ // first block
+ asyncmark();
+
+ // second block
+ asyncmark();
+
+ // function call
+ bar();
+
+ // third block
+ asyncmark();
+
+ wait.asyncmark(1); // will wait for at least the second block, possibly including bar()
+ wait.asyncmark(0); // will wait for third block, including bar()
+ }
+
+Implementation notes
+====================
+
+[This section is informational.]
+
+Optimization
+------------
+
+The implementation may eliminate async mark/wait intrinsics in the following cases:
+
+1. An ``asyncmark`` operation which is not included in the wait count of a later
+ wait operation in the current function. In particular, an ``asyncmark`` which
+ is not post-dominated by any ``wait.asyncmark``.
+2. A ``wait.asyncmark`` whose wait count is more than the outstanding async
+ marks at that point. In particular, a ``wait.asyncmark`` that is not
+ dominated by any ``asyncmark``.
+
+In general, at a function call, if the caller uses sufficient waits to track
+its own async operations, the actions performed by the callee cannot affect
+correctness. But inlining such a call may result in redundant waits.
+
+.. code-block:: c++
+
+ void foo() {
+ asyncmark(); // A
+ }
+
+ void bar() {
+ asyncmark(); // B
+ asyncmark(); // C
+ foo();
+ wait.asyncmark(1);
+ }
+
+Before inlining, the ``wait.asyncmark`` waits for mark B to be completed.
+
+.. code-block:: c++
+
+ void foo() {
+ }
+
+ void bar() {
+ asyncmark(); // B
+ asyncmark(); // C
+ asyncmark(); // A from call to foo()
+ wait.asyncmark(1);
+ }
+
+After inlining, the asyncmark-wait now waits for mark C to complete, which is
+longer than necessary. Ideally, the optimizer should have eliminated mark A in
+the body of foo() itself.
diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst
index 10d7fef904d2d..a712d4eb4c13e 100644
--- a/llvm/docs/UserGuides.rst
+++ b/llvm/docs/UserGuides.rst
@@ -18,6 +18,7 @@ intermediate LLVM representation.
AdvancedBuilds
AliasAnalysis
AMDGPUUsage
+ AMDGPUAsyncOperations
Benchmarking
BigEndianNEON
BuildingADistribution
@@ -287,6 +288,9 @@ Additional Topics
:doc:`AMDGPUUsage`
This document describes using the AMDGPU backend to compile GPU kernels.
+:doc:`AMDGPUAsyncOperations`
+ Builtins for invoking asynchronous data transfer operations in AMD GPUs.
+
:doc:`AMDGPUDwarfExtensionsForHeterogeneousDebugging`
This document describes DWARF extensions to support heterogeneous debugging
for targets such as the AMDGPU backend.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a8eba9ed126b7..3d3631b5d92e8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1998,9 +1998,9 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
[IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
+def int_amdgcn_raw_buffer_load_async_lds : AMDGPURawBufferLoadLDS;
class AMDGPURawPtrBufferLoadLDS :
- ClangBuiltin<"__builtin_amdgcn_raw_ptr_buffer_load_lds">,
Intrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
@@ -2021,7 +2021,11 @@ class AMDGPURawPtrBufferLoadLDS :
WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS;
+def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS,
+ ClangBuiltin<"__builtin_amdgcn_raw_ptr_buffer_load_lds">;
+def int_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPURawPtrBufferLoadLDS,
+ ClangBuiltin<"__builtin_amdgcn_raw_ptr_buffer_load_async_lds">;
+
class AMDGPUStructBufferLoadLDS : Intrinsic <
[],
@@ -2042,9 +2046,9 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
[IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
+def int_amdgcn_struct_buffer_load_async_lds : AMDGPUStructBufferLoadLDS;
class AMDGPUStructPtrBufferLoadLDS :
- ClangBuiltin<"__builtin_amdgcn_struct_ptr_buffer_load_lds">,
Intrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
@@ -2066,7 +2070,10 @@ class AMDGPUStructPtrBufferLoadLDS :
WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;
+def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS,
+ ClangBuiltin<"__builtin_amdgcn_struct_ptr_buffer_load_lds">;
+def int_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUStructPtrBufferLoadLDS,
+ ClangBuiltin<"__builtin_amdgcn_struct_ptr_buffer_load_async_lds">;
def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic <
[],
@@ -2794,10 +2801,9 @@ class AMDGPULoadToLDS :
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
+def int_amdgcn_load_async_to_lds : AMDGPULoadToLDS;
-class AMDGPUGlobalLoadLDS
- : ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
- Intrinsic<
+class AMDGPUGlobalLoadLDS : Intrinsic<
[],
[LLVMQualPointerType<1>, // Base global pointer to load from
LLVMQualPointerType<3>, // LDS base pointer to store to
@@ -2813,12 +2819,22 @@ class AMDGPUGlobalLoadLDS
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]>;
-def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
+def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS, ClangBuiltin<"__builtin_amdgcn_global_load_lds">;
+def int_amdgcn_global_load_async_lds : AMDGPUGlobalLoadLDS, ClangBuiltin<"__builtin_amdgcn_global_load_async_lds">;
// This is IntrHasSideEffects because it reads from a volatile hardware register.
def int_amdgcn_pops_exiting_wave_id :
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>;
+// Sets a marker in the stream of async requests. Modeled as IntrNoMem with IntrHasSideEffects.
+def int_amdgcn_asyncmark : ClangBuiltin<"__builtin_amdgcn_asyncmark">,
+ Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+
+// Waits until at most N previously inserted markers remain outstanding.
+def int_amdgcn_wait_asyncmark :
+ ClangBuiltin<"__builtin_amdgcn_wait_asyncmark">,
+ Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+
//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4eb192b3726b4..7d0c1feebba87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1502,7 +1502,7 @@ def FeaturePkAddMinMaxInsts : SubtargetFeature<"pk-add-min-max-insts",
"Has v_pk_add_{min|max}_{i|u}16 instructions"
>;
-def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
+def FeatureVMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
"HasVMemToLDSLoad",
"true",
"The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or global_load_lds. This does not include scratch_load_lds."
@@ -1613,7 +1613,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
- FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad,
+ FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad,
FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
FeatureCvtPkNormVOP3Insts
@@ -1640,7 +1640,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
- FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeatureCubeInsts,
+ FeatureVmemWriteVgprInOrder, FeatureVMemToLDSLoad, FeatureCubeInsts,
FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts,
FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts,
FeatureCvtPkNormVOP3Insts
@@ -2646,6 +2646,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">,
def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">,
AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>;
+def HasVMemToLDSLoad : Predicate<"Subtarget->hasVMemToLDSLoad()">,
+ AssemblerPredicate<(all_of FeatureVMemToLDSLoad)>;
+
def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">,
AssemblerPredicate<(all_of FeatureFlatGVSMode)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 15056a9a9cd65..626028597f7f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2366,17 +2366,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
return selectBufferLoadLds(I);
// Until we can store both the address space of the global and the LDS
// arguments by having tto MachineMemOperands on an intrinsic, we just trust
// that the argument is a global pointer (buffer pointers have been handled by
// a LLVM IR-level lowering).
case Intrinsic::amdgcn_load_to_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
return selectGlobalLoadLds(I);
+ case Intrinsic::amdgcn_asyncmark:
+ case Intrinsic::amdgcn_wait_asyncmark:
+ // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
+ if (!Subtarget->hasVMemToLDSLoad())
+ return false;
+ break;
case Intrinsic::amdgcn_exp_compr:
if (!STI.hasCompressedExport()) {
Function &F = I.getMF()->getFunction();
@@ -3420,11 +3432,25 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
return true;
}
+static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
+ switch (Intr) {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
+ return true;
+ }
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
if (!Subtarget->hasVMemToLDSLoad())
return false;
unsigned Opc;
unsigned Size = MI.getOperand(3).getImm();
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
// The struct intrinsic variants add one additional operand over raw.
const bool HasVIndex = MI.getNumOperands() == 9;
@@ -3514,6 +3540,7 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
? 1
: 0); // swz
+ MIB.addImm(isAsyncLDSDMA(IntrinsicID));
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
// Don't set the offset value here because the pointer points to the base of
@@ -3635,6 +3662,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
unsigned Opc;
unsigned Size = MI.getOperand(3).getImm();
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (Size) {
default:
@@ -3705,6 +3733,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
unsigned Aux = MI.getOperand(5).getImm();
MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
+ MIB.addImm(isAsyncLDSDMA(IntrinsicID));
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 99c1ab8d379d5..fc408aa30dd87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -347,7 +347,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
}
} else {
// We don't want these pseudo instructions encoded. They are
- // placeholder terminator instructions and should only be printed as
+ // placeholder instructions and should only be printed as
// comments.
if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
if (isVerbose())
@@ -361,6 +361,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::ASYNCMARK) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" asyncmark");
+ return;
+ }
+
+ if (MI->getOpcode() == AMDGPU::WAIT_ASYNCMARK) {
+ if (isVerbose()) {
+ OutStreamer->emitRawComment(" wait_asyncmark(" +
+ Twine(MI->getOperand(0).getImm()) + ")");
+ }
+ return;
+ }
+
if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) {
if (isVerbose()) {
std::string HexString;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7470fecd3c03f..eff7e185e43f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3296,7 +3296,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 1); // M0
return;
case Intrinsic::amdgcn_raw_buffer_load_lds:
- case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 1); // rsrc
constrainOpWithReadfirstlane(B, MI, 2); // M0
@@ -3304,7 +3306,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 1); // rsrc
constrainOpWithReadfirstlane(B, MI, 2); // M0
@@ -3320,7 +3324,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2);
return;
@@ -5439,7 +5445,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_raw_buffer_load_lds:
- case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
@@ -5472,7 +5480,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 7a91a40e18cde..6c6b048ede770 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -126,6 +126,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
ImmTySMEMOffsetMod,
ImmTyCPol,
ImmTyTFE,
+ ImmTyIsAsync,
ImmTyD16,
ImmTyClamp,
ImmTyOModSI,
@@ -1113,6 +1114,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
case ImmTyIndexKey16bit: OS << "index_key"; break;
case ImmTyIndexKey32bit: OS << "index_key"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyIsAsync: OS << "IsAsync"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
case ImmTyClamp: OS << "Clamp"; break;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index bb0e9380e956d..b8001a084cd15 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -417,13 +417,15 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> {
}
class getMUBUFInsDA<list<RegisterOperand> vdataList,
- list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
+ list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset,
+ bit isLds> {
RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret;
dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
- dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
+ dag IsAsyncOpnd = !if(isLds, (ins i1imm_0:$IsAsync), (ins));
+ dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz), IsAsyncOpnd);
dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
@@ -448,13 +450,13 @@ class getMUBUFElements<ValueType vt> {
);
}
-class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> {
+class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset, bit isLds = 0> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset, isLds>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset, isLds>.ret,
(ins))))));
}
@@ -499,7 +501,7 @@ class MUBUF_Load_Pseudo <string opName,
RegisterOperand vdata_op = getBUFVDataRegisterOperand<vdata_vt.Size, isTFE>.ret>
: MUBUF_Pseudo<opName,
!if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
- !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset>.ret,
+ !con(getMUBUFIns<addrKind, [], isTFE, hasRestrictedSOffset, isLds>.ret,
!if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
getMUBUFAsmOps<addrKind, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
pattern>,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c6937742e0a42..8f0604b49f9c2 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -390,13 +390,14 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
// Async loads, introduced in gfx1250, will store directly
// to a DS address in vdst (they will not use M0 for DS addess).
-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> : FLAT_Pseudo<
opName,
(outs ),
!con(
!if(IsAsync, (ins VGPR_32:$vdst), (ins)),
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)),
- (ins flat_offset:$offset, CPol_0:$cpol)),
+ (ins flat_offset:$offset, CPol_0:$cpol),
+ !if(IsLegacyLDSDMA, (ins i1imm_0:$IsAsync), (ins))),
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
let LGKM_CNT = !not(IsAsync);
let VM_CNT = !not(IsAsync);
@@ -416,10 +417,10 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy
let SchedRW = [WriteVMEM, WriteLDS];
}
-multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
- def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0, bit IsLegacyLDSDMA = 0> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync, IsLegacyLDSDMA>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync, IsLegacyLDSDMA>,
GlobalSaddrTable<1, opName>;
}
@@ -1212,15 +1213,15 @@ let SubtargetPredicate = HasGFX10_BEncoding in {
VGPROp_32, i32>;
}
-defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">;
-defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">;
-defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort">;
-defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
-defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
+defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte", 0, 1>;
+defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte", 0, 1>;
+defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ushort", 0, 1>;
+defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort", 0, 1>;
+defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword", 0, 1>;
let SubtargetPredicate = HasGFX950Insts in {
-defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
-defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
+defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3", 0, 1>;
+defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4", 0, 1>;
}
let SubtargetPredicate = isGFX12PlusNot12_50 in
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d7d7381475b7..737db7e285009 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1425,9 +1425,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
break;
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(1);
@@ -1627,7 +1631,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
@@ -1733,7 +1739,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
@@ -11150,6 +11158,19 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
return VData;
}
+static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
+ switch (Intr) {
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_async_lds:
+ return true;
+ }
+ return false;
+}
+
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -11346,15 +11367,21 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_buffer_load_async_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
- case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_async_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
unsigned Opc;
bool HasVIndex =
IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
- IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
+ IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
+ IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
unsigned OpOffset = HasVIndex ? 1 : 0;
SDValue VOffset = Op.getOperand(5 + OpOffset);
bool HasVOffset = !isNullConstant(VOffset);
@@ -11426,6 +11453,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
? 1
: 0,
DL, MVT::i8)); // swz
+ Ops.push_back(
+ DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
@@ -11460,7 +11489,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
// for "trust me" that the remaining cases are global pointers until
// such time as we can put two mem operands on an intrinsic.
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
if (!Subtarget->hasVMemToLDSLoad())
return SDValue();
@@ -11529,6 +11560,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Aux = Op.getConstantOperandVal(6);
Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
MVT::i32)); // CPol
+ Ops.push_back(
+ DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0a20b03819c13..f755466045186 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -626,6 +626,24 @@ class SIInsertWaitcnts {
std::optional<WaitEventType>
getExpertSchedulingEventType(const MachineInstr &Inst) const;
+ bool isAsync(const MachineInstr &MI) const {
+ if (!SIInstrInfo::isLDSDMA(MI))
+ return false;
+ if (SIInstrInfo::usesASYNC_CNT(MI))
+ return true;
+ const MachineOperand *Async =
+ TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync);
+ return Async && (Async->getImm());
+ }
+
+ bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const {
+ return SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI);
+ }
+
+ bool isAsyncLdsDmaWrite(const MachineInstr &MI) const {
+ return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
+ }
+
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -733,11 +751,13 @@ class WaitcntBrackets {
AMDGPU::Waitcnt &Wait) const;
void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
AMDGPU::Waitcnt &Wait) const;
+ AMDGPU::Waitcnt determineAsyncWait(unsigned N);
void tryClearSCCWriteEvent(MachineInstr *Inst);
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
void updateByEvent(WaitEventType E, MachineInstr &MI);
+ void recordAsyncMark(MachineInstr &MI);
unsigned hasPendingEvent() const { return PendingEvents; }
unsigned hasPendingEvent(WaitEventType E) const {
@@ -828,11 +848,15 @@ class WaitcntBrackets {
unsigned OtherShift;
};
+ using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
+
void determineWaitForScore(InstCounterType T, unsigned Score,
AMDGPU::Waitcnt &Wait) const;
static bool mergeScore(const MergeInfo &M, unsigned &Score,
unsigned OtherScore);
+ bool mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
+ ArrayRef<CounterValueArray> OtherMarks);
iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
@@ -909,8 +933,8 @@ class WaitcntBrackets {
// TODO: Could we track SCC alongside SGPRs so it's not longer a special case?
struct VMEMInfo {
- // Scores for all instruction counters.
- std::array<unsigned, NUM_INST_CNTS> Scores = {0};
+ // Scores for all instruction counters. Zero-initialized.
+ CounterValueArray Scores{};
// Bitmask of the VmemTypes of VMEM instructions for this VGPR.
unsigned VMEMTypes = 0;
@@ -938,6 +962,14 @@ class WaitcntBrackets {
// Store representative LDS DMA operations. The only useful info here is
// alias info. One store is kept per unique AAInfo.
SmallVector<const MachineInstr *> LDSDMAStores;
+
+ // State of all counters at each async mark encountered so far.
+ SmallVector<CounterValueArray> AsyncMarks;
+ static constexpr unsigned MaxAsyncMarks = 16;
+
+ // Track the upper bound score for async operations that are not part of a
+ // mark yet. Initialized to all zeros.
+ CounterValueArray AsyncScore{};
};
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
@@ -1139,7 +1171,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
setScoreByOperand(Op, T, CurrScore);
}
if (Inst.mayStore() &&
- (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+ (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
// written can be accessed. A load from LDS to VMEM does not need a wait.
//
@@ -1183,6 +1215,14 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
}
+ // FIXME: Not supported on GFX12 yet. Newer async operations use other
+ // counters too, so will need a map from instruction or event types to
+ // counter types.
+ if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) {
+ assert(!SIInstrInfo::usesASYNC_CNT(Inst));
+ AsyncScore[T] = CurrScore;
+ }
+
if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
setRegScore(AMDGPU::SCC, T, CurrScore);
PendingSCCWrite = &Inst;
@@ -1190,13 +1230,28 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
}
+void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
+ // In the absence of loops, AsyncMarks can grow linearly with the program
+ // until we encounter a WAIT_ASYNCMARK. We could drop the oldest mark above a
+ // limit every time we push a new mark, but that seems like unnecessary work
+ // in practical cases. We do separately truncate the array when processing a
+ // loop, which should be sufficient.
+ AsyncMarks.push_back(AsyncScore);
+ AsyncScore = {};
+ LLVM_DEBUG({
+ dbgs() << "recordAsyncMark:\n" << Inst;
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+}
+
void WaitcntBrackets::print(raw_ostream &OS) const {
const GCNSubtarget *ST = Context->ST;
- OS << '\n';
for (auto T : inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
-
switch (T) {
case LOAD_CNT:
OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
@@ -1289,6 +1344,53 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
}
OS << '\n';
+ OS << "Async score: ";
+ if (AsyncScore.empty())
+ OS << "none";
+ else
+ llvm::interleaveComma(AsyncScore, OS);
+ OS << '\n';
+
+ OS << "Async marks: " << AsyncMarks.size() << '\n';
+
+ for (const auto &Mark : AsyncMarks) {
+ for (auto T : inst_counter_types()) {
+ unsigned MarkedScore = Mark[T];
+ switch (T) {
+ case LOAD_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM")
+ << "_CNT: " << MarkedScore;
+ break;
+ case DS_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM")
+ << "_CNT: " << MarkedScore;
+ break;
+ case EXP_CNT:
+ OS << " EXP_CNT: " << MarkedScore;
+ break;
+ case STORE_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS")
+ << "_CNT: " << MarkedScore;
+ break;
+ case SAMPLE_CNT:
+ OS << " SAMPLE_CNT: " << MarkedScore;
+ break;
+ case BVH_CNT:
+ OS << " BVH_CNT: " << MarkedScore;
+ break;
+ case KM_CNT:
+ OS << " KM_CNT: " << MarkedScore;
+ break;
+ case X_CNT:
+ OS << " X_CNT: " << MarkedScore;
+ break;
+ default:
+ OS << " UNKNOWN: " << MarkedScore;
+ break;
+ }
+ }
+ OS << '\n';
+ }
OS << '\n';
}
@@ -1369,6 +1471,49 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T,
}
}
+AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) {
+ LLVM_DEBUG({
+ dbgs() << "Need " << N << " async marks. Found " << AsyncMarks.size()
+ << ":\n";
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ AMDGPU::Waitcnt Wait;
+ if (AsyncMarks.size() == MaxAsyncMarks) {
+ // Enforcing MaxAsyncMarks here is unnecessary work because the size of
+ // AsyncMarks is linear when traversing straight-line code. But we do
+ // need to check if truncation may have occurred at a merge, and adjust N
+ // to ensure that a wait is generated.
+ LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n");
+ N = std::min(N, (unsigned)MaxAsyncMarks - 1);
+ }
+
+ if (AsyncMarks.size() <= N) {
+ LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n");
+ return Wait;
+ }
+
+ size_t MarkIndex = AsyncMarks.size() - N - 1;
+ const auto &RequiredMark = AsyncMarks[MarkIndex];
+ for (InstCounterType T : inst_counter_types())
+ determineWaitForScore(T, RequiredMark[T], Wait);
+
+ // Immediately remove the waited mark and all older ones. This happens
+ // BEFORE the wait is actually inserted, which is fine because we've
+ // already extracted the wait requirements.
+ LLVM_DEBUG({
+ dbgs() << "Removing " << (MarkIndex + 1)
+ << " async marks after determining wait\n";
+ });
+ AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
+
+ LLVM_DEBUG(dbgs() << "Waits to add: " << Wait);
+ return Wait;
+}
+
void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
AMDGPU::Waitcnt &Wait) const {
if (Reg == AMDGPU::SCC) {
@@ -1618,6 +1763,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
// possibility in an articial MIR test since such a situation cannot be
// recreated by running the memory legalizer.
II.eraseFromParent();
+ } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
+ unsigned N = II.getOperand(0).getImm();
+ LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';);
+ AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N);
+ Wait = Wait.combined(OldWait);
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1875,6 +2025,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
// LDS, so no work required here yet.
II.eraseFromParent();
continue;
+ } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
+ reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet");
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2202,6 +2354,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
+ LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
setForceEmitWaitcnt();
assert(!MI.isMetaInstruction());
@@ -2763,6 +2916,84 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
return OtherShifted > MyShifted;
}
+bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
+ ArrayRef<CounterValueArray> OtherMarks) {
+ bool StrictDom = false;
+
+ LLVM_DEBUG(dbgs() << "Merging async marks ...");
+ // Early exit: both empty
+ if (AsyncMarks.empty() && OtherMarks.empty()) {
+ LLVM_DEBUG(dbgs() << " nothing to merge\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // Determine maximum length needed after merging
+ auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size());
+
+ // For each backedge in isolation, the algorithm reaches a fixed point after
+ // the first call to merge(). This is unchanged even with the AsyncMarks
+ // array because we call mergeScore just like the other cases.
+ //
+ // But in the rare pathological case, a nest of loops that pushes marks
+ // without waiting on any mark can cause AsyncMarks to grow very large. We cap
+ // it to a reasonable limit. We can tune this later or potentially introduce a
+ // user option to control the value.
+ MaxSize = std::min(MaxSize, MaxAsyncMarks);
+
+ // Keep only the most recent marks within our limit.
+ if (AsyncMarks.size() > MaxSize)
+ AsyncMarks.erase(AsyncMarks.begin(),
+ AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
+
+ // Pad with zero-filled marks if our list is shorter. Zero represents "no
+ // pending async operations at this checkpoint" and acts as the identity
+ // element for max() during merging. We pad at the beginning since the marks
+ // need to be aligned in most-recent order.
+ CounterValueArray ZeroMark{};
+ AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
+
+ LLVM_DEBUG({
+ dbgs() << "Before merge:\n";
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ LLVM_DEBUG({
+ dbgs() << "Other marks:\n";
+ for (const auto &Mark : OtherMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ // Merge element-wise using the existing mergeScore function and the
+ // appropriate MergeInfo for each counter type. Iterate only while we have
+ // elements in both vectors.
+ unsigned OtherSize = OtherMarks.size();
+ unsigned OurSize = AsyncMarks.size();
+ unsigned MergeCount = std::min(OtherSize, OurSize);
+ assert(OurSize == MaxSize);
+ for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) {
+ for (auto T : inst_counter_types(Context->MaxCounter)) {
+ StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
+ OtherMarks[OtherSize - Idx][T]);
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "After merge:\n";
+ for (const auto &Mark : AsyncMarks) {
+ llvm::interleaveComma(Mark, dbgs());
+ dbgs() << '\n';
+ }
+ });
+
+ return StrictDom;
+}
+
/// Merge the pending events and associater score brackets of \p Other into
/// this brackets status.
///
@@ -2778,6 +3009,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (auto K : Other.SGPRs.keys())
SGPRs.try_emplace(K);
+ // Array to store MergeInfo for each counter type
+ MergeInfo MergeInfos[NUM_INST_CNTS];
+
for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
@@ -2794,7 +3028,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (NewUB < ScoreLBs[T])
report_fatal_error("waitcnt score overflow");
- MergeInfo M;
+ MergeInfo &M = MergeInfos[T];
M.OldLB = ScoreLBs[T];
M.OtherLB = Other.ScoreLBs[T];
M.MyShift = NewUB - ScoreUBs[T];
@@ -2841,6 +3075,10 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
}
+ StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks);
+ for (auto T : inst_counter_types(Context->MaxCounter))
+ StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]);
+
purgeEmptyTrackingData();
return StrictDom;
}
@@ -2853,6 +3091,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
Opcode == AMDGPU::S_WAITCNT_lds_direct ||
+ Opcode == AMDGPU::WAIT_ASYNCMARK ||
counterTypeForInstr(Opcode).has_value();
}
@@ -2918,6 +3157,14 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (Block.getFirstTerminator() == Inst)
FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
+ if (Inst.getOpcode() == AMDGPU::ASYNCMARK) {
+ // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
+ assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
+ ScoreBrackets.recordAsyncMark(Inst);
+ ++Iter;
+ continue;
+ }
+
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
FlushFlags);
@@ -3360,7 +3607,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (!SuccBI.Incoming) {
SuccBI.Dirty = true;
if (SuccBII <= BII) {
- LLVM_DEBUG(dbgs() << "repeat on backedge\n");
+ LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n");
Repeat = true;
}
if (!MoveBracketsToSucc) {
@@ -3368,11 +3615,20 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
} else {
SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
}
- } else if (SuccBI.Incoming->merge(*Brackets)) {
- SuccBI.Dirty = true;
- if (SuccBII <= BII) {
- LLVM_DEBUG(dbgs() << "repeat on backedge\n");
- Repeat = true;
+ } else {
+ LLVM_DEBUG({
+ dbgs() << "Try to merge ";
+ MBB->printName(dbgs());
+ dbgs() << " into ";
+ Succ->printName(dbgs());
+ dbgs() << '\n';
+ });
+ if (SuccBI.Incoming->merge(*Brackets)) {
+ SuccBI.Dirty = true;
+ if (SuccBII <= BII) {
+ LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n");
+ Repeat = true;
+ }
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index bf51b22274f3d..0c834ef730e82 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1015,6 +1015,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
}
+ static bool usesASYNC_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::ASYNC_CNT;
+ }
+
static bool usesLGKM_CNT(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2ec3ec8674488..6ace994137111 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1261,6 +1261,8 @@ def CPol_NonGLC : ValuePredicatedOperand<CPol, "!(Op.getImm() & CPol::GLC)", 1>;
def CPol_GLC_WithDefault : DefaultOperand<CPol_GLC, !shl(1, CPolBit.GLC)>;
def CPol_NonGLC_WithDefault : DefaultOperand<CPol_NonGLC, 0>;
+def IsAsync : NamedBitOperand<"isasync">;
+
def TFE : NamedBitOperand<"tfe">;
def UNorm : NamedBitOperand<"unorm">;
def DA : NamedBitOperand<"da">;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 99b352bdf6765..a49895421c754 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1688,11 +1688,21 @@ let SubtargetPredicate = HasWaitXcnt in {
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
-
def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> {
let hasSideEffects = 0;
}
+let SubtargetPredicate = HasVMemToLDSLoad in {
+def ASYNCMARK : SPseudoInstSI<(outs), (ins),
+ [(int_amdgcn_asyncmark)]> {
+ let maybeAtomic = 0;
+}
+def WAIT_ASYNCMARK : SOPP_Pseudo <"", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_wait_asyncmark timm:$simm16)]> {
+ let maybeAtomic = 0;
+}
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> {
diff --git a/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll b/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
new file mode 100644
index 0000000000000..a0bbb9b5d0537
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
@@ -0,0 +1,58 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+
+define float @raw.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; CHECK-LABEL: raw.buffer.load:
+; CHECK: s_waitcnt vmcnt(2)
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+define float @raw.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; CHECK-LABEL: raw.ptr.buffer.load:
+; CHECK: s_waitcnt vmcnt(2)
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2)
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+define float @struct.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; CHECK-LABEL: struct.buffer.load:
+; CHECK: s_waitcnt vmcnt(2)
+main_body:
+ call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
+
+define float @struct.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; CHECK-LABEL: struct.ptr.buffer.load:
+; CHECK: s_waitcnt vmcnt(2)
+main_body:
+ call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2)
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %res = load float, ptr addrspace(3) %lds
+ ret float %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-err.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-err.ll
new file mode 100644
index 0000000000000..f929cb3e380b7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-err.ll
@@ -0,0 +1,19 @@
+; RUN: split-file %s %t
+; RUN: not --crash llc -filetype=null -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 %t/mark.ll 2>&1 | FileCheck --ignore-case %s
+; RUN: not llc -filetype=null -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 %t/mark.ll 2>&1 | FileCheck --ignore-case %s
+; RUN: not --crash llc -filetype=null -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 %t/wait.ll 2>&1 | FileCheck --ignore-case %s
+; RUN: not llc -filetype=null -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 %t/wait.ll 2>&1 | FileCheck --ignore-case %s
+
+; CHECK: LLVM ERROR: Cannot select
+
+;--- mark.ll
+define void @async_err() {
+ call void @llvm.amdgcn.asyncmark()
+ ret void
+}
+
+;--- wait.ll
+define void @async_err() {
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll
new file mode 100644
index 0000000000000..1bd1b13f44340
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll
@@ -0,0 +1,285 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s
+
+; Loop body exceeds MaxAsyncMarks(=16) on first iteration
+; Preloop: 5 marks
+; Loop body: 18 marks
+
+; CHECK-LABEL: test_loop_exceeds_max_first_iteration:
+; CHECK: ; wait_asyncmark(3)
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+
+define void @test_loop_exceeds_max_first_iteration(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
+entry:
+ ; Preloop: 5 async LDS DMA operations
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %loop_header
+
+loop_header:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop_body ]
+ %i.next = add i32 %i, 1
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %loop_body, label %exit
+
+loop_body:
+ ; Loop body with 18 async operations
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %loop_header
+
+exit:
+ call void @llvm.amdgcn.wait.asyncmark(i16 3)
+ %lds_val = load i32, ptr addrspace(3) %lds
+ store i32 %lds_val, ptr addrspace(1) %out
+ ret void
+}
+
+; Loop body does not exceed MaxAsyncMarks(=16) on first iteration
+; Preloop: 5 marks
+; Loop body: 5 marks
+
+; CHECK-LABEL: test_loop_needs_more_iterations:
+; CHECK: ; wait_asyncmark(3)
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+
+define void @test_loop_needs_more_iterations(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
+entry:
+ ; Preloop: 5 async LDS DMA operations
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %loop_header
+
+loop_header:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop_body ]
+ %i.next = add i32 %i, 1
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %loop_body, label %exit
+
+loop_body:
+ ; Loop body with 5 async operations
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %loop_header
+
+exit:
+ call void @llvm.amdgcn.wait.asyncmark(i16 3)
+ %lds_val = load i32, ptr addrspace(3) %lds
+ store i32 %lds_val, ptr addrspace(1) %out
+ ret void
+}
+
+; Merge exceeds MaxAsyncMarks(=16)
+
+; CHECK-LABEL: max_when_merged:
+; CHECK: ; wait_asyncmark(17)
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+
+define void @max_when_merged(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
+entry:
+ %cmp = icmp slt i32 0, %n
+ br i1 %cmp, label %then, label %else
+
+then:
+ ; 5 async operations
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %endif
+
+else:
+ ; 18 async operations
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %endif
+
+endif:
+ call void @llvm.amdgcn.wait.asyncmark(i16 17)
+ %lds_val = load i32, ptr addrspace(3) %lds
+ store i32 %lds_val, ptr addrspace(1) %out
+ ret void
+}
+
+; Straightline code exceeds MaxAsyncMarks(=16); the cap applies only when
+; merging brackets, so all 17 marks remain precise here (vmcnt(17))
+
+; CHECK-LABEL: no_max_in_straightline:
+; CHECK: ; wait_asyncmark(17)
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+
+define void @no_max_in_straightline(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
+ ; 18 async operations
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.wait.asyncmark(i16 17)
+ %lds_val = load i32, ptr addrspace(3) %lds
+ store i32 %lds_val, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
new file mode 100644
index 0000000000000..40a227fc850bd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
@@ -0,0 +1,751 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX900
+; RUN: llc -march=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942
+; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX1010
+
+; Demonstrate that wait.asyncmark is a code motion barrier for loads from LDS.
+; This is the simplest demo possible. We don't actually use async ops, but just
+; a pair of adjacent LDS loads. In the absence of the async mark, these get
+; coalesced into a wider LDS load.
+
+define void @code_barrier(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(3) %out) {
+; GFX900-LABEL: code_barrier:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ds_read_b32 v0, v2
+; GFX900-NEXT: ; wait_asyncmark(0)
+; GFX900-NEXT: ds_read_b32 v1, v2 offset:4
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX900-NEXT: ds_write_b32 v3, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: code_barrier:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ds_read_b32 v0, v2
+; GFX942-NEXT: ; wait_asyncmark(0)
+; GFX942-NEXT: ds_read_b32 v1, v2 offset:4
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX942-NEXT: ds_write_b32 v3, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-LABEL: code_barrier:
+; GFX1010: ; %bb.0:
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: ds_read_b32 v0, v2
+; GFX1010-NEXT: ; wait_asyncmark(0)
+; GFX1010-NEXT: ds_read_b32 v1, v2 offset:4
+; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX1010-NEXT: ds_write_b32 v3, v0
+; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+ %lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1
+ %val1 = load i32, ptr addrspace(3) %lds
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ %val2 = load i32, ptr addrspace(3) %lds_gep1
+ %sum = add i32 %val1, %val2
+ store i32 %sum, ptr addrspace(3) %out
+ ret void
+}
+
+; Test async mark/wait with global_load_lds and global loads
+; This version uses wave barriers to enforce program order so that unrelated vmem
+; instructions do not get reordered before reaching this point.
+
+define void @interleaved_global_and_dma(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
+; GFX900-LABEL: interleaved_global_and_dma:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_readfirstlane_b32 s4, v2
+; GFX900-NEXT: global_load_dword v7, v[3:4], off
+; GFX900-NEXT: global_load_dword v8, v[0:1], off
+; GFX900-NEXT: s_mov_b32 m0, s4
+; GFX900-NEXT: ; wave barrier
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dword v[3:4], off lds
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: global_load_dword v0, v[0:1], off
+; GFX900-NEXT: ; wave barrier
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dword v[3:4], off lds
+; GFX900-NEXT: ; wave barrier
+; GFX900-NEXT: global_load_dword v1, v[3:4], off
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: ; wait_asyncmark(1)
+; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: ds_read_b32 v3, v2
+; GFX900-NEXT: ; wait_asyncmark(0)
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: ds_read_b32 v2, v2
+; GFX900-NEXT: v_add_u32_e32 v4, v8, v7
+; GFX900-NEXT: s_waitcnt lgkmcnt(1)
+; GFX900-NEXT: v_add3_u32 v0, v4, v3, v0
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX900-NEXT: global_store_dword v[5:6], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: interleaved_global_and_dma:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-NEXT: v_mov_b32_e32 v8, v3
+; GFX942-NEXT: v_readfirstlane_b32 s0, v2
+; GFX942-NEXT: global_load_dword v3, v[8:9], off
+; GFX942-NEXT: global_load_dword v4, v[0:1], off
+; GFX942-NEXT: s_mov_b32 m0, s0
+; GFX942-NEXT: ; wave barrier
+; GFX942-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-NEXT: global_load_lds_dword v[8:9], off
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: global_load_dword v0, v[0:1], off
+; GFX942-NEXT: ; wave barrier
+; GFX942-NEXT: v_mov_b32_e32 v6, v5
+; GFX942-NEXT: global_load_lds_dword v[8:9], off
+; GFX942-NEXT: ; wave barrier
+; GFX942-NEXT: global_load_dword v1, v[8:9], off
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: ; wait_asyncmark(1)
+; GFX942-NEXT: s_waitcnt vmcnt(3)
+; GFX942-NEXT: ds_read_b32 v5, v2
+; GFX942-NEXT: ; wait_asyncmark(0)
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: ds_read_b32 v2, v2
+; GFX942-NEXT: v_add_u32_e32 v3, v4, v3
+; GFX942-NEXT: s_waitcnt lgkmcnt(1)
+; GFX942-NEXT: v_add3_u32 v0, v3, v5, v0
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX942-NEXT: global_store_dword v[6:7], v0, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-LABEL: interleaved_global_and_dma:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-NEXT: global_load_dword v7, v[3:4], off
+; GFX1010-NEXT: global_load_dword v8, v[0:1], off
+; GFX1010-NEXT: ; wave barrier
+; GFX1010-NEXT: s_mov_b32 m0, s4
+; GFX1010-NEXT: global_load_dword v[3:4], off lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: global_load_dword v0, v[0:1], off
+; GFX1010-NEXT: ; wave barrier
+; GFX1010-NEXT: global_load_dword v[3:4], off lds
+; GFX1010-NEXT: ; wave barrier
+; GFX1010-NEXT: global_load_dword v1, v[3:4], off
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: ; wait_asyncmark(1)
+; GFX1010-NEXT: s_waitcnt vmcnt(3)
+; GFX1010-NEXT: ds_read_b32 v3, v2
+; GFX1010-NEXT: ; wait_asyncmark(0)
+; GFX1010-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-NEXT: ds_read_b32 v2, v2
+; GFX1010-NEXT: v_add_nc_u32_e32 v4, v8, v7
+; GFX1010-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1010-NEXT: v_add3_u32 v0, v4, v3, v0
+; GFX1010-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX1010-NEXT: global_store_dword v[5:6], v0, off
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+entry:
+ ; First batch: global load, global load, async global-to-LDS
+ %bar_v11 = load i32, ptr addrspace(1) %bar
+ %foo_v1 = load i32, ptr addrspace(1) %foo
+ call void @llvm.amdgcn.wave.barrier()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %bar, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ ; Second batch: global load, async global-to-LDS, global load
+ %foo_v2 = load i32, ptr addrspace(1) %foo
+ call void @llvm.amdgcn.wave.barrier()
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %bar, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.wave.barrier()
+ %bar_v12 = load i32, ptr addrspace(1) %bar
+ call void @llvm.amdgcn.asyncmark()
+
+ ; Wait for first async mark and read from LDS
+ ; This results in vmcnt(3) corresponding to the second batch.
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %lds_val21 = load i32, ptr addrspace(3) %lds
+
+ ; Wait for the next lds dma
+ ; This results in vmcnt(1), corresponding to %bar_v12. Could have been combined with the lgkmcnt(1) for %lds_val21.
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ %lds_val22 = load i32, ptr addrspace(3) %lds
+ %sum1 = add i32 %foo_v1, %bar_v11
+ %sum2 = add i32 %sum1, %lds_val21
+ %sum3 = add i32 %sum2, %foo_v2
+ ; Finally a vmcnt(0) for %bar_v12, which was not included in the async mark that followed it.
+ %sum4 = add i32 %sum3, %bar_v12
+ %sum5 = add i32 %sum4, %lds_val22
+ store i32 %sum5, ptr addrspace(1) %out
+
+ ret void
+}
+
+define void @interleaved_buffer_and_dma(ptr addrspace(8) inreg %buf, ptr addrspace(1) %foo, ptr addrspace(3) inreg %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
+; GFX900-LABEL: interleaved_buffer_and_dma:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 m0, s20
+; GFX900-NEXT: global_load_dword v6, v[2:3], off
+; GFX900-NEXT: global_load_dword v7, v[0:1], off
+; GFX900-NEXT: v_mov_b32_e32 v8, 0x54
+; GFX900-NEXT: ; wave barrier
+; GFX900-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: global_load_dword v0, v[0:1], off
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x58
+; GFX900-NEXT: ; wave barrier
+; GFX900-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
+; GFX900-NEXT: ; wave barrier
+; GFX900-NEXT: global_load_dword v1, v[2:3], off
+; GFX900-NEXT: v_mov_b32_e32 v2, s20
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: ; wait_asyncmark(1)
+; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: ds_read_b32 v3, v2
+; GFX900-NEXT: ; wait_asyncmark(0)
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: ds_read_b32 v2, v2
+; GFX900-NEXT: v_add_u32_e32 v6, v7, v6
+; GFX900-NEXT: s_waitcnt lgkmcnt(1)
+; GFX900-NEXT: v_add3_u32 v0, v6, v3, v0
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX900-NEXT: global_store_dword v[4:5], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: interleaved_buffer_and_dma:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_mov_b32 m0, s16
+; GFX942-NEXT: global_load_dword v6, v[2:3], off
+; GFX942-NEXT: global_load_dword v7, v[0:1], off
+; GFX942-NEXT: v_mov_b32_e32 v8, 0x54
+; GFX942-NEXT: ; wave barrier
+; GFX942-NEXT: buffer_load_dword v8, s[0:3], 0 offen lds
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: global_load_dword v0, v[0:1], off
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x58
+; GFX942-NEXT: ; wave barrier
+; GFX942-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
+; GFX942-NEXT: ; wave barrier
+; GFX942-NEXT: global_load_dword v1, v[2:3], off
+; GFX942-NEXT: v_mov_b32_e32 v2, s16
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: ; wait_asyncmark(1)
+; GFX942-NEXT: s_waitcnt vmcnt(3)
+; GFX942-NEXT: ds_read_b32 v3, v2
+; GFX942-NEXT: ; wait_asyncmark(0)
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: ds_read_b32 v2, v2
+; GFX942-NEXT: v_add_u32_e32 v6, v7, v6
+; GFX942-NEXT: s_waitcnt lgkmcnt(1)
+; GFX942-NEXT: v_add3_u32 v0, v6, v3, v0
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX942-NEXT: global_store_dword v[4:5], v0, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-LABEL: interleaved_buffer_and_dma:
+; GFX1010: ; %bb.0: ; %entry
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_mov_b32_e32 v6, 0x54
+; GFX1010-NEXT: global_load_dword v7, v[2:3], off
+; GFX1010-NEXT: global_load_dword v8, v[0:1], off
+; GFX1010-NEXT: s_mov_b32 m0, s20
+; GFX1010-NEXT: ; wave barrier
+; GFX1010-NEXT: buffer_load_dword v6, s[16:19], 0 offen lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: v_mov_b32_e32 v6, 0x58
+; GFX1010-NEXT: global_load_dword v0, v[0:1], off
+; GFX1010-NEXT: ; wave barrier
+; GFX1010-NEXT: buffer_load_dword v6, s[16:19], 0 offen lds
+; GFX1010-NEXT: ; wave barrier
+; GFX1010-NEXT: global_load_dword v1, v[2:3], off
+; GFX1010-NEXT: v_mov_b32_e32 v2, s20
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: ; wait_asyncmark(1)
+; GFX1010-NEXT: s_waitcnt vmcnt(3)
+; GFX1010-NEXT: ds_read_b32 v3, v2
+; GFX1010-NEXT: ; wait_asyncmark(0)
+; GFX1010-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-NEXT: ds_read_b32 v2, v2
+; GFX1010-NEXT: v_add_nc_u32_e32 v6, v8, v7
+; GFX1010-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1010-NEXT: v_add3_u32 v0, v6, v3, v0
+; GFX1010-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_add3_u32 v0, v0, v1, v2
+; GFX1010-NEXT: global_store_dword v[4:5], v0, off
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+entry:
+  ; First batch: two plain global loads, then an async buffer-to-LDS load, closed by an async mark.
+ %bar_v11 = load i32, ptr addrspace(1) %bar
+ %foo_v1 = load i32, ptr addrspace(1) %foo
+ call void @llvm.amdgcn.wave.barrier()
+ call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 84, i32 0, i32 0, i32 0) ; offset 84 = 0x54
+ call void @llvm.amdgcn.asyncmark()
+
+  ; Second batch: global load, async buffer-to-LDS load, global load, closed by an async mark.
+ %foo_v2 = load i32, ptr addrspace(1) %foo
+ call void @llvm.amdgcn.wave.barrier()
+ call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 88, i32 0, i32 0, i32 0) ; offset 88 = 0x58
+ call void @llvm.amdgcn.wave.barrier()
+ %bar_v12 = load i32, ptr addrspace(1) %bar
+ call void @llvm.amdgcn.asyncmark()
+
+  ; Wait for first async mark and read from LDS.
+  ; This results in vmcnt(3) corresponding to the second batch.
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %lds_val21 = load i32, ptr addrspace(3) %lds
+
+  ; Wait for the next lds dma.
+  ; This results in vmcnt(1) because the last global load is not async.
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ %lds_val22 = load i32, ptr addrspace(3) %lds
+ %sum1 = add i32 %foo_v1, %bar_v11
+ %sum2 = add i32 %sum1, %lds_val21
+ %sum3 = add i32 %sum2, %foo_v2
+ %sum4 = add i32 %sum3, %bar_v12
+ %sum5 = add i32 %sum4, %lds_val22
+ store i32 %sum5, ptr addrspace(1) %out
+
+ ret void
+}
+
+; A perfect loop that is unlikely to exist in real life. It uses only async LDS
+; DMA operations, and results in vmcnt waits that exactly match the stream of
+; those outstanding operations.
+
+define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
+; GFX900-LABEL: test_pipelined_loop:
+; GFX900: ; %bb.0: ; %prolog
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_readfirstlane_b32 s4, v2
+; GFX900-NEXT: s_mov_b32 m0, s4
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: global_load_dword v[0:1], off lds
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: global_load_dword v[0:1], off lds
+; GFX900-NEXT: s_mov_b32 s6, 2
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: .LBB3_1: ; %loop_body
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: v_readfirstlane_b32 s7, v2
+; GFX900-NEXT: s_mov_b32 m0, s7
+; GFX900-NEXT: s_add_i32 s6, s6, 1
+; GFX900-NEXT: global_load_dword v[0:1], off lds
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: ; wait_asyncmark(2)
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: ds_read_b32 v6, v2
+; GFX900-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v5, v5, v6
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execnz .LBB3_1
+; GFX900-NEXT: ; %bb.2: ; %epilog
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: ; wait_asyncmark(1)
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: ds_read_b32 v0, v2
+; GFX900-NEXT: ; wait_asyncmark(0)
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v0, v5, v0
+; GFX900-NEXT: global_store_dword v[3:4], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_pipelined_loop:
+; GFX942: ; %bb.0: ; %prolog
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s0, v2
+; GFX942-NEXT: s_mov_b32 m0, s0
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-NEXT: global_load_lds_dword v[0:1], off
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: global_load_lds_dword v[0:1], off
+; GFX942-NEXT: v_mov_b32_e32 v4, v3
+; GFX942-NEXT: s_mov_b32 s2, 2
+; GFX942-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: .LBB3_1: ; %loop_body
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s3, v2
+; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: s_add_i32 s2, s2, 1
+; GFX942-NEXT: global_load_lds_dword v[0:1], off
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: ; wait_asyncmark(2)
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: ds_read_b32 v6, v2
+; GFX942-NEXT: v_cmp_ge_i32_e32 vcc, s2, v7
+; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v3, v3, v6
+; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB3_1
+; GFX942-NEXT: ; %bb.2: ; %epilog
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: ; wait_asyncmark(1)
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: ds_read_b32 v0, v2
+; GFX942-NEXT: ; wait_asyncmark(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v0, v3, v0
+; GFX942-NEXT: global_store_dword v[4:5], v0, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-LABEL: test_pipelined_loop:
+; GFX1010: ; %bb.0: ; %prolog
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-NEXT: v_mov_b32_e32 v5, 0
+; GFX1010-NEXT: s_mov_b32 s5, 2
+; GFX1010-NEXT: s_mov_b32 m0, s4
+; GFX1010-NEXT: s_mov_b32 s4, 0
+; GFX1010-NEXT: global_load_dword v[0:1], off lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: global_load_dword v[0:1], off lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: .LBB3_1: ; %loop_body
+; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1010-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-NEXT: s_add_i32 s5, s5, 1
+; GFX1010-NEXT: v_cmp_ge_i32_e32 vcc_lo, s5, v7
+; GFX1010-NEXT: s_mov_b32 m0, s6
+; GFX1010-NEXT: global_load_dword v[0:1], off lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: ; wait_asyncmark(2)
+; GFX1010-NEXT: s_waitcnt vmcnt(2)
+; GFX1010-NEXT: ds_read_b32 v6, v2
+; GFX1010-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-NEXT: v_add_nc_u32_e32 v5, v5, v6
+; GFX1010-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_cbranch_execnz .LBB3_1
+; GFX1010-NEXT: ; %bb.2: ; %epilog
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: ; wait_asyncmark(1)
+; GFX1010-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-NEXT: ds_read_b32 v0, v2
+; GFX1010-NEXT: ; wait_asyncmark(0)
+; GFX1010-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_add_nc_u32_e32 v0, v5, v0
+; GFX1010-NEXT: global_store_dword v[3:4], v0, off
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+prolog:
+  ; Prolog: issue the async DMA for the first in-flight iteration and mark it.
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+  ; Prolog: issue the async DMA for the second in-flight iteration and mark it.
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %loop_body
+
+loop_body:
+ %i = phi i32 [ 2, %prolog ], [ %i.next, %loop_body ]
+ %sum = phi i32 [ 0, %prolog ], [ %sum_i, %loop_body ]
+
+  ; Steady state: issue the DMA for the next iteration before consuming older ones.
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+  ; Wait for the mark of iteration i-2 (two marks stay outstanding) and process it.
+ call void @llvm.amdgcn.wait.asyncmark(i16 2)
+ %lds_idx = sub i32 %i, 2 ; NOTE(review): unused — confirm whether an indexed LDS access was intended
+ %lds_val = load i32, ptr addrspace(3) %lds
+
+ %sum_i = add i32 %sum, %lds_val
+
+ %i.next = add i32 %i, 1
+ %cmp = icmp slt i32 %i.next, %n
+ br i1 %cmp, label %loop_body, label %epilog
+
+epilog:
+  ; Epilog: drain the two remaining outstanding async marks.
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %lds_val_n_2 = load i32, ptr addrspace(3) %lds
+ %sum_e2 = add i32 %sum_i, %lds_val_n_2
+
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ %lds_val_n_1 = load i32, ptr addrspace(3) %lds
+ %sum_e1 = add i32 %sum_e2, %lds_val_n_1
+ store i32 %sum_e2, ptr addrspace(1) %bar ; NOTE(review): stores %sum_e2 (not %sum_e1) to %bar; %sum_e1 and %out are unused — the generated checks match this, but confirm it is intended
+
+ ret void
+}
+
+; Software pipelined loop with async global-to-LDS and global loads
+
+define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
+; GFX900-LABEL: test_pipelined_loop_with_global:
+; GFX900: ; %bb.0: ; %prolog
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_readfirstlane_b32 s4, v2
+; GFX900-NEXT: s_mov_b32 m0, s4
+; GFX900-NEXT: global_load_dword v10, v[0:1], off
+; GFX900-NEXT: global_load_dword v14, v[3:4], off
+; GFX900-NEXT: s_mov_b32 s6, 2
+; GFX900-NEXT: global_load_dword v[0:1], off lds
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: global_load_dword v8, v[0:1], off
+; GFX900-NEXT: global_load_dword v9, v[3:4], off
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: global_load_dword v[0:1], off lds
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: v_mov_b32_e32 v13, v8
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: v_mov_b32_e32 v15, v9
+; GFX900-NEXT: .LBB4_1: ; %loop_body
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: v_readfirstlane_b32 s7, v2
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: v_mov_b32_e32 v12, v15
+; GFX900-NEXT: v_mov_b32_e32 v11, v13
+; GFX900-NEXT: global_load_dword v13, v[0:1], off
+; GFX900-NEXT: global_load_dword v15, v[3:4], off
+; GFX900-NEXT: s_mov_b32 m0, s7
+; GFX900-NEXT: s_add_i32 s6, s6, 1
+; GFX900-NEXT: global_load_dword v[0:1], off lds
+; GFX900-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7
+; GFX900-NEXT: v_mov_b32_e32 v16, v14
+; GFX900-NEXT: v_mov_b32_e32 v17, v10
+; GFX900-NEXT: v_mov_b32_e32 v10, v8
+; GFX900-NEXT: v_mov_b32_e32 v14, v9
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: ; asyncmark
+; GFX900-NEXT: ; wait_asyncmark(2)
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execnz .LBB4_1
+; GFX900-NEXT: ; %bb.2: ; %epilog
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: ds_read_b32 v0, v2
+; GFX900-NEXT: ; wait_asyncmark(1)
+; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: ds_read_b32 v1, v2
+; GFX900-NEXT: ; wait_asyncmark(0)
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: ds_read_b32 v2, v2
+; GFX900-NEXT: v_add_u32_e32 v3, v17, v16
+; GFX900-NEXT: s_waitcnt lgkmcnt(2)
+; GFX900-NEXT: v_add3_u32 v0, v3, v0, v12
+; GFX900-NEXT: s_waitcnt lgkmcnt(1)
+; GFX900-NEXT: v_add3_u32 v0, v11, v0, v1
+; GFX900-NEXT: v_add_u32_e32 v1, v13, v15
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_add3_u32 v0, v1, v2, v0
+; GFX900-NEXT: global_store_dword v[5:6], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_pipelined_loop_with_global:
+; GFX942: ; %bb.0: ; %prolog
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s0, v2
+; GFX942-NEXT: s_mov_b32 m0, s0
+; GFX942-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-NEXT: global_load_dword v16, v[0:1], off
+; GFX942-NEXT: global_load_dword v17, v[10:11], off
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-NEXT: global_load_lds_dword v[0:1], off
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: global_load_dword v14, v[0:1], off
+; GFX942-NEXT: global_load_dword v15, v[10:11], off
+; GFX942-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-NEXT: global_load_lds_dword v[0:1], off
+; GFX942-NEXT: s_mov_b32 s2, 2
+; GFX942-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: v_mov_b32_e32 v18, v14
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_mov_b32_e32 v19, v15
+; GFX942-NEXT: .LBB4_1: ; %loop_body
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: global_load_dword v3, v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[10:11], off
+; GFX942-NEXT: v_readfirstlane_b32 s3, v2
+; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: s_add_i32 s2, s2, 1
+; GFX942-NEXT: global_load_lds_dword v[0:1], off
+; GFX942-NEXT: v_cmp_ge_i32_e32 vcc, s2, v7
+; GFX942-NEXT: v_mov_b32_e32 v5, v16
+; GFX942-NEXT: v_mov_b32_e32 v12, v17
+; GFX942-NEXT: v_mov_b32_e32 v6, v18
+; GFX942-NEXT: v_mov_b32_e32 v13, v19
+; GFX942-NEXT: v_mov_b32_e32 v16, v14
+; GFX942-NEXT: v_mov_b32_e32 v17, v15
+; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT: ; asyncmark
+; GFX942-NEXT: ; wait_asyncmark(2)
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: v_mov_b32_e32 v18, v3
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_mov_b32_e32 v19, v4
+; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB4_1
+; GFX942-NEXT: ; %bb.2: ; %epilog
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: ds_read_b32 v0, v2
+; GFX942-NEXT: ; wait_asyncmark(1)
+; GFX942-NEXT: ds_read_b32 v1, v2
+; GFX942-NEXT: ; wait_asyncmark(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ds_read_b32 v2, v2
+; GFX942-NEXT: v_add_u32_e32 v5, v5, v12
+; GFX942-NEXT: s_waitcnt lgkmcnt(2)
+; GFX942-NEXT: v_add3_u32 v0, v5, v0, v13
+; GFX942-NEXT: s_waitcnt lgkmcnt(1)
+; GFX942-NEXT: v_add3_u32 v0, v6, v0, v1
+; GFX942-NEXT: v_add_u32_e32 v1, v3, v4
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add3_u32 v0, v1, v2, v0
+; GFX942-NEXT: global_store_dword v[8:9], v0, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-LABEL: test_pipelined_loop_with_global:
+; GFX1010: ; %bb.0: ; %prolog
+; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1010-NEXT: global_load_dword v9, v[0:1], off
+; GFX1010-NEXT: global_load_dword v13, v[3:4], off
+; GFX1010-NEXT: s_mov_b32 s5, 2
+; GFX1010-NEXT: s_mov_b32 m0, s4
+; GFX1010-NEXT: s_mov_b32 s4, 0
+; GFX1010-NEXT: global_load_dword v[0:1], off lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: global_load_dword v10, v[0:1], off
+; GFX1010-NEXT: global_load_dword v12, v[3:4], off
+; GFX1010-NEXT: global_load_dword v[0:1], off lds
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: s_waitcnt vmcnt(2)
+; GFX1010-NEXT: v_mov_b32_e32 v8, v10
+; GFX1010-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-NEXT: v_mov_b32_e32 v11, v12
+; GFX1010-NEXT: .LBB4_1: ; %loop_body
+; GFX1010-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1010-NEXT: v_readfirstlane_b32 s6, v2
+; GFX1010-NEXT: s_waitcnt vmcnt(1)
+; GFX1010-NEXT: v_mov_b32_e32 v15, v11
+; GFX1010-NEXT: v_mov_b32_e32 v14, v8
+; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-NEXT: global_load_dword v8, v[0:1], off
+; GFX1010-NEXT: global_load_dword v11, v[3:4], off
+; GFX1010-NEXT: s_add_i32 s5, s5, 1
+; GFX1010-NEXT: s_mov_b32 m0, s6
+; GFX1010-NEXT: v_cmp_ge_i32_e32 vcc_lo, s5, v7
+; GFX1010-NEXT: global_load_dword v[0:1], off lds
+; GFX1010-NEXT: v_mov_b32_e32 v16, v13
+; GFX1010-NEXT: v_mov_b32_e32 v17, v9
+; GFX1010-NEXT: v_mov_b32_e32 v9, v10
+; GFX1010-NEXT: v_mov_b32_e32 v13, v12
+; GFX1010-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1010-NEXT: ; asyncmark
+; GFX1010-NEXT: ; wait_asyncmark(2)
+; GFX1010-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1010-NEXT: ; %bb.2: ; %epilog
+; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1010-NEXT: ds_read_b32 v0, v2
+; GFX1010-NEXT: ; wait_asyncmark(1)
+; GFX1010-NEXT: s_waitcnt vmcnt(3)
+; GFX1010-NEXT: ds_read_b32 v1, v2
+; GFX1010-NEXT: ; wait_asyncmark(0)
+; GFX1010-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-NEXT: ds_read_b32 v2, v2
+; GFX1010-NEXT: v_add_nc_u32_e32 v3, v17, v16
+; GFX1010-NEXT: s_waitcnt lgkmcnt(2)
+; GFX1010-NEXT: v_add3_u32 v0, v3, v0, v15
+; GFX1010-NEXT: s_waitcnt lgkmcnt(1)
+; GFX1010-NEXT: v_add3_u32 v0, v14, v0, v1
+; GFX1010-NEXT: v_add_nc_u32_e32 v1, v8, v11
+; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-NEXT: v_add3_u32 v0, v1, v2, v0
+; GFX1010-NEXT: global_store_dword v[5:6], v0, off
+; GFX1010-NEXT: s_setpc_b64 s[30:31]
+prolog:
+  ; Prolog: plain global loads plus the async DMA for the first in-flight iteration.
+ %v0 = load i32, ptr addrspace(1) %foo
+ %g0 = load i32, ptr addrspace(1) %bar
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+  ; Prolog: plain global loads plus the async DMA for the second in-flight iteration.
+ %v1 = load i32, ptr addrspace(1) %foo
+ %g1 = load i32, ptr addrspace(1) %bar
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ br label %loop_body
+
+  ; The vmcnt at the end of the prolog and at the start of the loop header seems
+  ; to be a result of the PHI nodes whose inputs are global loads. It is
+  ; stricter than necessary, to the point that the pipelined loop now has at
+  ; most two outstanding async ops instead of three. We could, in principle,
+  ; further relax the wait by introducing async global loads (not LDS DMA) in a
+  ; similar way.
+
+loop_body:
+ %i = phi i32 [ 2, %prolog ], [ %i.next, %loop_body ]
+ %prev_sum = phi i32 [ 0, %prolog ], [ %sum, %loop_body ]
+ %prev_v = phi i32 [ %v0, %prolog ], [ %v1, %loop_body ]
+ %prev_g = phi i32 [ %g0, %prolog ], [ %g1, %loop_body ]
+ %v1_phi = phi i32 [ %v1, %prolog ], [ %cur_v, %loop_body ]
+ %g1_phi = phi i32 [ %g1, %prolog ], [ %cur_g, %loop_body ]
+
+  ; Steady state: issue the loads and DMA for the next iteration and mark it.
+ %cur_v = load i32, ptr addrspace(1) %foo
+ %cur_g = load i32, ptr addrspace(1) %bar
+ call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+  ; Wait for the mark of iteration i-2 (two marks stay outstanding) and process it.
+ call void @llvm.amdgcn.wait.asyncmark(i16 2)
+ %lds_idx = sub i32 %i, 2 ; NOTE(review): unused — confirm whether an indexed LDS access was intended
+ %lds_val = load i32, ptr addrspace(3) %lds
+
+ %sum1 = add i32 %prev_v, %prev_g
+ %sum = add i32 %sum1, %lds_val
+
+ %i.next = add i32 %i, 1
+ %cmp = icmp slt i32 %i.next, %n
+ br i1 %cmp, label %loop_body, label %epilog
+
+epilog:
+  ; Epilog: drain the two remaining outstanding async marks.
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %lds_val_n_2 = load i32, ptr addrspace(3) %lds
+ %sum_e0 = add i32 %sum, %g1_phi
+ %sum_e1 = add i32 %v1_phi, %sum_e0
+ %sum_e2 = add i32 %sum_e1, %lds_val_n_2
+
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ %lds_val_n_1 = load i32, ptr addrspace(3) %lds
+ %sum_e3 = add i32 %cur_v, %cur_g
+ %sum_e4 = add i32 %sum_e3, %lds_val_n_1
+ %sum_e5 = add i32 %sum_e4, %sum_e2
+ store i32 %sum_e5, ptr addrspace(1) %out
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-flat-instruction-valu-check.mir b/llvm/test/CodeGen/AMDGPU/hazard-flat-instruction-valu-check.mir
index 1b2fb6ca1cdb7..a43b221a25fa2 100644
--- a/llvm/test/CodeGen/AMDGPU/hazard-flat-instruction-valu-check.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazard-flat-instruction-valu-check.mir
@@ -12,10 +12,10 @@ body: |
; GCN-LABEL: name: test_flat_valu_hazard
; GCN: liveins: $vgpr0, $vgpr1
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, implicit $m0, implicit $exec
+ ; GCN-NEXT: GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, 0, implicit $m0, implicit $exec
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
- GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, implicit $m0, implicit $exec
+ GLOBAL_LOAD_LDS_DWORD_SADDR killed $sgpr0_sgpr1, killed $vgpr0, 32, 2, 0, implicit $m0, implicit $exec
$vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
FLAT_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
...
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
index 675a1c94bc435..ca622b739a1b8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
@@ -10,12 +10,12 @@ body: |
; GCN-LABEL: name: dma_then_fence
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
- ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
S_ENDPGM 0
@@ -31,13 +31,13 @@ body: |
; GCN-LABEL: name: dma_then_global_load
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
- ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
@@ -71,12 +71,12 @@ body: |
bb.0:
; GCN-LABEL: name: dma_then_system_fence
; GCN: S_WAITCNT 0
- ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3953
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
$vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
@@ -93,13 +93,13 @@ body: |
; GCN-LABEL: name: merge_with_prev_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
- ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT 3952
S_WAITCNT_lds_direct
@@ -117,13 +117,13 @@ body: |
; GCN-LABEL: name: merge_with_next_wait
; GCN: S_WAITCNT 0
; GCN-NEXT: $m0 = S_MOV_B32 0
- ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
S_WAITCNT_lds_direct
S_WAITCNT 3952
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-hazards.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-hazards.mir
index 4fe0ec45048ce..31b56a67e0464 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-hazards.mir
@@ -9,7 +9,7 @@ name: buffer_load_dword_lds
body: |
bb.0:
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec, implicit $m0
+ BUFFER_LOAD_DWORD_LDS_ADDR64 $vgpr0_vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec, implicit $m0
...
# GCN-LABEL: name: buffer_store_lds_dword
@@ -33,7 +33,7 @@ name: global_load_lds_dword
body: |
bb.0:
$m0 = S_MOV_B32 0
- GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $m0
+ GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 0, 0, 0, implicit $exec, implicit $m0
...
# GCN-LABEL: name: scratch_load_lds_dword
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
index 21372c06d3223..e833626f6374e 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
@@ -10,7 +10,7 @@ name: buffer_load_dword_lds_ds_read
body: |
bb.0:
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
S_ENDPGM 0
@@ -27,7 +27,7 @@ name: buffer_load_dword_lds_vmcnt_1
body: |
bb.0:
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
$vgpr10 = BUFFER_LOAD_DWORD_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`)
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
S_ENDPGM 0
@@ -44,7 +44,7 @@ name: buffer_load_dword_lds_flat_read
body: |
bb.0:
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
$vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr poison`)
S_ENDPGM 0
@@ -61,7 +61,7 @@ name: global_load_lds_dword_ds_read
body: |
bb.0:
$m0 = S_MOV_B32 0
- GLOBAL_LOAD_LDS_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ GLOBAL_LOAD_LDS_DWORD $vgpr0_vgpr1, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
S_ENDPGM 0
@@ -110,9 +110,9 @@ name: series_of_buffer_load_dword_lds_ds_read
body: |
bb.0:
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
- BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll
index 770283563001f..f9cfc1f487327 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.lds.err.ll
@@ -18,20 +18,44 @@ define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) i
ret void
}
+;--- struct.async.ll
+define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
+ call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
+
;--- struct.ptr.ll
define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0)
ret void
}
+;--- struct.ptr.async.ll
+define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+ call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
+
;--- raw.ll
define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
ret void
}
+;--- raw.async.ll
+define amdgpu_ps void @buffer_load_lds(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds) {
+ call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
+
;--- raw.ptr.ll
define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
ret void
}
+
+;--- raw.ptr.async.ll
+define amdgpu_ps void @buffer_load_lds(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+ call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
index a4aad574aaaf4..876a58f25e668 100644
--- a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -18,11 +18,11 @@ body: |
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
; CHECK-NEXT: $m0 = S_MOV_B32 0
- ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, 0, implicit $exec, implicit $m0
; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
; CHECK-NEXT: $m0 = S_MOV_B32 1
- ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, 0, implicit $exec, implicit $m0
; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
@@ -41,9 +41,9 @@ body: |
%4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
$m0 = S_MOV_B32 0
- BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $m0
$m0 = S_MOV_B32 1
- BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $m0
%6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
%7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
%8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
More information about the cfe-commits
mailing list