[clang] [llvm] [AMDGPU][gfx1250] Add 128B cooperative atomics (PR #156418)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 00:36:39 PDT 2025
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/156418
- Add clang built-ins + sema/codegen
- Add IR intrinsics + verifier checks
- Add DAG/GlobalISel codegen for the intrinsics
- Add lowering in SIMemoryLegalizer using an MMO flag.
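For illustration, a minimal OpenCL usage sketch of the new builtins, adapted from the tests in this patch (the function name is made up; the ordering must be an integer constant and the syncscope a string literal):

typedef int v4i __attribute__((ext_vector_type(4)));

// Illustrative only: cooperative 128B (8x16B) acquire load + release store,
// using the "agent" syncscope string as in the CodeGen tests.
void copy_16B_cooperative(global v4i *dst, global v4i *src)
{
  v4i v = __builtin_amdgcn_cooperative_atomic_load_8x16B(src, __ATOMIC_ACQUIRE, "agent");
  __builtin_amdgcn_cooperative_atomic_store_8x16B(dst, v, __ATOMIC_RELEASE, "agent");
}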
From d798d8c9e008e786f66a6991b3944cdbc9bc959f Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 2 Sep 2025 09:34:46 +0200
Subject: [PATCH] [AMDGPU][gfx1250] Add 128B cooperative atomics
- Add clang built-ins + sema/codegen
- Add IR intrinsics + verifier checks
- Add DAG/GlobalISel codegen for the intrinsics
- Add lowering in SIMemoryLegalizer using an MMO flag.
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 10 +
.../clang/Basic/DiagnosticSemaKinds.td | 4 +-
clang/include/clang/Sema/SemaAMDGPU.h | 2 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 43 ++
clang/lib/Sema/SemaAMDGPU.cpp | 53 ++
...tins-amdgcn-gfx1250-cooperative-atomics.cl | 104 ++++
...mdgcn-error-gfx1250-cooperative-atomics.cl | 66 +++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 30 +
.../include/llvm/Target/TargetSelectionDAG.td | 13 +
llvm/lib/IR/Verifier.cpp | 22 +
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 6 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 +
llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 75 +++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 +
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 44 +-
.../llvm.amdgcn.cooperative.atomic-agent.ll | 521 +++++++++++++++++
.../llvm.amdgcn.cooperative.atomic-basic.ll | 49 ++
....amdgcn.cooperative.atomic-singlethread.ll | 479 ++++++++++++++++
.../llvm.amdgcn.cooperative.atomic-system.ll | 533 ++++++++++++++++++
...llvm.amdgcn.cooperative.atomic-verifier.ll | 47 ++
...lvm.amdgcn.cooperative.atomic-wavefront.ll | 479 ++++++++++++++++
...lvm.amdgcn.cooperative.atomic-workgroup.ll | 479 ++++++++++++++++
24 files changed, 3078 insertions(+), 4 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6f5d1e024b91d..fe48f3d57ff4d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -828,5 +828,15 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbI
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+// GFX12.5 128B cooperative atomics
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_load_32x4B, "ii*IicC*", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_store_32x4B, "vi*iIicC*", "nc", "gfx1250-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_load_16x8B, "V2iV2i*IicC*", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_store_16x8B, "vV2i*V2iIicC*", "nc", "gfx1250-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_load_8x16B, "V4iV4i*IicC*", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_store_8x16B, "vV4i*V4iIicC*", "nc", "gfx1250-insts,wavefrontsize32")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c934fed2c7462..4e459d5819212 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10692,7 +10692,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
// Diagnostics based on the Lifetime safety analysis.
def warn_lifetime_safety_loan_expires_permissive : Warning<
- "object whose reference is captured does not live long enough">,
+ "object whose reference is captured does not live long enough">,
InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
def warn_lifetime_safety_loan_expires_strict : Warning<
"object whose reference is captured may not live long enough">,
@@ -13598,4 +13598,6 @@ def warn_acc_var_referenced_lacks_op
// AMDGCN builtins diagnostics
def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+
+def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d62c9bb65fadb..bac812a9d4fcf 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -26,6 +26,8 @@ class SemaAMDGPU : public SemaBase {
bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+ bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
+
bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
unsigned NumDataArgs);
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 433d76b2812db..7ad73c0324cff 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -680,6 +680,49 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return emitBuiltinWithOneOverloadedType<5>(*this, E,
Intrinsic::amdgcn_load_to_lds);
}
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
+ IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
+ IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
+ IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
+ IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
+ IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
+ IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
+ break;
+ }
+
+ LLVMContext &Ctx = CGM.getLLVMContext();
+ SmallVector<Value *, 5> Args;
+ // The last argument is an MD string
+ const unsigned ScopeArg = E->getNumArgs() - 1;
+ for (unsigned i = 0; i != ScopeArg; ++i)
+ Args.push_back(EmitScalarExpr(E->getArg(i)));
+ StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts())
+ ->getString();
+ llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)});
+ Args.push_back(llvm::MetadataAsValue::get(Ctx, MD));
+ // Intrinsic is typed based on the pointer AS. Pointer is always the first
+ // argument.
+ llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
+ return Builder.CreateCall(F, {Args});
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 1913bb830ccd0..baba503239e9f 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -15,6 +15,7 @@
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Sema/Ownership.h"
#include "clang/Sema/Sema.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include <cstdint>
@@ -100,6 +101,14 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7);
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
+ return checkCoopAtomicFunctionCall(TheCall, /*IsStore=*/false);
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
+ case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
+ return checkCoopAtomicFunctionCall(TheCall, /*IsStore=*/true);
default:
return false;
}
@@ -145,6 +154,50 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
return false;
}
+bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
+ bool Fail = false;
+
+ // First argument is a global or generic pointer.
+ Expr *PtrArg = TheCall->getArg(0);
+ QualType PtrTy = PtrArg->getType()->getPointeeType();
+ unsigned AS = getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace());
+ if (AS != llvm::AMDGPUAS::FLAT_ADDRESS &&
+ AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) {
+ Fail = true;
+ Diag(TheCall->getBeginLoc(), diag::err_amdgcn_coop_atomic_invalid_as)
+ << PtrArg->getSourceRange();
+ }
+
+ // Check atomic ordering
+ Expr *AtomicOrdArg = TheCall->getArg(IsStore ? 2 : 1);
+ Expr::EvalResult AtomicOrdArgRes;
+ if (!AtomicOrdArg->EvaluateAsInt(AtomicOrdArgRes, getASTContext()))
+ llvm_unreachable("Intrinsic requires imm for atomic ordering argument!");
+ auto Ord =
+ llvm::AtomicOrderingCABI(AtomicOrdArgRes.Val.getInt().getZExtValue());
+
+ // Atomic ordering cannot be acq_rel; additionally, acquire is invalid for
+ // stores and release is invalid for loads.
+ if (!llvm::isValidAtomicOrderingCABI((unsigned)Ord) ||
+ (Ord == llvm::AtomicOrderingCABI::acq_rel) ||
+ Ord == (IsStore ? llvm::AtomicOrderingCABI::acquire
+ : llvm::AtomicOrderingCABI::release)) {
+ return Diag(AtomicOrdArg->getBeginLoc(),
+ diag::warn_atomic_op_has_invalid_memory_order)
+ << 0 << AtomicOrdArg->getSourceRange();
+ }
+
+ // Last argument is a string literal
+ Expr *Arg = TheCall->getArg(TheCall->getNumArgs() - 1);
+ if (!isa<StringLiteral>(Arg->IgnoreParenImpCasts())) {
+ Fail = true;
+ Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal)
+ << Arg->getSourceRange();
+ }
+
+ return Fail;
+}
+
bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
unsigned NumDataArgs) {
assert(NumDataArgs <= 2);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl
new file mode 100644
index 0000000000000..8768f2f367654
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl
@@ -0,0 +1,104 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[VAL]], i32 0, metadata [[META4:![0-9]+]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_cooperative_atomic_store_32x4B(global int* gaddr, int val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent");
+}
+
+// CHECK-LABEL: define dso_local i32 @test_amdgcn_cooperative_atomic_load_32x4B(
+// CHECK-SAME: ptr noundef readonly captures(none) [[ADDR:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr [[ADDR]], i32 0, metadata [[META5:![0-9]+]])
+// CHECK-NEXT: ret i32 [[TMP0]]
+//
+int test_amdgcn_cooperative_atomic_load_32x4B(int* addr)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_32x4B(addr, __ATOMIC_RELAXED, "");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_16x8B(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], <2 x i32> noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p1(ptr addrspace(1) [[GADDR]], <2 x i32> [[VAL]], i32 0, metadata [[META5]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_cooperative_atomic_store_16x8B(global v2i* gaddr, v2i val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_16x8B(gaddr, val, __ATOMIC_RELAXED, "");
+}
+
+// CHECK-LABEL: define dso_local <2 x i32> @test_amdgcn_cooperative_atomic_load_16x8B(
+// CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p1(ptr addrspace(1) [[GADDR]], i32 0, metadata [[META6:![0-9]+]])
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_cooperative_atomic_load_16x8B(global v2i* gaddr)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_16x8B(gaddr, __ATOMIC_RELAXED, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_8x16B(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], <4 x i32> noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p1(ptr addrspace(1) [[GADDR]], <4 x i32> [[VAL]], i32 0, metadata [[META7:![0-9]+]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_cooperative_atomic_store_8x16B(global v4i* gaddr, v4i val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_8x16B(gaddr, val, __ATOMIC_RELAXED, "singlethread");
+}
+
+// CHECK-LABEL: define dso_local <4 x i32> @test_amdgcn_cooperative_atomic_load_8x16B(
+// CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p1(ptr addrspace(1) [[GADDR]], i32 0, metadata [[META4]])
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
+v4i test_amdgcn_cooperative_atomic_load_8x16B(global v4i* gaddr)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELAXED, "agent");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B_truncated(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i64 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[VAL]] to i32
+// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[CONV]], i32 0, metadata [[META4]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_cooperative_atomic_store_32x4B_truncated(global int* gaddr, long val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B_extended(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i8 noundef signext [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[VAL]] to i32
+// CHECK-NEXT: tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[CONV]], i32 0, metadata [[META4]])
+// CHECK-NEXT: ret void
+//
+void test_amdgcn_cooperative_atomic_store_32x4B_extended(global int* gaddr, char val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent");
+}
+
+//.
+// CHECK: [[META4]] = !{!"agent"}
+// CHECK: [[META5]] = !{!""}
+// CHECK: [[META6]] = !{!"workgroup"}
+// CHECK: [[META7]] = !{!"singlethread"}
+//.
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
new file mode 100644
index 0000000000000..0ab9a5a43e718
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+
+void test_amdgcn_cooperative_atomic_store_32x4B(global int* gaddr, int val, const char* syncscope)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+int test_amdgcn_cooperative_atomic_load_32x4B(global int* gaddr, const char* syncscope)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_32x4B(gaddr, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+void test_amdgcn_cooperative_atomic_store_16x8B(global v2i* gaddr, v2i val, const char* syncscope)
+{
+ __builtin_amdgcn_cooperative_atomic_store_16x8B(gaddr, val, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+v2i test_amdgcn_cooperative_atomic_load_16x8B(global v2i* gaddr, const char* syncscope)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_16x8B(gaddr, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+void test_amdgcn_cooperative_atomic_store_8x16B(global v4i* gaddr, v4i val, const char* syncscope)
+{
+ __builtin_amdgcn_cooperative_atomic_store_8x16B(gaddr, val, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+v4i test_amdgcn_cooperative_atomic_load_8x16B(global v4i* gaddr, const char* syncscope)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+v4i test_amdgcn_cooperative_atomic_load_8x16B_release(global v4i* gaddr)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELEASE, ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
+
+v4i test_amdgcn_cooperative_atomic_load_8x16B_acq_rel(global v4i* gaddr)
+{
+ return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_ACQ_REL, ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__sharedptr(local int* addr, int val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{cooperative atomic requires a global or generic pointer}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__ordering_not_imm(local int* addr, int ord, int val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, ord, ""); // expected-error {{argument to '__builtin_amdgcn_cooperative_atomic_store_32x4B' must be a constant integer}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__acquire(int* addr, int ord, int val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_ACQUIRE, ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__acq_rel(int* addr, int ord, int val)
+{
+ __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_ACQ_REL, ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a4d4adae580d0..c9f89566b80c7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3769,6 +3769,36 @@ def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4"
DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty],
[IntrNoMem, IntrSpeculatable]>;
+class AMDGPUCooperativeAtomicStore<LLVMType Ty> : Intrinsic <
+ [],
+ [llvm_anyptr_ty, // pointer to store to
+ Ty, // value to store
+ llvm_i32_ty, // C ABI Atomic Ordering ID
+ llvm_metadata_ty], // syncscope
+ [IntrWriteMem, WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>,
+ IntrNoCallback, IntrNoFree, IntrConvergent],
+ "",
+ [SDNPMemOperand, SDNPMayStore]
+>;
+
+class AMDGPUCooperativeAtomicLoad<LLVMType Ty> : Intrinsic <
+ [Ty],
+ [llvm_anyptr_ty, // pointer to load from
+ llvm_i32_ty, // C ABI Atomic Ordering ID
+ llvm_metadata_ty], // syncscope
+ [IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ IntrNoCallback, IntrNoFree, IntrConvergent],
+ "",
+ [SDNPMemOperand, SDNPMayLoad]
+>;
+
+def int_amdgcn_cooperative_atomic_load_32x4B : AMDGPUCooperativeAtomicLoad<llvm_i32_ty>;
+def int_amdgcn_cooperative_atomic_store_32x4B : AMDGPUCooperativeAtomicStore<llvm_i32_ty>;
+def int_amdgcn_cooperative_atomic_load_16x8B : AMDGPUCooperativeAtomicLoad<llvm_v2i32_ty>;
+def int_amdgcn_cooperative_atomic_store_16x8B : AMDGPUCooperativeAtomicStore<llvm_v2i32_ty>;
+def int_amdgcn_cooperative_atomic_load_8x16B : AMDGPUCooperativeAtomicLoad<llvm_v4i32_ty>;
+def int_amdgcn_cooperative_atomic_store_8x16B : AMDGPUCooperativeAtomicStore<llvm_v4i32_ty>;
+
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index a4ed62bb5715c..b14078d306eda 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1960,6 +1960,12 @@ def atomic_load_nonext_64 :
let MemoryVT = i64;
}
+def atomic_load_nonext_128 :
+ PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
+ let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
+ let MemoryVT = i128;
+}
+
def atomic_load_zext_8 :
PatFrag<(ops node:$ptr), (atomic_load_zext node:$ptr)> {
let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
@@ -2191,6 +2197,13 @@ def atomic_store_64 :
let MemoryVT = i64;
}
+def atomic_store_128 :
+ PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store node:$val, node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = i128;
+}
+
//===----------------------------------------------------------------------===//
// Selection DAG Pattern Support.
//
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index da05ff166122f..ed08e7e93a83d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6776,6 +6776,28 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"invalid vector type for format", &Call, Src1, Call.getArgOperand(2));
break;
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ // Check that this intrinsic is only used on the FLAT or GLOBAL address spaces.
+ Value *PtrArg = Call.getArgOperand(0);
+ const unsigned AS = PtrArg->getType()->getPointerAddressSpace();
+ Check(AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS,
+ "cooperative atomic intrinsics require a generic or global pointer",
+ &Call, PtrArg);
+
+ // Last argument must be an MD string
+ auto *Op = cast<MetadataAsValue>(Call.getArgOperand(Call.arg_size() - 1));
+ MDNode *MD = cast<MDNode>(Op->getMetadata());
+ Check((MD->getNumOperands() == 1) && isa<MDString>(MD->getOperand(0)),
+ "cooperative atomic intrinsics require that the last argument is a "
+ "metadata string",
+ &Call, Op);
+ break;
+ }
case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
Value *V = Call.getArgOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index efcd87e466207..bd443b5b6f1e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -509,6 +509,10 @@ def atomic_load_nonext_64_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_64
let IsAtomic = 1;
}
+def atomic_load_nonext_128_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_128 node:$ptr)> {
+ let IsAtomic = 1;
+}
+
def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> {
let IsAtomic = 1;
}
@@ -573,6 +577,8 @@ def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr),
(atomic_store_32 node:$val, node:$ptr)>;
def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr),
(atomic_store_64 node:$val, node:$ptr)>;
+def atomic_store_128_#as : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_128 node:$val, node:$ptr)>;
} // End let IsAtomic = 1, AddressSpaces = ...
} // End foreach as
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 55a76f1172cb9..64ba53c5d45d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7819,6 +7819,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+ assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+ B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
+ assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+ B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
+ MI.eraseFromParent();
+ return true;
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 80f0d504ea30c..f7ab2cdb724ce 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1895,6 +1895,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, atomic_load_nonext_128_flat, v4i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -1919,6 +1920,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, atomic_store_128_flat, v4i32>;
defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
@@ -2065,6 +2067,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX4, atomic_load_nonext_128_global, v4i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -2105,6 +2108,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, atomic_store_128_global, v4i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a861d9a96c9e3..cf19b66501e5e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1267,18 +1267,53 @@ static unsigned getIntrMemWidth(unsigned IntrID) {
return 8;
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
return 32;
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
return 64;
case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
return 128;
default:
llvm_unreachable("Unknown width");
}
}
+static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
+ TargetLoweringBase::IntrinsicInfo &Info) {
+ Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
+ unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
+ switch (AtomicOrderingCABI(Ord)) {
+ case AtomicOrderingCABI::acquire:
+ Info.order = AtomicOrdering::Acquire;
+ break;
+ case AtomicOrderingCABI::release:
+ Info.order = AtomicOrdering::Release;
+ break;
+ case AtomicOrderingCABI::seq_cst:
+ Info.order = AtomicOrdering::SequentiallyConsistent;
+ break;
+ default:
+ Info.order = AtomicOrdering::Monotonic;
+ break;
+ }
+
+ Info.flags =
+ (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
+ Info.flags |= MOCooperative;
+
+ MDNode *ScopeMD = cast<MDNode>(
+ cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
+ StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
+ Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1525,6 +1560,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
+ return true;
+ }
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.align.reset();
+ getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -10246,6 +10301,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
+ case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ EVT VT = Op->getValueType(0);
+ return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
+ Chain, Ptr, MII->getMemOperand());
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -10878,6 +10943,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
+ case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+ case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+ MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Ptr = Op->getOperand(2);
+ SDValue Val = Op->getOperand(3);
+ return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
+ Ptr, MII->getMemOperand());
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d43924d46b005..acc70bb52ba5f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9554,6 +9554,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
{
{MONoClobber, "amdgpu-noclobber"},
{MOLastUse, "amdgpu-last-use"},
+ {MOCooperative, "amdgpu-cooperative"},
};
return ArrayRef(TargetFlags);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fdbd9ce4a66bf..45d9683740f55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -48,6 +48,10 @@ static const MachineMemOperand::Flags MONoClobber =
static const MachineMemOperand::Flags MOLastUse =
MachineMemOperand::MOTargetFlag2;
+/// Mark the MMO of cooperative load/store atomics.
+static const MachineMemOperand::Flags MOCooperative =
+ MachineMemOperand::MOTargetFlag3;
+
/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
SIInstrWorklist() = default;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53f554eccb1fb..9e7ccec247bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -103,6 +103,7 @@ class SIMemOpInfo final {
bool IsVolatile = false;
bool IsNonTemporal = false;
bool IsLastUse = false;
+ bool IsCooperative = false;
SIMemOpInfo(
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
@@ -112,14 +113,15 @@ class SIMemOpInfo final {
bool IsCrossAddressSpaceOrdering = true,
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
bool IsVolatile = false, bool IsNonTemporal = false,
- bool IsLastUse = false)
+ bool IsLastUse = false, bool IsCooperative = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
- IsLastUse(IsLastUse) {
+ IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
if (Ordering == AtomicOrdering::NotAtomic) {
+ assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
assert(Scope == SIAtomicScope::NONE &&
OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
!IsCrossAddressSpaceOrdering &&
@@ -209,6 +211,9 @@ class SIMemOpInfo final {
/// create this SIMemOpInfo is last use, false otherwise.
bool isLastUse() const { return IsLastUse; }
+ /// \returns True if this is a cooperative load or store atomic.
+ bool isCooperative() const { return IsCooperative; }
+
/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
bool isAtomic() const {
@@ -325,6 +330,12 @@ class SICacheControl {
return false;
};
+ /// Handle cooperative load/store atomics.
+ virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
+ llvm_unreachable(
+ "cooperative atomics are not available on this architecture");
+ }
+
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -604,6 +615,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
+ virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
@@ -804,6 +817,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
bool IsNonTemporal = true;
bool IsVolatile = false;
bool IsLastUse = false;
+ bool IsCooperative = false;
// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
@@ -811,6 +825,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
IsNonTemporal &= MMO->isNonTemporal();
IsVolatile |= MMO->isVolatile();
IsLastUse |= MMO->getFlags() & MOLastUse;
+ IsCooperative |= MMO->getFlags() & MOCooperative;
InstrAddrSpace |=
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -852,7 +867,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
- IsNonTemporal, IsLastUse);
+ IsNonTemporal, IsLastUse, IsCooperative);
}
std::optional<SIMemOpInfo>
@@ -2592,6 +2607,19 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
}
+bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
+ if (!ST.hasGFX1250Insts())
+ return false;
+
+ // Cooperative atomics need to be SCOPE_DEV or higher.
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ assert(CPol && "No CPol operand?");
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+ if (Scope < CPol::SCOPE_DEV)
+ return setScope(MI, CPol::SCOPE_DEV);
+ return false;
+}
+
bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
@@ -2656,6 +2684,11 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
MOI.getOrderingAddrSpace());
}
+ // Handle cooperative atomics after the cache bypass step, as it may widen
+ // the scope of the instruction.
+ if (MOI.isCooperative())
+ Changed |= CC->handleCooperativeAtomic(*MI);
+
if (Order == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
@@ -2701,6 +2734,11 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getOrderingAddrSpace());
}
+ // Handle cooperative atomics after the cache bypass step, as it may widen
+ // the scope of the instruction.
+ if (MOI.isCooperative())
+ Changed |= CC->handleCooperativeAtomic(*MI);
+
if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertRelease(MI, MOI.getScope(),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
new file mode 100644
index 0000000000000..e3ec4d1f0f67a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
@@ -0,0 +1,521 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+!0 = !{ !"agent" }
+!1 = !{ !"agent-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll
new file mode 100644
index 0000000000000..1a2d2fe3f441c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+; Test behavior with the unsupported 'consume' and 'acq_rel' orderings
+
+define i32 @test_load_consume(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_load_consume:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 1, metadata !0)
+ ret i32 %0
+}
+
+define i32 @test_load_acqrel(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_load_acqrel:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 4, metadata !0)
+ ret i32 %0
+}
+
+define void @test_store_acqrel(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_store_acqrel:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 4, metadata !0)
+ ret void
+}
+
+!0 = !{ !"" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll
new file mode 100644
index 0000000000000..f97bced6364db
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+!0 = !{ !"singlethread" }
+!1 = !{ !"singlethread-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll
new file mode 100644
index 0000000000000..c4234cc0de06a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll
@@ -0,0 +1,533 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+!0 = !{ !"" }
+!1 = !{ !"one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll
new file mode 100644
index 0000000000000..3ada966c5db26
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: not opt -mtriple=amdgcn -mcpu=gfx1250 -passes=verify < %s 2>&1 | FileCheck %s
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define i32 @load_local_as(ptr addrspace(3) noundef %addr) {
+entry:
+ %res = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p3(ptr addrspace(3) %addr, i32 0, metadata !0)
+ ret i32 %res
+}
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define i32 @load_private_as(ptr addrspace(5) noundef %addr) {
+entry:
+ %res = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p5(ptr addrspace(5) %addr, i32 0, metadata !0)
+ ret i32 %res
+}
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define void @store_local_as(ptr addrspace(3) noundef %addr, i32 noundef %val) {
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p3(ptr addrspace(3) %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define void @store_private_as(ptr addrspace(5) noundef %addr, i32 noundef %val) {
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p5(ptr addrspace(5) %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+
+; CHECK: cooperative atomic intrinsics require that the last argument is a metadata string
+define i32 @test_empty_md(ptr noundef readonly %addr) {
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 1, metadata !{})
+ ret i32 %0
+}
+
+; CHECK: cooperative atomic intrinsics require that the last argument is a metadata string
+define i32 @test_no_md_str(ptr noundef readonly %addr) {
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 1, metadata !{!{}})
+ ret i32 %0
+}
+
+!0 = !{ !"" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll
new file mode 100644
index 0000000000000..329b7612426dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+!0 = !{ !"wavefront" }
+!1 = !{ !"wavefront-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
new file mode 100644
index 0000000000000..e86f0e0083805
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+ ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+ ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+ ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+ ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+ ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+ ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+ ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val) {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+ ret void
+}
+
+!0 = !{ !"workgroup" }
+!1 = !{ !"workgroup-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}