[clang] [llvm] [AMDGPU][gfx1250] Add 128B cooperative atomics (PR #156418)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 2 00:36:39 PDT 2025


https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/156418

- Add clang built-ins + sema/codegen
- Add IR Intrinsic + verifier
- Add DAG/GlobalISel codegen for the intrinsics
- Add lowering in SIMemoryLegalizer using an MMO flag.
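
For reference, a minimal usage sketch of the new built-ins, mirroring the CodeGen tests added in this patch (the trailing arguments are the C ABI atomic ordering and a syncscope string literal; the pointer must be global or generic):

```c
typedef int v4i __attribute__((ext_vector_type(4)));

// 128B cooperative store/load at agent scope with relaxed ordering.
void store_example(global v4i *gaddr, v4i val) {
  __builtin_amdgcn_cooperative_atomic_store_8x16B(gaddr, val, __ATOMIC_RELAXED, "agent");
}

v4i load_example(global v4i *gaddr) {
  return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELAXED, "agent");
}
```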

From d798d8c9e008e786f66a6991b3944cdbc9bc959f Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 2 Sep 2025 09:34:46 +0200
Subject: [PATCH] [AMDGPU][gfx1250] Add 128B cooperative atomics

- Add clang built-ins + sema/codegen
- Add IR Intrinsic + verifier
- Add DAG/GlobalISel codegen for the intrinsics
- Add lowering in SIMemoryLegalizer using an MMO flag.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  10 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   4 +-
 clang/include/clang/Sema/SemaAMDGPU.h         |   2 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  43 ++
 clang/lib/Sema/SemaAMDGPU.cpp                 |  53 ++
 ...tins-amdgcn-gfx1250-cooperative-atomics.cl | 104 ++++
 ...mdgcn-error-gfx1250-cooperative-atomics.cl |  66 +++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |  30 +
 .../include/llvm/Target/TargetSelectionDAG.td |  13 +
 llvm/lib/IR/Verifier.cpp                      |  22 +
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   6 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  14 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td    |   4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  75 +++
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   4 +
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |  44 +-
 .../llvm.amdgcn.cooperative.atomic-agent.ll   | 521 +++++++++++++++++
 .../llvm.amdgcn.cooperative.atomic-basic.ll   |  49 ++
 ....amdgcn.cooperative.atomic-singlethread.ll | 479 ++++++++++++++++
 .../llvm.amdgcn.cooperative.atomic-system.ll  | 533 ++++++++++++++++++
 ...llvm.amdgcn.cooperative.atomic-verifier.ll |  47 ++
 ...lvm.amdgcn.cooperative.atomic-wavefront.ll | 479 ++++++++++++++++
 ...lvm.amdgcn.cooperative.atomic-workgroup.ll | 479 ++++++++++++++++
 24 files changed, 3078 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl
 create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6f5d1e024b91d..fe48f3d57ff4d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -828,5 +828,15 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbI
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
 
+// GFX12.5 128B cooperative atomics
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_load_32x4B,  "ii*IicC*",  "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_store_32x4B, "vi*iIicC*", "nc", "gfx1250-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_load_16x8B,  "V2iV2i*IicC*",  "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_store_16x8B, "vV2i*V2iIicC*", "nc", "gfx1250-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_load_8x16B,  "V4iV4i*IicC*",  "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_cooperative_atomic_store_8x16B, "vV4i*V4iIicC*", "nc", "gfx1250-insts,wavefrontsize32")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c934fed2c7462..4e459d5819212 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10692,7 +10692,7 @@ def warn_dangling_reference_captured_by_unknown : Warning<
 
 // Diagnostics based on the Lifetime safety analysis.
 def warn_lifetime_safety_loan_expires_permissive : Warning<
-   "object whose reference is captured does not live long enough">, 
+   "object whose reference is captured does not live long enough">,
    InGroup<LifetimeSafetyPermissive>, DefaultIgnore;
 def warn_lifetime_safety_loan_expires_strict : Warning<
    "object whose reference is captured may not live long enough">,
@@ -13598,4 +13598,6 @@ def warn_acc_var_referenced_lacks_op
 // AMDGCN builtins diagnostics
 def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
 def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
+
+def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
 } // end of sema component.
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index d62c9bb65fadb..bac812a9d4fcf 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -26,6 +26,8 @@ class SemaAMDGPU : public SemaBase {
 
   bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
 
+  bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore);
+
   bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
                                unsigned NumDataArgs);
 
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 433d76b2812db..7ad73c0324cff 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -680,6 +680,49 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     return emitBuiltinWithOneOverloadedType<5>(*this, E,
                                                Intrinsic::amdgcn_load_to_lds);
   }
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
+      IID = Intrinsic::amdgcn_cooperative_atomic_load_32x4B;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
+      IID = Intrinsic::amdgcn_cooperative_atomic_store_32x4B;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
+      IID = Intrinsic::amdgcn_cooperative_atomic_load_16x8B;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
+      IID = Intrinsic::amdgcn_cooperative_atomic_store_16x8B;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
+      IID = Intrinsic::amdgcn_cooperative_atomic_load_8x16B;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
+      IID = Intrinsic::amdgcn_cooperative_atomic_store_8x16B;
+      break;
+    }
+
+    LLVMContext &Ctx = CGM.getLLVMContext();
+    SmallVector<Value *, 5> Args;
+    // last argument is a MD string
+    const unsigned ScopeArg = E->getNumArgs() - 1;
+    for (unsigned i = 0; i != ScopeArg; ++i)
+      Args.push_back(EmitScalarExpr(E->getArg(i)));
+    StringRef Arg = cast<StringLiteral>(E->getArg(ScopeArg)->IgnoreParenCasts())
+                        ->getString();
+    llvm::MDNode *MD = llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, Arg)});
+    Args.push_back(llvm::MetadataAsValue::get(Ctx, MD));
+    // Intrinsic is typed based on the pointer AS. Pointer is always the first
+    // argument.
+    llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
+    return Builder.CreateCall(F, {Args});
+  }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
     Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                    {llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 1913bb830ccd0..baba503239e9f 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Sema.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include <cstdint>
 
@@ -100,6 +101,14 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6:
   case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6:
     return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7);
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B:
+    return checkCoopAtomicFunctionCall(TheCall, /*IsStore=*/false);
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_32x4B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_16x8B:
+  case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_store_8x16B:
+    return checkCoopAtomicFunctionCall(TheCall, /*IsStore=*/true);
   default:
     return false;
   }
@@ -145,6 +154,50 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
   return false;
 }
 
+bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) {
+  bool Fail = false;
+
+  // First argument is a global or generic pointer.
+  Expr *PtrArg = TheCall->getArg(0);
+  QualType PtrTy = PtrArg->getType()->getPointeeType();
+  unsigned AS = getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace());
+  if (AS != llvm::AMDGPUAS::FLAT_ADDRESS &&
+      AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) {
+    Fail = true;
+    Diag(TheCall->getBeginLoc(), diag::err_amdgcn_coop_atomic_invalid_as)
+        << PtrArg->getSourceRange();
+  }
+
+  // Check atomic ordering
+  Expr *AtomicOrdArg = TheCall->getArg(IsStore ? 2 : 1);
+  Expr::EvalResult AtomicOrdArgRes;
+  if (!AtomicOrdArg->EvaluateAsInt(AtomicOrdArgRes, getASTContext()))
+    llvm_unreachable("Intrinsic requires imm for atomic ordering argument!");
+  auto Ord =
+      llvm::AtomicOrderingCABI(AtomicOrdArgRes.Val.getInt().getZExtValue());
+
+  // Atomic ordering cannot be acq_rel in any case, acquire for stores or
+  // release for loads.
+  if (!llvm::isValidAtomicOrderingCABI((unsigned)Ord) ||
+      (Ord == llvm::AtomicOrderingCABI::acq_rel) ||
+      Ord == (IsStore ? llvm::AtomicOrderingCABI::acquire
+                      : llvm::AtomicOrderingCABI::release)) {
+    return Diag(AtomicOrdArg->getBeginLoc(),
+                diag::warn_atomic_op_has_invalid_memory_order)
+           << 0 << AtomicOrdArg->getSourceRange();
+  }
+
+  // Last argument is a string literal
+  Expr *Arg = TheCall->getArg(TheCall->getNumArgs() - 1);
+  if (!isa<StringLiteral>(Arg->IgnoreParenImpCasts())) {
+    Fail = true;
+    Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal)
+        << Arg->getSourceRange();
+  }
+
+  return Fail;
+}
+
 bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs,
                                          unsigned NumDataArgs) {
   assert(NumDataArgs <= 2);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl
new file mode 100644
index 0000000000000..8768f2f367654
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-cooperative-atomics.cl
@@ -0,0 +1,104 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s
+
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[VAL]], i32 0, metadata [[META4:![0-9]+]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_cooperative_atomic_store_32x4B(global int* gaddr, int val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent");
+}
+
+// CHECK-LABEL: define dso_local i32 @test_amdgcn_cooperative_atomic_load_32x4B(
+// CHECK-SAME: ptr noundef readonly captures(none) [[ADDR:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr [[ADDR]], i32 0, metadata [[META5:![0-9]+]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+int test_amdgcn_cooperative_atomic_load_32x4B(int* addr)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_32x4B(addr, __ATOMIC_RELAXED, "");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_16x8B(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], <2 x i32> noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p1(ptr addrspace(1) [[GADDR]], <2 x i32> [[VAL]], i32 0, metadata [[META5]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_cooperative_atomic_store_16x8B(global v2i* gaddr, v2i val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_16x8B(gaddr, val, __ATOMIC_RELAXED, "");
+}
+
+// CHECK-LABEL: define dso_local <2 x i32> @test_amdgcn_cooperative_atomic_load_16x8B(
+// CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p1(ptr addrspace(1) [[GADDR]], i32 0, metadata [[META6:![0-9]+]])
+// CHECK-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_cooperative_atomic_load_16x8B(global v2i* gaddr)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_16x8B(gaddr, __ATOMIC_RELAXED, "workgroup");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_8x16B(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], <4 x i32> noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p1(ptr addrspace(1) [[GADDR]], <4 x i32> [[VAL]], i32 0, metadata [[META7:![0-9]+]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_cooperative_atomic_store_8x16B(global v4i* gaddr, v4i val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_8x16B(gaddr, val, __ATOMIC_RELAXED, "singlethread");
+}
+
+// CHECK-LABEL: define dso_local <4 x i32> @test_amdgcn_cooperative_atomic_load_8x16B(
+// CHECK-SAME: ptr addrspace(1) noundef readonly captures(none) [[GADDR:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p1(ptr addrspace(1) [[GADDR]], i32 0, metadata [[META4]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+v4i test_amdgcn_cooperative_atomic_load_8x16B(global v4i* gaddr)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELAXED, "agent");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B_truncated(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i64 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[VAL]] to i32
+// CHECK-NEXT:    tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[CONV]], i32 0, metadata [[META4]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_cooperative_atomic_store_32x4B_truncated(global int* gaddr, long val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent");
+}
+
+// CHECK-LABEL: define dso_local void @test_amdgcn_cooperative_atomic_store_32x4B_extended(
+// CHECK-SAME: ptr addrspace(1) noundef writeonly captures(none) [[GADDR:%.*]], i8 noundef signext [[VAL:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[VAL]] to i32
+// CHECK-NEXT:    tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p1(ptr addrspace(1) [[GADDR]], i32 [[CONV]], i32 0, metadata [[META4]])
+// CHECK-NEXT:    ret void
+//
+void test_amdgcn_cooperative_atomic_store_32x4B_extended(global int* gaddr, char val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, "agent");
+}
+
+//.
+// CHECK: [[META4]] = !{!"agent"}
+// CHECK: [[META5]] = !{!""}
+// CHECK: [[META6]] = !{!"workgroup"}
+// CHECK: [[META7]] = !{!"singlethread"}
+//.
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
new file mode 100644
index 0000000000000..0ab9a5a43e718
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s
+
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+
+void test_amdgcn_cooperative_atomic_store_32x4B(global int* gaddr, int val, const char* syncscope)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(gaddr, val, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+int test_amdgcn_cooperative_atomic_load_32x4B(global int* gaddr, const char* syncscope)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_32x4B(gaddr, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+void test_amdgcn_cooperative_atomic_store_16x8B(global v2i* gaddr, v2i val, const char* syncscope)
+{
+  __builtin_amdgcn_cooperative_atomic_store_16x8B(gaddr, val, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+v2i test_amdgcn_cooperative_atomic_load_16x8B(global v2i* gaddr, const char* syncscope)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_16x8B(gaddr, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+void test_amdgcn_cooperative_atomic_store_8x16B(global v4i* gaddr, v4i val, const char* syncscope)
+{
+  __builtin_amdgcn_cooperative_atomic_store_8x16B(gaddr, val, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+v4i test_amdgcn_cooperative_atomic_load_8x16B(global v4i* gaddr, const char* syncscope)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELAXED, syncscope); // expected-error {{expression is not a string literal}}
+}
+
+v4i test_amdgcn_cooperative_atomic_load_8x16B_release(global v4i* gaddr)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_RELEASE, ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
+
+v4i test_amdgcn_cooperative_atomic_load_8x16B_acq_rel(global v4i* gaddr)
+{
+  return __builtin_amdgcn_cooperative_atomic_load_8x16B(gaddr, __ATOMIC_ACQ_REL, ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__sharedptr(local int* addr, int val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED,  ""); // expected-error {{cooperative atomic requires a global or generic pointer}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__ordering_not_imm(local int* addr, int ord, int val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, ord,  ""); // expected-error {{argument to '__builtin_amdgcn_cooperative_atomic_store_32x4B' must be a constant integer}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__acquire(int* addr, int ord, int val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_ACQUIRE,  ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
+
+void test_amdgcn_cooperative_atomic_store_32x4B__acq_rel(int* addr, int ord, int val)
+{
+  __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_ACQ_REL,  ""); // expected-warning {{memory order argument to atomic operation is invalid}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a4d4adae580d0..c9f89566b80c7 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3769,6 +3769,36 @@ def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4"
   DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty],
                         [IntrNoMem, IntrSpeculatable]>;
 
+class AMDGPUCooperativeAtomicStore<LLVMType Ty> : Intrinsic <
+  [],
+  [llvm_anyptr_ty,         // pointer to store to
+   Ty,                     // value to store
+   llvm_i32_ty,            // C ABI Atomic Ordering ID
+   llvm_metadata_ty],      // syncscope
+  [IntrWriteMem, WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<2>>,
+   IntrNoCallback, IntrNoFree, IntrConvergent],
+  "",
+  [SDNPMemOperand, SDNPMayStore]
+>;
+
+class AMDGPUCooperativeAtomicLoad<LLVMType Ty> : Intrinsic <
+  [Ty],
+  [llvm_anyptr_ty,         // pointer to load from
+   llvm_i32_ty,            // C ABI Atomic Ordering ID
+   llvm_metadata_ty],      // syncscope
+  [IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+   IntrNoCallback, IntrNoFree, IntrConvergent],
+  "",
+  [SDNPMemOperand, SDNPMayLoad]
+>;
+
+def int_amdgcn_cooperative_atomic_load_32x4B : AMDGPUCooperativeAtomicLoad<llvm_i32_ty>;
+def int_amdgcn_cooperative_atomic_store_32x4B : AMDGPUCooperativeAtomicStore<llvm_i32_ty>;
+def int_amdgcn_cooperative_atomic_load_16x8B : AMDGPUCooperativeAtomicLoad<llvm_v2i32_ty>;
+def int_amdgcn_cooperative_atomic_store_16x8B : AMDGPUCooperativeAtomicStore<llvm_v2i32_ty>;
+def int_amdgcn_cooperative_atomic_load_8x16B : AMDGPUCooperativeAtomicLoad<llvm_v4i32_ty>;
+def int_amdgcn_cooperative_atomic_store_8x16B : AMDGPUCooperativeAtomicStore<llvm_v4i32_ty>;
+
 //===----------------------------------------------------------------------===//
 // Special Intrinsics for backend internal use only. No frontend
 // should emit calls to these.
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index a4ed62bb5715c..b14078d306eda 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1960,6 +1960,12 @@ def atomic_load_nonext_64 :
   let MemoryVT = i64;
 }
 
+def atomic_load_nonext_128 :
+  PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> {
+  let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
+  let MemoryVT = i128;
+}
+
 def atomic_load_zext_8 :
   PatFrag<(ops node:$ptr), (atomic_load_zext node:$ptr)> {
   let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
@@ -2191,6 +2197,13 @@ def atomic_store_64 :
   let MemoryVT = i64;
 }
 
+def atomic_store_128 :
+  PatFrag<(ops node:$val, node:$ptr),
+          (atomic_store node:$val, node:$ptr)> {
+  let IsAtomic = true;
+  let MemoryVT = i128;
+}
+
 //===----------------------------------------------------------------------===//
 // Selection DAG Pattern Support.
 //
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index da05ff166122f..ed08e7e93a83d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6776,6 +6776,28 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           "invalid vector type for format", &Call, Src1, Call.getArgOperand(2));
     break;
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+    // Check we only use this intrinsic on the FLAT or GLOBAL address spaces.
+    Value *PtrArg = Call.getArgOperand(0);
+    const unsigned AS = PtrArg->getType()->getPointerAddressSpace();
+    Check(AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS,
+          "cooperative atomic intrinsics require a generic or global pointer",
+          &Call, PtrArg);
+
+    // Last argument must be a MD string
+    auto *Op = cast<MetadataAsValue>(Call.getArgOperand(Call.arg_size() - 1));
+    MDNode *MD = cast<MDNode>(Op->getMetadata());
+    Check((MD->getNumOperands() == 1) && isa<MDString>(MD->getOperand(0)),
+          "cooperative atomic intrinsics require that the last argument is a "
+          "metadata string",
+          &Call, Op);
+    break;
+  }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
   case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
     Value *V = Call.getArgOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index efcd87e466207..bd443b5b6f1e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -509,6 +509,10 @@ def atomic_load_nonext_64_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_64
   let IsAtomic = 1;
 }
 
+def atomic_load_nonext_128_#as : PatFrag<(ops node:$ptr), (atomic_load_nonext_128 node:$ptr)> {
+  let IsAtomic = 1;
+}
+
 def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> {
   let IsAtomic = 1;
 }
@@ -573,6 +577,8 @@ def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr),
                                   (atomic_store_32 node:$val, node:$ptr)>;
 def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr),
                                   (atomic_store_64 node:$val, node:$ptr)>;
+def atomic_store_128_#as : PatFrag<(ops node:$val, node:$ptr),
+                                   (atomic_store_128 node:$val, node:$ptr)>;
 } // End let IsAtomic = 1, AddressSpaces = ...
 } // End foreach as
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 55a76f1172cb9..64ba53c5d45d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7819,6 +7819,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
+    MI.eraseFromParent();
+    return true;
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
+    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
+    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
+    MI.eraseFromParent();
+    return true;
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 80f0d504ea30c..f7ab2cdb724ce 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1895,6 +1895,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
 defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, atomic_load_nonext_128_flat, v4i32>;
 
 defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
@@ -1919,6 +1920,7 @@ defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>;
 defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
 defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>;
+defm : FlatStorePats <FLAT_STORE_DWORDX4, atomic_store_128_flat, v4i32>;
 defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
 
@@ -2065,6 +2067,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>;
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>;
+defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX4, atomic_load_nonext_128_global, v4i32>;
 
 defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
@@ -2105,6 +2108,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, atomic_store_128_global, v4i32>;
 
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a861d9a96c9e3..cf19b66501e5e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1267,18 +1267,53 @@ static unsigned getIntrMemWidth(unsigned IntrID) {
     return 8;
   case Intrinsic::amdgcn_global_load_async_to_lds_b32:
   case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
     return 32;
   case Intrinsic::amdgcn_global_load_async_to_lds_b64:
   case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
     return 64;
   case Intrinsic::amdgcn_global_load_async_to_lds_b128:
   case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
     return 128;
   default:
     llvm_unreachable("Unknown width");
   }
 }
 
+static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
+                                      TargetLoweringBase::IntrinsicInfo &Info) {
+  Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
+  unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
+  switch (AtomicOrderingCABI(Ord)) {
+  case AtomicOrderingCABI::acquire:
+    Info.order = AtomicOrdering::Acquire;
+    break;
+  case AtomicOrderingCABI::release:
+    Info.order = AtomicOrdering::Release;
+    break;
+  case AtomicOrderingCABI::seq_cst:
+    Info.order = AtomicOrdering::SequentiallyConsistent;
+    break;
+  default:
+    Info.order = AtomicOrdering::Monotonic;
+    break;
+  }
+
+  Info.flags =
+      (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
+  Info.flags |= MOCooperative;
+
+  MDNode *ScopeMD = cast<MDNode>(
+      cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
+  StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
+  Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
@@ -1525,6 +1560,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags |= MachineMemOperand::MOLoad;
     return true;
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+    Info.ptrVal = CI.getOperand(0);
+    Info.align.reset();
+    getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
+    return true;
+  }
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+    Info.ptrVal = CI.getArgOperand(0);
+    Info.align.reset();
+    getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
+    return true;
+  }
   case Intrinsic::amdgcn_ds_gws_init:
   case Intrinsic::amdgcn_ds_gws_barrier:
   case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -10246,6 +10301,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
     return SDValue(NewMI, 0);
   }
+  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
+    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+    SDValue Chain = Op->getOperand(0);
+    SDValue Ptr = Op->getOperand(2);
+    EVT VT = Op->getValueType(0);
+    return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
+                             Chain, Ptr, MII->getMemOperand());
+  }
   default:
 
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -10878,6 +10943,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                    Op->getVTList(), Ops, M->getMemoryVT(),
                                    M->getMemOperand());
   }
+  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
+  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
+    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
+    SDValue Chain = Op->getOperand(0);
+    SDValue Ptr = Op->getOperand(2);
+    SDValue Val = Op->getOperand(3);
+    return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
+                         Ptr, MII->getMemOperand());
+  }
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d43924d46b005..acc70bb52ba5f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9554,6 +9554,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
       {
           {MONoClobber, "amdgpu-noclobber"},
           {MOLastUse, "amdgpu-last-use"},
+          {MOCooperative, "amdgpu-cooperative"},
       };
 
   return ArrayRef(TargetFlags);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fdbd9ce4a66bf..45d9683740f55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -48,6 +48,10 @@ static const MachineMemOperand::Flags MONoClobber =
 static const MachineMemOperand::Flags MOLastUse =
     MachineMemOperand::MOTargetFlag2;
 
+/// Mark the MMO of cooperative load/store atomics.
+static const MachineMemOperand::Flags MOCooperative =
+    MachineMemOperand::MOTargetFlag3;
+
 /// Utility to store machine instructions worklist.
 struct SIInstrWorklist {
   SIInstrWorklist() = default;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53f554eccb1fb..9e7ccec247bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -103,6 +103,7 @@ class SIMemOpInfo final {
   bool IsVolatile = false;
   bool IsNonTemporal = false;
   bool IsLastUse = false;
+  bool IsCooperative = false;
 
   SIMemOpInfo(
       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
@@ -112,14 +113,15 @@ class SIMemOpInfo final {
       bool IsCrossAddressSpaceOrdering = true,
       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
       bool IsVolatile = false, bool IsNonTemporal = false,
-      bool IsLastUse = false)
+      bool IsLastUse = false, bool IsCooperative = false)
       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
         IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
-        IsLastUse(IsLastUse) {
+        IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
 
     if (Ordering == AtomicOrdering::NotAtomic) {
+      assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
       assert(Scope == SIAtomicScope::NONE &&
              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
              !IsCrossAddressSpaceOrdering &&
@@ -209,6 +211,9 @@ class SIMemOpInfo final {
   /// create this SIMemOpInfo is last use, false otherwise.
   bool isLastUse() const { return IsLastUse; }
 
+  /// \returns True if this is a cooperative load or store atomic.
+  bool isCooperative() const { return IsCooperative; }
+
   /// \returns True if ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo is unordered or higher, false otherwise.
   bool isAtomic() const {
@@ -325,6 +330,12 @@ class SICacheControl {
     return false;
   };
 
+  /// Handle cooperative load/store atomics.
+  virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
+    llvm_unreachable(
+        "cooperative atomics are not available on this architecture");
+  }
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -604,6 +615,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
 
   bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
 
+  virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
+
   bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                      SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                      Position Pos) const override;
@@ -804,6 +817,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   bool IsNonTemporal = true;
   bool IsVolatile = false;
   bool IsLastUse = false;
+  bool IsCooperative = false;
 
   // Validator should check whether or not MMOs cover the entire set of
   // locations accessed by the memory instruction.
@@ -811,6 +825,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     IsNonTemporal &= MMO->isNonTemporal();
     IsVolatile |= MMO->isVolatile();
     IsLastUse |= MMO->getFlags() & MOLastUse;
+    IsCooperative |= MMO->getFlags() & MOCooperative;
     InstrAddrSpace |=
       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -852,7 +867,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   }
   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal, IsLastUse);
+                     IsNonTemporal, IsLastUse, IsCooperative);
 }
 
 std::optional<SIMemOpInfo>
@@ -2592,6 +2607,19 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
   return false;
 }
 
+bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
+  if (!ST.hasGFX1250Insts())
+    return false;
+
+  // Cooperative atomics need to be SCOPE_DEV or higher.
+  MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+  assert(CPol && "No CPol operand?");
+  const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+  if (Scope < CPol::SCOPE_DEV)
+    return setScope(MI, CPol::SCOPE_DEV);
+  return false;
+}
+
 bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
                                          SIAtomicScope Scope,
                                          SIAtomicAddrSpace AddrSpace) const {
@@ -2656,6 +2684,11 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                            MOI.getOrderingAddrSpace());
     }
 
+    // Handle cooperative atomics after cache bypass step, as it may override
+    // the scope of the instruction to a greater scope.
+    if (MOI.isCooperative())
+      Changed |= CC->handleCooperativeAtomic(*MI);
+
     if (Order == AtomicOrdering::SequentiallyConsistent)
       Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                 SIMemOp::LOAD | SIMemOp::STORE,
@@ -2701,6 +2734,11 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                             MOI.getOrderingAddrSpace());
     }
 
+    // Handle cooperative atomics after cache bypass step, as it may override
+    // the scope of the instruction to a greater scope.
+    if (MOI.isCooperative())
+      Changed |= CC->handleCooperativeAtomic(*MI);
+
     if (MOI.getOrdering() == AtomicOrdering::Release ||
         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
       Changed |= CC->insertRelease(MI, MOI.getScope(),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
new file mode 100644
index 0000000000000..e3ec4d1f0f67a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
@@ -0,0 +1,521 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
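+; The metadata operand carries the synchronization scope name used when
+; lowering the cooperative atomic; this file exercises the "agent" and
+; "agent-one-as" scopes.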
+!0 = !{ !"agent" }
+!1 = !{ !"agent-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll
new file mode 100644
index 0000000000000..1a2d2fe3f441c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-basic.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+; Test behavior with the unsupported 'consume' and 'acq_rel' orderings.
+
+define i32 @test_load_consume(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_load_consume:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 1, metadata !0)
+  ret i32 %0
+}
+
+define i32 @test_load_acqrel(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_load_acqrel:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 4, metadata !0)
+  ret i32 %0
+}
+
+define void @test_store_acqrel(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_store_acqrel:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 4, metadata !0)
+  ret void
+}
+
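+; The i32 ordering operand follows the C/C++ memory-order encoding, so the
+; values 1 and 4 above correspond to 'consume' and 'acq_rel'; the empty
+; metadata string selects the default (system) scope.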
+!0 = !{ !"" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll
new file mode 100644
index 0000000000000..f97bced6364db
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-singlethread.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+!0 = !{ !"singlethread" }
+!1 = !{ !"singlethread-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll
new file mode 100644
index 0000000000000..c4234cc0de06a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-system.ll
@@ -0,0 +1,533 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+!0 = !{ !"" }
+!1 = !{ !"one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll
new file mode 100644
index 0000000000000..3ada966c5db26
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-verifier.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: not opt -mtriple=amdgcn -mcpu=gfx1250 -passes=verify < %s 2>&1 | FileCheck %s
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define i32 @load_local_as(ptr addrspace(3) noundef %addr)  {
+entry:
+  %res = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p3(ptr addrspace(3) %addr, i32 0, metadata !0)
+  ret i32 %res
+}
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define i32 @load_private_as(ptr addrspace(5) noundef %addr)  {
+entry:
+  %res = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p5(ptr addrspace(5) %addr, i32 0, metadata !0)
+  ret i32 %res
+}
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define void @store_local_as(ptr addrspace(3) noundef %addr, i32 noundef %val)  {
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p3(ptr addrspace(3) %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+; CHECK: cooperative atomic intrinsics require a generic or global pointer
+define void @store_private_as(ptr addrspace(5) noundef %addr, i32 noundef %val)  {
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p5(ptr addrspace(5) %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+
+; CHECK: cooperative atomic intrinsics require that the last argument is a metadata string
+define i32 @test_empty_md(ptr noundef readonly %addr)  {
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 1, metadata !{})
+  ret i32 %0
+}
+
+; CHECK: cooperative atomic intrinsics require that the last argument is a metadata string
+define i32 @test_no_md_str(ptr noundef readonly %addr)  {
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 1, metadata !{!{}})
+  ret i32 %0
+}
+
+!0 = !{ !"" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll
new file mode 100644
index 0000000000000..329b7612426dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-wavefront.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+!0 = !{ !"wavefront" }
+!1 = !{ !"wavefront-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
new file mode 100644
index 0000000000000..e86f0e0083805
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !0)
+  ret void
+}
+
+define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !0)
+  ret i32 %0
+}
+
+define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !0)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !0)
+  ret <4 x i32> %0
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !0)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 0, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 0, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 0, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_relaxed:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 0, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 2, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 2, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_acquire:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 2, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 3, metadata !1)
+  ret void
+}
+
+define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.amdgcn.cooperative.atomic.load.32x4B.p0(ptr %addr, i32 5, metadata !1)
+  ret i32 %0
+}
+
+define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.amdgcn.cooperative.atomic.load.16x8B.p0(ptr %addr, i32 5, metadata !1)
+  ret <2 x i32> %0
+}
+
+define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr noundef readonly %addr)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %0 = tail call <4 x i32> @llvm.amdgcn.cooperative.atomic.load.8x16B.p0(ptr %addr, i32 5, metadata !1)
+  ret <4 x i32> %0
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef %addr, i32 noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.32x4B.p0(ptr %addr, i32 %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef %addr, <2 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.16x8B.p0(ptr %addr, <2 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef %addr, <4 x i32> noundef %val)  {
+; GFX1250-LABEL: test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  tail call void @llvm.amdgcn.cooperative.atomic.store.8x16B.p0(ptr %addr, <4 x i32> %val, i32 5, metadata !1)
+  ret void
+}
+
+!0 = !{ !"workgroup" }
+!1 = !{ !"workgroup-one-as" }
+
+;; GFX1250: {{.*}}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1250-GISEL: {{.*}}
+; GFX1250-SDAG: {{.*}}
