[llvm] 3b66d4a - [AMDGPU] Support builtin/intrinsics for async loads/stores on gfx1250 (#151058)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 08:20:09 PDT 2025
Author: Changpeng Fang
Date: 2025-07-29T08:20:05-07:00
New Revision: 3b66d4a987bff6d9d3e8a0932604cb40850136eb
URL: https://github.com/llvm/llvm-project/commit/3b66d4a987bff6d9d3e8a0932604cb40850136eb
DIFF: https://github.com/llvm/llvm-project/commit/3b66d4a987bff6d9d3e8a0932604cb40850136eb.diff
LOG: [AMDGPU] Support builtin/intrinsics for async loads/stores on gfx1250 (#151058)
Added:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 945e11be31278..b8ece53328b3c 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -651,6 +651,16 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_store_async_from_lds_b128, "vV4i*1V4i*3IiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
@@ -670,9 +680,6 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
-TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")
-
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
index e3fe31ff7dd75..ccc05f0aa5af3 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl
@@ -2,6 +2,89 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_load_async_to_lds_b8( global char* gaddr, local char* laddr)
+{
+ __builtin_amdgcn_global_load_async_to_lds_b8(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_load_async_to_lds_b32(global int* gaddr, local int* laddr)
+{
+ __builtin_amdgcn_global_load_async_to_lds_b32(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b64(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_load_async_to_lds_b64(global v2i* gaddr, local v2i* laddr)
+{
+ __builtin_amdgcn_global_load_async_to_lds_b64(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_async_to_lds_b128(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_load_async_to_lds_b128( global v4i* gaddr, local v4i* laddr)
+{
+ __builtin_amdgcn_global_load_async_to_lds_b128(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_store_async_from_lds_b8(global char* gaddr, local char* laddr)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b8(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_store_async_from_lds_b32(global int* gaddr, local int* laddr)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b32(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b64(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_store_async_from_lds_b64(global v2i* gaddr, local v2i* laddr)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b64(gaddr, laddr, 16, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_store_async_from_lds_b128(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) [[GADDR:%.*]], ptr addrspace(3) [[LADDR:%.*]], i32 16, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_global_store_async_from_lds_b128(global v4i* gaddr, local v4i* laddr)
+{
+ __builtin_amdgcn_global_store_async_from_lds_b128(gaddr, laddr, 16, 0);
+}
+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]])
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 3a7db6d599551..1da4e36c5a743 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3643,6 +3643,50 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable]
>;
+class AMDGPUAsyncGlobalLoadToLDS : Intrinsic <
+ [], // No result: the data is written to the LDS pointer operand.
+ [global_ptr_ty, // Base global pointer to load from
+ local_ptr_ty, // LDS base pointer to store to
+ llvm_i32_ty, // offset (immediate, see ImmArg<ArgIndex<2>> below)
+ llvm_i32_ty], // gfx12+ cachepolicy (immediate, see ImmArg<ArgIndex<3>> below):
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, ReadOnly<ArgIndex<0>>, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+>;
+
+class AMDGPUAsyncGlobalStoreFromLDS : Intrinsic <
+ [], // No result: the data is written to the global pointer operand.
+ [global_ptr_ty, // Base global pointer to store to
+ local_ptr_ty, // LDS base pointer to load from
+ llvm_i32_ty, // offset (immediate, see ImmArg<ArgIndex<2>> below)
+ llvm_i32_ty], // gfx12+ cachepolicy (immediate, see ImmArg<ArgIndex<3>> below):
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+>;
+
+def int_amdgcn_global_load_async_to_lds_b8 :
+ ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b8">, AMDGPUAsyncGlobalLoadToLDS;
+def int_amdgcn_global_load_async_to_lds_b32 :
+ ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b32">, AMDGPUAsyncGlobalLoadToLDS;
+def int_amdgcn_global_load_async_to_lds_b64 :
+ ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b64">, AMDGPUAsyncGlobalLoadToLDS;
+def int_amdgcn_global_load_async_to_lds_b128 :
+ ClangBuiltin<"__builtin_amdgcn_global_load_async_to_lds_b128">, AMDGPUAsyncGlobalLoadToLDS;
+
+def int_amdgcn_global_store_async_from_lds_b8 :
+ ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b8">, AMDGPUAsyncGlobalStoreFromLDS;
+def int_amdgcn_global_store_async_from_lds_b32 :
+ ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b32">, AMDGPUAsyncGlobalStoreFromLDS;
+def int_amdgcn_global_store_async_from_lds_b64 :
+ ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b64">, AMDGPUAsyncGlobalStoreFromLDS;
+def int_amdgcn_global_store_async_from_lds_b128 :
+ ClangBuiltin<"__builtin_amdgcn_global_store_async_from_lds_b128">, AMDGPUAsyncGlobalStoreFromLDS;
+
// WMMA intrinsics.
class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
Intrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index c01e5d3ff93c2..992572f17e5b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -143,6 +143,9 @@ def gi_global_saddr_cpol :
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;
+def gi_global_saddr_no_ioffset :
+ GIComplexOperandMatcher<s64, "selectGlobalSAddrNoIOffset">,
+ GIComplexPatternEquiv<GlobalSAddrNoIOffset>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index dfaa1450e5c61..3d7e678d2e54f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2049,6 +2049,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
+ SDValue &SAddr,
+ SDValue &VOffset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ SDValue DummyOffset; // Receives SelectGlobalSAddr's offset output; never read afterwards.
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
+ false))
+ return false;
+
+ // We are assuming CPol is always the last operand of the intrinsic. SCAL is masked out of the passed value and set again below only when ScaleOffset is true.
+ auto PassedCPol =
+ N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
+ CPol = CurDAG->getTargetConstant(
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
+ return true;
+}
+
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5636d896f2e7c..983f1aa8fab86 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -174,6 +174,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
+ bool SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, SDValue &SAddr,
+ SDValue &VOffset, SDValue &CPol) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 266dee183229e..04773c9c7b773 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5788,6 +5788,17 @@ AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
+ MachineOperand &Root) const {
+ const MachineInstr &I = *Root.getParent();
+
+ // We are assuming CPol is always the last operand of the intrinsic. SCAL is masked out of the passed value before it is handed to selectGlobalSAddr.
+ auto PassedCPol =
+ I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
+ return selectGlobalSAddr(Root, PassedCPol, false); // NOTE(review): the trailing 'false' presumably suppresses the immediate-offset operand -- confirm against selectGlobalSAddr's signature.
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index fe9743d0a4b99..140e753bf976a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -264,6 +264,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c5a1d9e005e15..306443d25a74f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5364,6 +5364,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0f172e0ddee56..8ede9caead8bc 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -11,6 +11,7 @@ let WantsRoot = true in {
def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [], -10>;
def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [], -10>;
+ def GlobalSAddrNoIOffset : ComplexPattern<iPTR, 3, "SelectGlobalSAddrNoIOffset", [], [], -3>;
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
@@ -1361,6 +1362,26 @@ class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueT
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < // VGPR-address form: all four intrinsic operands are forwarded as-is.
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $dsaddr, $vaddr, $offset, $cpol) // Output order puts the LDS address before the global address.
+>;
+
+class GlobalLoadLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < // SADDR form: $saddr, $voffset and $cpol are produced by GlobalSAddrNoIOffset; the intrinsic's own trailing cpol immediate is matched but left unnamed.
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $dsaddr, $saddr, $voffset, $offset, $cpol)
+>;
+
+class FlatStoreLDSSignedPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < // VGPR-address form: all four intrinsic operands are forwarded as-is.
+ (node (i64 VReg_64:$vaddr), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm:$cpol)),
+ (inst $vaddr, $dsaddr, $offset, $cpol) // Output order puts the global address before the LDS address.
+>;
+
+class GlobalStoreLDSSaddrPat <FLAT_Pseudo inst, SDPatternOperator node> : GCNPat < // SADDR form: $saddr, $voffset and $cpol are produced by GlobalSAddrNoIOffset; the intrinsic's own trailing cpol immediate is matched but left unnamed.
+ (node (GlobalSAddrNoIOffset (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), CPol:$cpol), (i32 VGPR_32:$dsaddr), (i32 timm:$offset), (i32 timm)),
+ (inst $saddr, $voffset, $dsaddr, $offset, $cpol)
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
@@ -1571,6 +1592,26 @@ class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Va
(inst $vaddr, $saddr, $offset, $cpol)
>;
+multiclass GlobalLoadLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { // Emits a VGPR-address pattern for `inst` and an SADDR pattern for the pseudo named `inst` # "_SADDR".
+ def : FlatLoadLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalLoadLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11; // One higher than the VGPR-address pattern above.
+ }
+}
+
+multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> { // Emits a VGPR-address pattern for `inst` and an SADDR pattern for the pseudo named `inst` # "_SADDR".
+ def : FlatStoreLDSSignedPat <inst, node> {
+ let AddedComplexity = 10;
+ }
+
+ def : GlobalStoreLDSSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node> {
+ let AddedComplexity = 11; // One higher than the VGPR-address pattern above.
+ }
+}
+
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
@@ -2137,6 +2178,18 @@ let OtherPredicates = [isGFX125xOnly] in {
defm : GlobalFLATLoadPats_CPOL <GLOBAL_LOAD_MONITOR_B128, int_amdgcn_global_load_monitor_b128, v4i32>;
} // End SubtargetPredicate = isGFX125xOnly
+let OtherPredicates = [isGFX1250Plus] in {
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
+ defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B128, int_amdgcn_global_load_async_to_lds_b128>;
+
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B8, int_amdgcn_global_store_async_from_lds_b8>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B32, int_amdgcn_global_store_async_from_lds_b32>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B64, int_amdgcn_global_store_async_from_lds_b64>;
+ defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9017f4f26f835..fbaf9bc452790 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1256,6 +1256,25 @@ MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
+static unsigned getIntrMemWidth(unsigned IntrID) { // Maps an async global<->LDS load/store intrinsic ID to the width named by its _bN suffix.
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ return 8;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ return 32;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ return 64;
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
+ return 128;
+ default: // Any other intrinsic ID is a caller error.
+ llvm_unreachable("Unknown width");
+ }
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -1527,6 +1546,26 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(1);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
+ Info.ptrVal = CI.getArgOperand(0);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
Info.opc = ISD::INTRINSIC_VOID;
@@ -1623,10 +1662,18 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b8:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b32:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b64:
+ case Intrinsic::amdgcn_global_store_async_from_lds_b128:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b8:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b32:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b64:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128:
Ptr = II->getArgOperand(1);
break;
default:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
new file mode 100644
index 0000000000000..dd679101047ea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol)
+declare void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol)
+declare void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol)
+declare void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol)
+
+define amdgpu_ps void @global_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-SDAG-LABEL: global_load_async_to_lds_b8_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_async_to_lds_b8_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-LABEL: global_load_async_to_lds_b8_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_load_async_to_lds_b8 v0, v1, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-SDAG-LABEL: global_load_async_to_lds_b32_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_async_to_lds_b32_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 10)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-LABEL: global_load_async_to_lds_b32_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 22)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-LABEL: global_load_async_to_lds_b64_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-SDAG-LABEL: global_load_async_to_lds_b128_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_async_to_lds_b128_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 27)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
+; GFX1250-LABEL: global_load_async_to_lds_b128_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_load_async_to_lds_b128 v0, v1, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4
+ call void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
+; GFX1250-LABEL: global_load_async_to_lds_b32_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom
+ call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
+; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i64 %idxprom
+ call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1)
+ ret void
+}
+
+define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
+; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom
+ call void @llvm.amdgcn.global.load.async.to.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
new file mode 100644
index 0000000000000..fd35313802558
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) ; operands shared by all four widths: global pointer, LDS pointer, offset, cpol
+declare void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) ; the tests below always pass %offset and %cpol as constants
+declare void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) ; %offset shows up as the offset: field of the expected instruction
+declare void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %offset, i32 %cpol) ; %cpol shows up as the th:/scope: modifiers of the expected instruction
+
+define amdgpu_ps void @global_store_async_from_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; b8 store whose global address arrives in VGPRs (no inreg argument)
+; GFX1250-SDAG-LABEL: global_store_async_from_lds_b8_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_store_async_from_lds_b8 v[0:1], v2, off offset:16 th:TH_STORE_NT
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_store_async_from_lds_b8_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_store_async_from_lds_b8 v[0:1], v2, off offset:16 th:TH_STORE_NT
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; 4 x 8-byte elements = 32 bytes; expected as a separate 64-bit add (one instruction for SelectionDAG, an add/add-with-carry pair for GlobalISel)
+ call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) ; %offset 16 -> offset:16, %cpol 1 -> th:TH_STORE_NT; the 32 is not merged into the offset field
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; b8 store with an inreg base; both selectors share one set of expectations
+; GFX1250-LABEL: global_store_async_from_lds_b8_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_store_async_from_lds_b8 v1, v0, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; the 32-byte displacement is expected as a VGPR offset (v1 = 32) on top of the s[0:1] base
+ call void @llvm.amdgcn.global.store.async.from.lds.b8(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) ; %cpol 0: the expected instruction carries offset:16 but no th:/scope: modifier
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; b32 store with the address in VGPRs; NOTE(review): same shape as the *_vaddr tests although the name has no _vaddr suffix
+; GFX1250-SDAG-LABEL: global_store_async_from_lds_b32:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_store_async_from_lds_b32 v[0:1], v2, off offset:16 th:TH_STORE_HT scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_store_async_from_lds_b32:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_store_async_from_lds_b32 v[0:1], v2, off offset:16 th:TH_STORE_HT scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; 32-byte displacement, expected as a separate 64-bit add into v[0:1]
+ call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 10) ; %cpol 10 -> th:TH_STORE_HT scope:SCOPE_SE in the expected instruction
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; b32 store with an inreg base; one shared expectation for both selectors
+; GFX1250-LABEL: global_store_async_from_lds_b32_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; 32-byte displacement, expected in v1 as the VGPR offset next to the s[0:1] base
+ call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) ; %cpol 0: no th:/scope: modifier expected, only offset:16
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; b64 store whose global address arrives in VGPRs (no inreg argument)
+; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[0:1], v2, off offset:16 th:TH_STORE_NT_HT scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_store_async_from_lds_b64 v[0:1], v2, off offset:16 th:TH_STORE_NT_HT scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; 32-byte displacement, expected as a separate 64-bit add into v[0:1]
+ call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 22) ; %cpol 22 -> th:TH_STORE_NT_HT scope:SCOPE_DEV in the expected instruction
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; b64 store with an inreg base; one shared expectation for both selectors
+; GFX1250-LABEL: global_store_async_from_lds_b64_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; constant index, so the 32-byte displacement is expected as a plain VGPR offset (v1 = 32) without scale_offset
+ call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) ; %cpol 0: no th:/scope: modifier expected, only offset:16
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) { ; b128 store whose global address arrives in VGPRs (no inreg argument)
+; GFX1250-SDAG-LABEL: global_store_async_from_lds_b128_vaddr:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], 32, v[0:1]
+; GFX1250-SDAG-NEXT: global_store_async_from_lds_b128 v[0:1], v2, off offset:16 th:TH_STORE_BYPASS scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_store_async_from_lds_b128_vaddr:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 32
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: global_store_async_from_lds_b128 v[0:1], v2, off offset:16 th:TH_STORE_BYPASS scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; 32-byte displacement, expected as a separate 64-bit add into v[0:1]
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 27) ; %cpol 27 -> th:TH_STORE_BYPASS scope:SCOPE_SYS in the expected instruction
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) { ; b128 store with an inreg base; one shared expectation for both selectors
+; GFX1250-LABEL: global_store_async_from_lds_b128_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 32
+; GFX1250-NEXT: global_store_async_from_lds_b128 v1, v0, s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i32 4 ; 32-byte displacement, expected in v1 as the VGPR offset next to the s[0:1] base
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 0) ; %cpol 0: no th:/scope: modifier expected, only offset:16
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; b32 store indexed by a runtime i32 with a 4-byte stride; expected to fold into one instruction
+; GFX1250-LABEL: global_store_async_from_lds_b32_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64 ; no separate sign-extension instruction appears in the expected output
+ %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom ; 4-byte stride with a b32 access: the unshifted index register v1 is used directly together with scale_offset
+ call void @llvm.amdgcn.global.store.async.from.lds.b32(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) ; %offset 16 -> offset:16, %cpol 1 -> th:TH_STORE_NT
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; b64 store indexed by a runtime i32 with an 8-byte stride; expected to fold into one instruction
+; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_scale_offset:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT
+; GFX1250-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64 ; no separate sign-extension instruction appears in the expected output
+ %gep = getelementptr i64, ptr addrspace(1) %gaddr, i64 %idxprom ; 8-byte stride with a b64 access: the unshifted index register v1 is used directly together with scale_offset
+ call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) ; %offset 16 -> offset:16, %cpol 1 -> th:TH_STORE_NT
+ ret void
+}
+
+define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { ; b64 store indexed with a 4-byte stride; expected NOT to use scale_offset
+; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]
+; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64 ; corresponds to the arithmetic shift right by 31 that builds the high half in both expected sequences
+ %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom ; 4-byte stride: the expected code shifts the index left by 2 and adds s[0:1] explicitly (presumably because the stride differs from the 8-byte access size - confirm against the selector)
+ call void @llvm.amdgcn.global.store.async.from.lds.b64(ptr addrspace(1) %gep, ptr addrspace(3) %laddr, i32 16, i32 1) ; expected: full address in v[2:3] with `off`, offset:16, th:TH_STORE_NT, and no scale_offset modifier
+ ret void
+}
More information about the llvm-commits mailing list