[clang] [llvm] [mlir] [NVPTX] Add support for Distributed Shared Memory address space. (PR #135444)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 11 14:50:45 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: None (modiking)
<details>
<summary>Changes</summary>
Adds support for the new Distributed Shared Memory address space (DSMEM, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
1. Update address space structures and datalayout to contain the new space
2. Update codegen and intrinsics that support/expect this address space in both LLVM and MLIR
3. Update NVPTX alias analysis
4. Auto-upgrade previously defined intrinsics that were declared with an SMEM (addrspace 3) pointer argument but actually operate on a DSMEM pointer, migrating them to the new address space
---
Patch is 79.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135444.diff
23 Files Affected:
- (modified) clang/lib/Basic/Targets/NVPTX.cpp (+4-3)
- (modified) clang/test/CodeGen/target-data.c (+1-1)
- (modified) clang/test/CodeGenCUDA/builtins-sm90.cu (+1-1)
- (modified) llvm/docs/NVPTXUsage.rst (+3-3)
- (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+23-22)
- (modified) llvm/include/llvm/Support/NVPTXAddrSpace.h (+1)
- (modified) llvm/lib/IR/AutoUpgrade.cpp (+87)
- (modified) llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp (+1)
- (modified) llvm/lib/Target/NVPTX/NVPTX.h (+1)
- (modified) llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp (+5)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+10-1)
- (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+1)
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+9-1)
- (modified) llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp (+8-6)
- (modified) llvm/lib/Target/NVPTX/NVPTXUtilities.h (+2)
- (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+57)
- (modified) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll (+48-48)
- (modified) llvm/test/CodeGen/NVPTX/cp-async-bulk.ll (+9-9)
- (added) llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll (+258)
- (modified) llvm/test/CodeGen/NVPTX/nvptx-aa.ll (+10-2)
- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h (+4-1)
- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+3-2)
- (modified) mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir (+12-12)
``````````diff
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 5931a77a85fec..08c8460045c6a 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
if (TargetPointerWidth == 32)
resetDataLayout(
- "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+ "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
else if (Opts.NVPTXUseShortPointers)
- resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
- "16-v32:32-n16:32:64");
+ resetDataLayout(
+ "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:"
+ "16-v32:32-n16:32:64");
else
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index fe29aadb1dd53..9cb00e8ee73d3 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,7 +160,7 @@
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX
-// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX64
diff --git a/clang/test/CodeGenCUDA/builtins-sm90.cu b/clang/test/CodeGenCUDA/builtins-sm90.cu
index a639c7716adb1..f4746df944536 100644
--- a/clang/test/CodeGenCUDA/builtins-sm90.cu
+++ b/clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa(ptr, u);
- // CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+ // CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
out[i++] = __nvvm_getctarank(ptr);
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 621879fc5648b..2ce9a4540034c 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -499,7 +499,7 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+ declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
Overview:
"""""""""
@@ -563,7 +563,7 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
+ declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
Overview:
"""""""""
@@ -718,7 +718,7 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 4aeb1d8a2779e..f053fa6e2bf22 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
+def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr
//
// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
list<LLVMType> ArgsTy = !listconcat(
- [llvm_shared_ptr_ty, // dst_smem_ptr
- llvm_shared_ptr_ty, // mbarrier_smem_ptr
- llvm_ptr_ty], // tensormap_ptr
- TensorDimsTy, // actual tensor dims
- Im2ColOffsetsTy, // im2col offsets
- [llvm_i16_ty, // cta_mask
- llvm_i64_ty, // cache_hint
- llvm_i1_ty, // Flag for cta_mask
- llvm_i1_ty] // Flag for cache_hint
+ [llvm_dshared_ptr_ty, // dst_smem_ptr
+ llvm_shared_ptr_ty, // mbarrier_smem_ptr
+ llvm_ptr_ty], // tensormap_ptr
+ TensorDimsTy, // actual tensor dims
+ Im2ColOffsetsTy, // im2col offsets
+ [llvm_i16_ty, // cta_mask
+ llvm_i64_ty, // cache_hint
+ llvm_i1_ty, // Flag for cta_mask
+ llvm_i1_ty] // Flag for cache_hint
);
int TempFlagsStartIdx = !add(dim, 5);
@@ -5087,7 +5088,7 @@ def int_nvvm_mapa
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa">;
def int_nvvm_mapa_shared_cluster
- : DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
+ : DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa.shared.cluster">;
def int_nvvm_getctarank
@@ -5187,14 +5188,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
// From Global to Shared Cluster
def int_nvvm_cp_async_bulk_global_to_shared_cluster
: DefaultAttrsIntrinsic<[],
- [llvm_shared_ptr_ty, // dst_smem_ptr
- llvm_shared_ptr_ty, // mbarrier_ptr
- llvm_global_ptr_ty, // src_gmem_ptr
- llvm_i32_ty, // copy_size
- llvm_i16_ty, // cta_mask
- llvm_i64_ty, // cache_hint
- llvm_i1_ty, // Flag for cta_mask
- llvm_i1_ty], // Flag for cache_hint
+ [llvm_dshared_ptr_ty, // dst_dsmem_ptr
+ llvm_shared_ptr_ty, // mbarrier_ptr
+ llvm_global_ptr_ty, // src_gmem_ptr
+ llvm_i32_ty, // copy_size
+ llvm_i16_ty, // cta_mask
+ llvm_i64_ty, // cache_hint
+ llvm_i1_ty, // Flag for cta_mask
+ llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5204,10 +5205,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
// From Shared CTA to Shared Cluster
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
: DefaultAttrsIntrinsic<[],
- [llvm_shared_ptr_ty, // dst_smem_ptr
- llvm_shared_ptr_ty, // mbarrier_ptr
- llvm_shared_ptr_ty, // src_smem_ptr
- llvm_i32_ty], // copy_size
+ [llvm_dshared_ptr_ty, // dst_dsmem_ptr
+ llvm_shared_ptr_ty, // mbarrier_ptr
+ llvm_shared_ptr_ty, // src_smem_ptr
+ llvm_i32_ty], // copy_size
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h
index 486a396621da1..a3eac31f2e5e9 100644
--- a/llvm/include/llvm/Support/NVPTXAddrSpace.h
+++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h
@@ -25,6 +25,7 @@ enum AddressSpace : unsigned {
ADDRESS_SPACE_CONST = 4,
ADDRESS_SPACE_LOCAL = 5,
ADDRESS_SPACE_TENSOR = 6,
+ ADDRESS_SPACE_DSHARED = 7,
ADDRESS_SPACE_PARAM = 101,
};
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 0b329d91c3c7c..7482014d3c168 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -46,6 +46,7 @@
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
@@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return false; // No other 'arm.*', 'aarch64.*'.
}
+static Intrinsic::ID shouldUpgradeNVPTXDSharedIntrinsic(Function *F,
+ StringRef Name) {
+ if (Name.consume_front("mapa.shared.cluster"))
+ if (F->getReturnType()->getPointerAddressSpace() ==
+ NVPTXAS::ADDRESS_SPACE_SHARED)
+ return Intrinsic::nvvm_mapa_shared_cluster;
+
+ if (Name.consume_front("cp.async.bulk.")) {
+ Intrinsic::ID ID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .Case("global.to.shared.cluster",
+ Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
+ .Case("shared.cta.to.cluster",
+ Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
+ .Case("tensor.g2s.im2col.3d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
+ .Case("tensor.g2s.im2col.4d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
+ .Case("tensor.g2s.im2col.5d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
+ .Case("tensor.g2s.tile.1d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
+ .Case("tensor.g2s.tile.2d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
+ .Case("tensor.g2s.tile.3d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
+ .Case("tensor.g2s.tile.4d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
+ .Case("tensor.g2s.tile.5d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
+ .Default(Intrinsic::not_intrinsic);
+
+ if (ID != Intrinsic::not_intrinsic)
+ if (F->getArg(0)->getType()->getPointerAddressSpace() ==
+ NVPTXAS::ADDRESS_SPACE_SHARED)
+ return ID;
+ }
+
+ return Intrinsic::not_intrinsic;
+}
+
static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
if (Name.consume_front("abs."))
return StringSwitch<Intrinsic::ID>(Name)
@@ -1284,6 +1326,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
}
+ // Upgrade Distributed Shared Memory Intrinsics
+ Intrinsic::ID IID = shouldUpgradeNVPTXDSharedIntrinsic(F, Name);
+ if (IID != Intrinsic::not_intrinsic) {
+ rename(F);
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+ return true;
+ }
+
// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
//
@@ -4704,6 +4754,43 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
CI->eraseFromParent();
return;
}
+ case Intrinsic::nvvm_mapa_shared_cluster: {
+ // Create a new call with the correct address space.
+ NewCall =
+ Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
+ Value *Res = NewCall;
+ Res = Builder.CreateAddrSpaceCast(
+ Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
+ Res = Builder.CreateAddrSpaceCast(
+ Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
+ NewCall->takeName(CI);
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+ return;
+ }
+ case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
+ case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
+
+ SmallVector<Value *, 4> Args(CI->args());
+ Args[0] = Builder.CreateAddrSpaceCast(
+ Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
+ Args[0] = Builder.CreateAddrSpaceCast(
+ Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_DSHARED));
+
+ NewCall = Builder.CreateCall(NewFn, Args);
+ NewCall->takeName(CI);
+ CI->replaceAllUsesWith(NewCall);
+ CI->eraseFromParent();
+ return;
+ }
case Intrinsic::riscv_sha256sig0:
case Intrinsic::riscv_sha256sig1:
case Intrinsic::riscv_sha256sum0:
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index e42e738b9973f..9ab59c1c144f3 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -288,6 +288,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
case NVPTX::AddressSpace::Global:
case NVPTX::AddressSpace::Const:
case NVPTX::AddressSpace::Shared:
+ case NVPTX::AddressSpace::Dshared:
case NVPTX::AddressSpace::Param:
case NVPTX::AddressSpace::Local:
O << "." << A;
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 98e77ca80b8d5..c20c522f36bd3 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
Shared = 3,
Const = 4,
Local = 5,
+ Dshared = 7,
// NVPTX Backend Private:
Param = 101
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
index b910ccab21bf3..60bc22f5f589c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
@@ -86,6 +86,11 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) {
// TODO: cvta.param is not yet supported. We need to change aliasing
// rules once it is added.
+ // Distributed shared memory aliases with shared memory.
+ if (((AS1 == ADDRESS_SPACE_SHARED) && (AS2 == ADDRESS_SPACE_DSHARED)) ||
+ ((AS1 == ADDRESS_SPACE_DSHARED) && (AS2 == ADDRESS_SPACE_SHARED)))
+ return AliasResult::MayAlias;
+
return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ec1f969494cd1..34ddfd3c411a8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -513,6 +513,8 @@ static std::optional<unsigned> convertAS(unsigned AS) {
return NVPTX::AddressSpace::Global;
case llvm::ADDRESS_SPACE_SHARED:
return NVPTX::AddressSpace::Shared;
+ case llvm::ADDRESS_SPACE_DSHARED:
+ return NVPTX::AddressSpace::Dshared;
case llvm::ADDRESS_SPACE_GENERIC:
return NVPTX::AddressSpace::Generic;
case llvm::ADDRESS_SPACE_PARAM:
@@ -658,7 +660,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
bool AddrGenericOrGlobalOrShared =
(CodeAddrSpace == NVPTX::AddressSpace::Generic ||
CodeAddrSpace == NVPTX::AddressSpace::Global ||
- CodeAddrSpace == NVPTX::AddressSpace::Shared);
+ CodeAddrSpace == NVPTX::AddressSpace::Shared ||
+ CodeAddrSpace == NVPTX::AddressSpace::Dshared);
if (!AddrGenericOrGlobalOrShared)
return NVPTX::Ordering::NotAtomic;
@@ -979,6 +982,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
break;
+ case ADDRESS_SPACE_DSHARED:
+ Opc = TM.is64Bit() ? NVPTX::cvta_dshared_64 : NVPTX::cvta_dshared;
+ break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
break;
@@ -1001,6 +1007,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
break;
+ case ADDRESS_SPACE_DSHARED:
+ Opc = TM.is64Bit() ? NVPTX::cvta_to_dshared_64 : NVPTX::cvta_to_dshared;
+ break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 16b489afddf5c..4cf5292983048 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -137,6 +137,7 @@ def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def hasClusters : Predicate<"Subtarget->hasClusters()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8528ff702f236..19b370e4ce6f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -33,6 +33,9 @@ def AS_match {
code shared = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
}];
+ code dshared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_DSHARED);
+ }];
code global = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
}];
@@ -1979,10 +1982,11 @@ class ATOMIC_GLOBAL_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.global>;
class ATOMIC_SHARED_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.shared>;
+class ATOMIC_DSHARED_CHK <dag frag>
+ : PatFrag<!setdagop(frag, ops), frag, AS_match.dshared>;
class ATOMIC_GENERIC_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.generic>;
-
multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
SDPatternOperator op, list<Predicate> preds> {
defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;";
@@ -2034,6 +2038,7 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defvar frag_pat = (frag node:$a, node:$b);
defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _DS : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_DSHARED_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>;
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
@@ -2041,6 +2046,7 @@ multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, st
defvar frag_pat = (frag node:$a, node:$b, node:$c);
defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/135444
More information about the llvm-commits
mailing list