[llvm] d6a68be - [NVPTX] Add support for Shared Cluster Memory address space [1/2] (#135444)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 22 15:14:43 PDT 2025
Author: modiking
Date: 2025-04-22T15:14:39-07:00
New Revision: d6a68be7af0e1cec95e5fb9442dc1867cbde9f8d
URL: https://github.com/llvm/llvm-project/commit/d6a68be7af0e1cec95e5fb9442dc1867cbde9f8d
DIFF: https://github.com/llvm/llvm-project/commit/d6a68be7af0e1cec95e5fb9442dc1867cbde9f8d.diff
LOG: [NVPTX] Add support for Shared Cluster Memory address space [1/2] (#135444)
Adds support for new Shared Cluster Memory Address Space
(SHARED_CLUSTER, addrspace 7). See
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory
for details.
1. Update address space structures and datalayout to contain the new
space
2. Add new intrinsics that use this new address space
3. Update NVPTX alias analysis
The existing intrinsics are updated in
https://github.com/llvm/llvm-project/pull/136768
Added:
llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll
llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
Modified:
clang/lib/Basic/Targets/NVPTX.cpp
clang/test/CodeGen/target-data.c
llvm/include/llvm/Support/NVPTXAddrSpace.h
llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
llvm/lib/Target/NVPTX/NVPTX.h
llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
llvm/lib/Target/NVPTX/NVPTXUtilities.h
llvm/test/CodeGen/NVPTX/nvptx-aa.ll
llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
Removed:
################################################################################
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 5931a77a85fec..08c8460045c6a 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
if (TargetPointerWidth == 32)
resetDataLayout(
- "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+ "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
else if (Opts.NVPTXUseShortPointers)
- resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
- "16-v32:32-n16:32:64");
+ resetDataLayout(
+ "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:"
+ "16-v32:32-n16:32:64");
else
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index fe29aadb1dd53..9cb00e8ee73d3 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,7 +160,7 @@
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX
-// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX64
diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h
index 486a396621da1..04f74c34787cc 100644
--- a/llvm/include/llvm/Support/NVPTXAddrSpace.h
+++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h
@@ -25,6 +25,7 @@ enum AddressSpace : unsigned {
ADDRESS_SPACE_CONST = 4,
ADDRESS_SPACE_LOCAL = 5,
ADDRESS_SPACE_TENSOR = 6,
+ ADDRESS_SPACE_SHARED_CLUSTER = 7,
ADDRESS_SPACE_PARAM = 101,
};
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 4e2e4c99df803..0b137250e4e59 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -285,6 +285,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
case NVPTX::AddressSpace::Global:
case NVPTX::AddressSpace::Const:
case NVPTX::AddressSpace::Shared:
+ case NVPTX::AddressSpace::SharedCluster:
case NVPTX::AddressSpace::Param:
case NVPTX::AddressSpace::Local:
O << "." << A;
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 98e77ca80b8d5..cf21ad991ccdf 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
Shared = 3,
Const = 4,
Local = 5,
+ SharedCluster = 7,
// NVPTX Backend Private:
Param = 101
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
index b910ccab21bf3..a579783802aa2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
@@ -86,6 +86,12 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) {
// TODO: cvta.param is not yet supported. We need to change aliasing
// rules once it is added.
+ // Distributed shared memory aliases with shared memory.
+ if (((AS1 == ADDRESS_SPACE_SHARED) &&
+ (AS2 == ADDRESS_SPACE_SHARED_CLUSTER)) ||
+ ((AS1 == ADDRESS_SPACE_SHARED_CLUSTER) && (AS2 == ADDRESS_SPACE_SHARED)))
+ return AliasResult::MayAlias;
+
return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 486c7c815435a..032975ed663e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -513,6 +513,8 @@ static std::optional<unsigned> convertAS(unsigned AS) {
return NVPTX::AddressSpace::Global;
case llvm::ADDRESS_SPACE_SHARED:
return NVPTX::AddressSpace::Shared;
+ case llvm::ADDRESS_SPACE_SHARED_CLUSTER:
+ return NVPTX::AddressSpace::SharedCluster;
case llvm::ADDRESS_SPACE_GENERIC:
return NVPTX::AddressSpace::Generic;
case llvm::ADDRESS_SPACE_PARAM:
@@ -658,7 +660,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
bool AddrGenericOrGlobalOrShared =
(CodeAddrSpace == NVPTX::AddressSpace::Generic ||
CodeAddrSpace == NVPTX::AddressSpace::Global ||
- CodeAddrSpace == NVPTX::AddressSpace::Shared);
+ CodeAddrSpace == NVPTX::AddressSpace::Shared ||
+ CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
if (!AddrGenericOrGlobalOrShared)
return NVPTX::Ordering::NotAtomic;
@@ -979,6 +982,12 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
break;
+ case ADDRESS_SPACE_SHARED_CLUSTER:
+ if (!TM.is64Bit())
+ report_fatal_error(
+ "Shared cluster address space is only supported in 64-bit mode");
+ Opc = NVPTX::cvta_shared_cluster_64;
+ break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
break;
@@ -1004,6 +1013,12 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
break;
+ case ADDRESS_SPACE_SHARED_CLUSTER:
+ if (!TM.is64Bit())
+ report_fatal_error(
+ "Shared cluster address space is only supported in 64-bit mode");
+ Opc = NVPTX::cvta_to_shared_cluster_64;
+ break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 49f4f30096f00..18baf1f338023 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3043,8 +3043,27 @@ SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
unsigned SrcAS = N->getSrcAddressSpace();
unsigned DestAS = N->getDestAddressSpace();
if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
- DestAS != llvm::ADDRESS_SPACE_GENERIC)
+ DestAS != llvm::ADDRESS_SPACE_GENERIC) {
+ // Shared and SharedCluster can be converted to each other through generic
+ // space
+ if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
+ DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) ||
+ (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
+ DestAS == llvm::ADDRESS_SPACE_SHARED)) {
+ SDLoc DL(Op.getNode());
+ const MVT GenerictVT =
+ getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
+ SDValue GenericConversion = DAG.getAddrSpaceCast(
+ DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
+ SDValue SharedClusterConversion =
+ DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
+ ADDRESS_SPACE_GENERIC, DestAS);
+ return SharedClusterConversion;
+ }
+
return DAG.getUNDEF(Op.getValueType());
+ }
+
return Op;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index ee6380a8a89c4..043da14bcb236 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -137,6 +137,7 @@ def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def hasClusters : Predicate<"Subtarget->hasClusters()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4ba3e6f06bb5f..a6595e512dbae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -33,6 +33,9 @@ def AS_match {
code shared = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
}];
+ code shared_cluster = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED_CLUSTER);
+ }];
code global = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
}];
@@ -2039,10 +2042,11 @@ class ATOMIC_GLOBAL_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.global>;
class ATOMIC_SHARED_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.shared>;
+class ATOMIC_SHARED_CLUSTER_CHK <dag frag>
+ : PatFrag<!setdagop(frag, ops), frag, AS_match.shared_cluster>;
class ATOMIC_GENERIC_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.generic>;
-
multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
SDPatternOperator op, list<Predicate> preds> {
defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;";
@@ -2094,6 +2098,7 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defvar frag_pat = (frag node:$a, node:$b);
defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
@@ -2101,6 +2106,7 @@ multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, st
defvar frag_pat = (frag node:$a, node:$b, node:$c);
defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
@@ -2381,18 +2387,22 @@ def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
-multiclass NG_TO_G<string Str> {
- def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "cvta." # Str # ".u32 \t$result, $src;", []>;
- def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "cvta." # Str # ".u64 \t$result, $src;", []>;
+multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
+ if Supports32 then
+ def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "cvta." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
+
+ def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "cvta." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
}
-multiclass G_TO_NG<string Str> {
- def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "cvta.to." # Str # ".u32 \t$result, $src;", []>;
- def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "cvta.to." # Str # ".u64 \t$result, $src;", []>;
+multiclass G_TO_NG<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
+ if Supports32 then
+ def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "cvta.to." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
+
+ def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
}
foreach space = ["local", "shared", "global", "const", "param"] in {
@@ -2400,6 +2410,9 @@ foreach space = ["local", "shared", "global", "const", "param"] in {
defm cvta_to_#space : G_TO_NG<space>;
}
+defm cvta_shared_cluster : NG_TO_G<"shared::cluster", false, [hasClusters]>;
+defm cvta_to_shared_cluster : G_TO_NG<"shared::cluster", false, [hasClusters]>;
+
def : Pat<(int_nvvm_ptr_param_to_gen i32:$src),
(cvta_param $src)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index a4c3b43aec9f2..1a7b20390a562 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -117,13 +117,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
std::string Ret = "e";
- if (!is64Bit)
- Ret += "-p:32:32";
- else if (UseShortPointers)
- Ret += "-p3:32:32-p4:32:32-p5:32:32";
-
// Tensor Memory (addrspace:6) is always 32-bits.
- Ret += "-p6:32:32";
+ // Distributed Shared Memory (addrspace:7) follows shared memory
+ // (addrspace:3).
+ if (!is64Bit)
+ Ret += "-p:32:32-p6:32:32-p7:32:32";
+ else if (UseShortPointers) {
+ Ret += "-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32";
+ } else
+ Ret += "-p6:32:32";
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
@@ -280,8 +282,10 @@ NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
case Intrinsic::nvvm_isspacep_local:
return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
case Intrinsic::nvvm_isspacep_shared:
- case Intrinsic::nvvm_isspacep_shared_cluster:
return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
+ case Intrinsic::nvvm_isspacep_shared_cluster:
+ return std::make_pair(II->getArgOperand(0),
+ llvm::ADDRESS_SPACE_SHARED_CLUSTER);
default:
break;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 5e5362b0461d0..66c5139f8c2cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -424,12 +424,13 @@ static std::optional<bool> evaluateIsSpace(Intrinsic::ID IID, unsigned AS) {
case Intrinsic::nvvm_isspacep_local:
return AS == NVPTXAS::ADDRESS_SPACE_LOCAL;
case Intrinsic::nvvm_isspacep_shared:
+ // If shared cluster this can't be evaluated at compile time.
+ if (AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)
+ return std::nullopt;
return AS == NVPTXAS::ADDRESS_SPACE_SHARED;
case Intrinsic::nvvm_isspacep_shared_cluster:
- // We can't tell shared from shared_cluster at compile time from AS alone,
- // but it can't be either is AS is not shared.
- return AS == NVPTXAS::ADDRESS_SPACE_SHARED ? std::nullopt
- : std::optional{false};
+ return AS == NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER ||
+ AS == NVPTXAS::ADDRESS_SPACE_SHARED;
case Intrinsic::nvvm_isspacep_const:
return AS == NVPTXAS::ADDRESS_SPACE_CONST;
default:
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 70bf02035fd48..a1b4a0e5e7471 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -168,6 +168,8 @@ inline std::string AddressSpaceToString(AddressSpace A) {
return "const";
case AddressSpace::Shared:
return "shared";
+ case AddressSpace::SharedCluster:
+ return "shared::cluster";
case AddressSpace::Param:
return "param";
case AddressSpace::Local:
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll
new file mode 100644
index 0000000000000..afd0a7fded64e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s -check-prefixes=NOPTRCONV
+; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | FileCheck %s -check-prefixes=PTRCONV
+; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify %}
+; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 --nvptx-short-ptr | %ptxas-verify %}
+
+; ALL-LABEL: conv_shared_cluster_to_generic
+define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
+; CLS32: cvta.shared::cluster.u32
+; NOPTRCONV-LABEL: conv_shared_cluster_to_generic(
+; NOPTRCONV: {
+; NOPTRCONV-NEXT: .reg .b32 %r<2>;
+; NOPTRCONV-NEXT: .reg .b64 %rd<3>;
+; NOPTRCONV-EMPTY:
+; NOPTRCONV-NEXT: // %bb.0:
+; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_generic_param_0];
+; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
+; NOPTRCONV-NEXT: ld.u32 %r1, [%rd2];
+; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
+; NOPTRCONV-NEXT: ret;
+;
+; PTRCONV-LABEL: conv_shared_cluster_to_generic(
+; PTRCONV: {
+; PTRCONV-NEXT: .reg .b32 %r<3>;
+; PTRCONV-NEXT: .reg .b64 %rd<3>;
+; PTRCONV-EMPTY:
+; PTRCONV-NEXT: // %bb.0:
+; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_generic_param_0];
+; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1;
+; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
+; PTRCONV-NEXT: ld.u32 %r2, [%rd2];
+; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2;
+; PTRCONV-NEXT: ret;
+ %genptr = addrspacecast ptr addrspace(7) %ptr to ptr
+ %val = load i32, ptr %genptr
+ ret i32 %val
+}
+
+; ALL-LABEL: conv_generic_to_shared_cluster
+define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
+; CLS32: cvta.to.shared::cluster.u32
+; NOPTRCONV-LABEL: conv_generic_to_shared_cluster(
+; NOPTRCONV: {
+; NOPTRCONV-NEXT: .reg .b32 %r<2>;
+; NOPTRCONV-NEXT: .reg .b64 %rd<3>;
+; NOPTRCONV-EMPTY:
+; NOPTRCONV-NEXT: // %bb.0:
+; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0];
+; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1;
+; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd2];
+; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
+; NOPTRCONV-NEXT: ret;
+;
+; PTRCONV-LABEL: conv_generic_to_shared_cluster(
+; PTRCONV: {
+; PTRCONV-NEXT: .reg .b32 %r<3>;
+; PTRCONV-NEXT: .reg .b64 %rd<3>;
+; PTRCONV-EMPTY:
+; PTRCONV-NEXT: // %bb.0:
+; PTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0];
+; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1;
+; PTRCONV-NEXT: cvt.u32.u64 %r1, %rd2;
+; PTRCONV-NEXT: ld.shared::cluster.u32 %r2, [%r1];
+; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2;
+; PTRCONV-NEXT: ret;
+ %specptr = addrspacecast ptr %ptr to ptr addrspace(7)
+ %val = load i32, ptr addrspace(7) %specptr
+ ret i32 %val
+}
+
+; ALL-LABEL: conv_shared_to_shared_cluster
+define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
+; NOPTRCONV-LABEL: conv_shared_to_shared_cluster(
+; NOPTRCONV: {
+; NOPTRCONV-NEXT: .reg .b32 %r<2>;
+; NOPTRCONV-NEXT: .reg .b64 %rd<4>;
+; NOPTRCONV-EMPTY:
+; NOPTRCONV-NEXT: // %bb.0:
+; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_to_shared_cluster_param_0];
+; NOPTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1;
+; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2;
+; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd3];
+; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
+; NOPTRCONV-NEXT: ret;
+;
+; PTRCONV-LABEL: conv_shared_to_shared_cluster(
+; PTRCONV: {
+; PTRCONV-NEXT: .reg .b32 %r<4>;
+; PTRCONV-NEXT: .reg .b64 %rd<4>;
+; PTRCONV-EMPTY:
+; PTRCONV-NEXT: // %bb.0:
+; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_to_shared_cluster_param_0];
+; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1;
+; PTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1;
+; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2;
+; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTRCONV-NEXT: ld.shared::cluster.u32 %r3, [%r2];
+; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3;
+; PTRCONV-NEXT: ret;
+ %specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7)
+ %val = load i32, ptr addrspace(7) %specptr
+ ret i32 %val
+}
+
+; ALL-LABEL: conv_shared_cluster_to_shared
+define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
+; NOPTRCONV-LABEL: conv_shared_cluster_to_shared(
+; NOPTRCONV: {
+; NOPTRCONV-NEXT: .reg .b32 %r<2>;
+; NOPTRCONV-NEXT: .reg .b64 %rd<4>;
+; NOPTRCONV-EMPTY:
+; NOPTRCONV-NEXT: // %bb.0:
+; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_shared_param_0];
+; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
+; NOPTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2;
+; NOPTRCONV-NEXT: ld.shared.u32 %r1, [%rd3];
+; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
+; NOPTRCONV-NEXT: ret;
+;
+; PTRCONV-LABEL: conv_shared_cluster_to_shared(
+; PTRCONV: {
+; PTRCONV-NEXT: .reg .b32 %r<4>;
+; PTRCONV-NEXT: .reg .b64 %rd<4>;
+; PTRCONV-EMPTY:
+; PTRCONV-NEXT: // %bb.0:
+; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_shared_param_0];
+; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1;
+; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
+; PTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2;
+; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTRCONV-NEXT: ld.shared.u32 %r3, [%r2];
+; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3;
+; PTRCONV-NEXT: ret;
+ %specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3)
+ %val = load i32, ptr addrspace(3) %specptr
+ ret i32 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
new file mode 100644
index 0000000000000..8b6c554aeb9f2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -o - -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.0 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+; Floating point atomic operations tests
+define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
+; CHECK-LABEL: test_distributed_shared_cluster_float_atomic(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
+; CHECK-NEXT: mov.b16 %rs1, 0x3C00;
+; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
+; CHECK-NEXT: atom.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3;
+; CHECK-NEXT: atom.shared::cluster.add.f32 %f1, [%rd1], 0f3F800000;
+; CHECK-NEXT: atom.shared::cluster.add.f64 %fd1, [%rd1], 0d3FF0000000000000;
+; CHECK-NEXT: ret;
+entry:
+ ; Floating point atomic operations
+ %0 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, half 1.000000e+00 seq_cst
+ %1 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, bfloat 1.000000e+00 seq_cst
+ %2 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, float 1.000000e+00 seq_cst
+ %3 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, double 1.000000e+00 seq_cst
+
+ ret void
+}
+
+; Integer atomic operations tests
+define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
+; CHECK-LABEL: test_distributed_shared_cluster_int_atomic(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
+; CHECK-NEXT: atom.shared::cluster.add.u32 %r1, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.add.u64 %rd2, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.exch.b32 %r2, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.exch.b64 %rd3, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.min.s32 %r3, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.min.s64 %rd4, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.min.u32 %r4, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.min.u64 %rd5, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.max.s32 %r5, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.max.s64 %rd6, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.max.u32 %r6, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.max.u64 %rd7, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.inc.u32 %r7, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.dec.u32 %r8, [%rd1], 1;
+; CHECK-NEXT: ret;
+entry:
+ ; Integer add operations
+ %0 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %1 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+
+ ; Exchange operations
+ %2 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %3 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+
+ ; Min operations (signed and unsigned)
+ %4 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %5 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+ %6 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %7 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+
+ ; Max operations (signed and unsigned)
+ %8 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %9 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+ %10 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %11 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+
+ ; Inc/Dec operations (32-bit only)
+ %12 = atomicrmw uinc_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %13 = atomicrmw udec_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+
+ ret void
+}
+
+; Bitwise atomic operations tests
+define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
+; CHECK-LABEL: test_distributed_shared_cluster_bitwise_atomic(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
+; CHECK-NEXT: atom.shared::cluster.and.b32 %r1, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.and.b64 %rd2, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.or.b32 %r2, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.or.b64 %rd3, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.xor.b32 %r3, [%rd1], 1;
+; CHECK-NEXT: atom.shared::cluster.xor.b64 %rd4, [%rd1], 1;
+; CHECK-NEXT: ret;
+entry:
+ ; Bitwise operations
+ %0 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %1 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+ %2 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %3 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+ %4 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
+ %5 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
+
+ ret void
+}
+
+; Compare-exchange operations tests
+define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
+; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<11>;
+; CHECK-NEXT: .reg .b32 %r<53>;
+; CHECK-NEXT: .reg .b64 %rd<12>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
+; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0;
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0;
+; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0;
+; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0;
+; CHECK-NEXT: and.b64 %rd1, %rd2, -4;
+; CHECK-NEXT: cvt.u32.u64 %r33, %rd2;
+; CHECK-NEXT: and.b32 %r34, %r33, 3;
+; CHECK-NEXT: shl.b32 %r1, %r34, 3;
+; CHECK-NEXT: mov.b32 %r35, 65535;
+; CHECK-NEXT: shl.b32 %r36, %r35, %r1;
+; CHECK-NEXT: not.b32 %r2, %r36;
+; CHECK-NEXT: mov.b32 %r37, 1;
+; CHECK-NEXT: shl.b32 %r3, %r37, %r1;
+; CHECK-NEXT: ld.shared::cluster.u32 %r38, [%rd1];
+; CHECK-NEXT: and.b32 %r48, %r38, %r2;
+; CHECK-NEXT: $L__BB3_1: // %partword.cmpxchg.loop33
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: or.b32 %r39, %r48, %r3;
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48;
+; CHECK-NEXT: setp.eq.s32 %p1, %r6, %r39;
+; CHECK-NEXT: @%p1 bra $L__BB3_3;
+; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32
+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: and.b32 %r7, %r6, %r2;
+; CHECK-NEXT: setp.ne.s32 %p2, %r48, %r7;
+; CHECK-NEXT: mov.b32 %r48, %r7;
+; CHECK-NEXT: @%p2 bra $L__BB3_1;
+; CHECK-NEXT: $L__BB3_3: // %partword.cmpxchg.end31
+; CHECK-NEXT: ld.shared::cluster.u32 %r40, [%rd1];
+; CHECK-NEXT: and.b32 %r49, %r40, %r2;
+; CHECK-NEXT: $L__BB3_4: // %partword.cmpxchg.loop23
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: or.b32 %r41, %r49, %r3;
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49;
+; CHECK-NEXT: setp.eq.s32 %p3, %r10, %r41;
+; CHECK-NEXT: @%p3 bra $L__BB3_6;
+; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22
+; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT: and.b32 %r11, %r10, %r2;
+; CHECK-NEXT: setp.ne.s32 %p4, %r49, %r11;
+; CHECK-NEXT: mov.b32 %r49, %r11;
+; CHECK-NEXT: @%p4 bra $L__BB3_4;
+; CHECK-NEXT: $L__BB3_6: // %partword.cmpxchg.end21
+; CHECK-NEXT: fence.acq_rel.sys;
+; CHECK-NEXT: fence.acq_rel.sys;
+; CHECK-NEXT: ld.shared::cluster.u32 %r42, [%rd1];
+; CHECK-NEXT: and.b32 %r50, %r42, %r2;
+; CHECK-NEXT: $L__BB3_7: // %partword.cmpxchg.loop13
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: or.b32 %r43, %r50, %r3;
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50;
+; CHECK-NEXT: setp.eq.s32 %p5, %r14, %r43;
+; CHECK-NEXT: @%p5 bra $L__BB3_9;
+; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12
+; CHECK-NEXT: // in Loop: Header=BB3_7 Depth=1
+; CHECK-NEXT: and.b32 %r15, %r14, %r2;
+; CHECK-NEXT: setp.ne.s32 %p6, %r50, %r15;
+; CHECK-NEXT: mov.b32 %r50, %r15;
+; CHECK-NEXT: @%p6 bra $L__BB3_7;
+; CHECK-NEXT: $L__BB3_9: // %partword.cmpxchg.end11
+; CHECK-NEXT: fence.acq_rel.sys;
+; CHECK-NEXT: ld.shared::cluster.u32 %r44, [%rd1];
+; CHECK-NEXT: and.b32 %r51, %r44, %r2;
+; CHECK-NEXT: $L__BB3_10: // %partword.cmpxchg.loop3
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: or.b32 %r45, %r51, %r3;
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51;
+; CHECK-NEXT: setp.eq.s32 %p7, %r18, %r45;
+; CHECK-NEXT: @%p7 bra $L__BB3_12;
+; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2
+; CHECK-NEXT: // in Loop: Header=BB3_10 Depth=1
+; CHECK-NEXT: and.b32 %r19, %r18, %r2;
+; CHECK-NEXT: setp.ne.s32 %p8, %r51, %r19;
+; CHECK-NEXT: mov.b32 %r51, %r19;
+; CHECK-NEXT: @%p8 bra $L__BB3_10;
+; CHECK-NEXT: $L__BB3_12: // %partword.cmpxchg.end1
+; CHECK-NEXT: fence.acq_rel.sys;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: ld.shared::cluster.u32 %r46, [%rd1];
+; CHECK-NEXT: and.b32 %r52, %r46, %r2;
+; CHECK-NEXT: $L__BB3_13: // %partword.cmpxchg.loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: or.b32 %r47, %r52, %r3;
+; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52;
+; CHECK-NEXT: setp.eq.s32 %p9, %r22, %r47;
+; CHECK-NEXT: @%p9 bra $L__BB3_15;
+; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure
+; CHECK-NEXT: // in Loop: Header=BB3_13 Depth=1
+; CHECK-NEXT: and.b32 %r23, %r22, %r2;
+; CHECK-NEXT: setp.ne.s32 %p10, %r52, %r23;
+; CHECK-NEXT: mov.b32 %r52, %r23;
+; CHECK-NEXT: @%p10 bra $L__BB3_13;
+; CHECK-NEXT: $L__BB3_15: // %partword.cmpxchg.end
+; CHECK-NEXT: fence.acq_rel.sys;
+; CHECK-NEXT: ret;
+entry:
+ ; Compare-exchange operation - all memory ordering combinations for 32-bit
+ %0 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 monotonic monotonic
+ %1 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire monotonic
+ %2 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire acquire
+ %3 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 release monotonic
+ %4 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel monotonic
+ %5 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel acquire
+ %6 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst monotonic
+ %7 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst acquire
+ %8 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst seq_cst
+
+ ; Compare-exchange operation - all memory ordering combinations for 64-bit
+ %9 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 monotonic monotonic
+ %10 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire monotonic
+ %11 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire acquire
+ %12 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 release monotonic
+ %13 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel monotonic
+ %14 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel acquire
+ %15 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst monotonic
+ %16 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst acquire
+ %17 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst seq_cst
+
+ ; Compare-exchange operation - 16-bit
+ %18 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 monotonic monotonic
+ %19 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acquire acquire
+ %20 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 release monotonic
+ %21 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acq_rel acquire
+ %22 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 seq_cst seq_cst
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/nvptx-aa.ll b/llvm/test/CodeGen/NVPTX/nvptx-aa.ll
index 074e741dc3e94..924220326c341 100644
--- a/llvm/test/CodeGen/NVPTX/nvptx-aa.ll
+++ b/llvm/test/CodeGen/NVPTX/nvptx-aa.ll
@@ -18,25 +18,33 @@ target triple = "nvptx64-nvidia-cuda"
; CHECK-ALIAS: NoAlias: i8 addrspace(1)* %global, i8 addrspace(5)* %local
; CHECK-ALIAS: NoAlias: i8 addrspace(5)* %local, i8 addrspace(3)* %shared
; CHECK-ALIAS: NoAlias: i8 addrspace(4)* %const, i8 addrspace(5)* %local
+; CHECK-ALIAS: MayAlias: i8* %gen, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: NoAlias: i8 addrspace(1)* %global, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: MayAlias: i8 addrspace(3)* %shared, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: NoAlias: i8 addrspace(4)* %const, i8 addrspace(7)* %shared_cluster
+; CHECK-ALIAS: NoAlias: i8 addrspace(5)* %local, i8 addrspace(7)* %shared_cluster
; CHECK-ALIAS: MayAlias: i8* %gen, i8 addrspace(101)* %param
; CHECK-ALIAS: NoAlias: i8 addrspace(1)* %global, i8 addrspace(101)* %param
; CHECK-ALIAS: NoAlias: i8 addrspace(101)* %param, i8 addrspace(3)* %shared
; CHECK-ALIAS: NoAlias: i8 addrspace(4)* %const, i8 addrspace(101)* %param
; CHECK-ALIAS: NoAlias: i8 addrspace(5)* %local, i8 addrspace(101)* %param
+; CHECK-ALIAS: NoAlias: i8 addrspace(101)* %param, i8 addrspace(7)* %shared_cluster
-define i8 @test_alias(ptr %gen, ptr addrspace(1) %global, ptr addrspace(3) %shared, ptr addrspace(4) %const, ptr addrspace(5) %local) {
+define i8 @test_alias(ptr %gen, ptr addrspace(1) %global, ptr addrspace(3) %shared, ptr addrspace(4) %const, ptr addrspace(5) %local, ptr addrspace(7) %shared_cluster) {
%param = addrspacecast ptr %gen to ptr addrspace(101)
%v1 = load i8, ptr %gen
%v2 = load i8, ptr addrspace(1) %global
%v3 = load i8, ptr addrspace(3) %shared
%v4 = load i8, ptr addrspace(4) %const
%v5 = load i8, ptr addrspace(5) %local
- %v6 = load i8, ptr addrspace(101) %param
+ %v6 = load i8, ptr addrspace(7) %shared_cluster
+ %v7 = load i8, ptr addrspace(101) %param
%res1 = add i8 %v1, %v2
%res2 = add i8 %res1, %v3
%res3 = add i8 %res2, %v4
%res4 = add i8 %res3, %v5
%res5 = add i8 %res4, %v6
+ %res6 = add i8 %res4, %v7
ret i8 %res5
}
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
index 348fa688770df..d05e106d81342 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/isspacep.ll
@@ -114,24 +114,21 @@ entry:
ret i1 %val
}
-define i1 @test_isspacep_cluster_shared_unsure(ptr addrspace(3) %addr) {
-; CHECK-LABEL: define i1 @test_isspacep_cluster_shared_unsure(
-; CHECK-SAME: ptr addrspace(3) [[ADDR:%.*]]) {
+define i1 @test_isspacep_shared_cluster_true(ptr addrspace(7) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_shared_cluster_true(
+; CHECK-SAME: ptr addrspace(7) [[ADDR:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr i8, ptr addrspace(3) [[ADDR]], i32 10
-; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[ADDR1]] to ptr
-; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr [[TMP0]])
-; CHECK-NEXT: ret i1 [[VAL]]
+; CHECK-NEXT: ret i1 true
;
entry:
- %addr0 = addrspacecast ptr addrspace(3) %addr to ptr
+ %addr0 = addrspacecast ptr addrspace(7) %addr to ptr
%addr1 = getelementptr i8, ptr %addr0, i32 10
%val = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %addr1)
ret i1 %val
}
-define i1 @test_isspacep_cluster_shared_false(ptr addrspace(1) %addr) {
-; CHECK-LABEL: define i1 @test_isspacep_cluster_shared_false(
+define i1 @test_isspacep_shared_cluster_false(ptr addrspace(1) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_shared_cluster_false(
; CHECK-SAME: ptr addrspace(1) [[ADDR:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret i1 false
@@ -142,3 +139,34 @@ entry:
%val = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %addr1)
ret i1 %val
}
+
+; llvm.nvvm.isspacep.shared.cluster also folds to true for a plain shared (addrspace 3) pointer
+define i1 @test_isspacep_cluster_shared_shared(ptr addrspace(3) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_cluster_shared_shared(
+; CHECK-SAME: ptr addrspace(3) [[ADDR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %addr0 = addrspacecast ptr addrspace(3) %addr to ptr
+ %addr1 = getelementptr i8, ptr %addr0, i32 10
+ %val = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %addr1)
+ ret i1 %val
+}
+
+; a shared::cluster (addrspace 7) pointer cannot be proven to be plain shared at compile time, so isspacep.shared is kept as a runtime call
+define i1 @test_isspacep_shared_shared_cluster(ptr addrspace(7) %addr) {
+; CHECK-LABEL: define i1 @test_isspacep_shared_shared_cluster(
+; CHECK-SAME: ptr addrspace(7) [[ADDR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ADDR2:%.*]] = getelementptr i8, ptr addrspace(7) [[ADDR]], i32 10
+; CHECK-NEXT: [[ADDR1:%.*]] = addrspacecast ptr addrspace(7) [[ADDR2]] to ptr
+; CHECK-NEXT: [[VAL:%.*]] = call i1 @llvm.nvvm.isspacep.shared(ptr [[ADDR1]])
+; CHECK-NEXT: ret i1 [[VAL]]
+;
+entry:
+ %addr0 = addrspacecast ptr addrspace(7) %addr to ptr
+ %addr1 = getelementptr i8, ptr %addr0, i32 10
+ %val = call i1 @llvm.nvvm.isspacep.shared(ptr %addr1)
+ ret i1 %val
+}
More information about the llvm-commits
mailing list