[llvm-branch-commits] [mlir] [mlir][GPU] Plumb range information through the NVVM lowterings (PR #107659)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Sep 6 17:01:27 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-mlir-llvm
Author: Krzysztof Drewniak (krzysz00)
<details>
<summary>Changes</summary>
Update the GPU to NVVM lowerings to correctly propagate range
information on IDs and dimension queries, either from
known_{block,grid}_size attributes or from `upperBound` annotations on
the operations themselves.
---
Patch is 37.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107659.diff
6 Files Affected:
- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+149-134)
- (modified) mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp (+34-16)
- (modified) mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp (+1)
- (modified) mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir (+15-3)
- (modified) mlir/test/Target/LLVMIR/Import/nvvmir.ll (+3)
- (modified) mlir/test/Target/LLVMIR/nvvmir.mlir (+5-2)
``````````diff
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 709dd922b8fa2f..66ac9f289d233b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -86,8 +86,8 @@ class NVVM_Op<string mnemonic, list<Trait> traits = []> :
LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
}
-/// Base class that defines BasicPtxBuilderOpInterface.
-class NVVM_PTXBuilder_Op<string mnemonic,
+/// Base class that defines BasicPtxBuilderOpInterface.
+class NVVM_PTXBuilder_Op<string mnemonic,
list<Trait> traits = [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]> :
LLVM_OpBase<NVVM_Dialect, mnemonic, traits> {
}
@@ -123,52 +123,67 @@ class NVVM_SpecialRegisterOp<string mnemonic, list<Trait> traits = []> :
let assemblyFormat = "attr-dict `:` type($res)";
}
+class NVVM_SpecialRangeableRegisterOp<string mnemonic, list<Trait> traits = []> :
+ NVVM_SpecialRegisterOp<mnemonic, traits> {
+ let arguments = (ins OptionalAttr<LLVM_ConstantRangeAttr>:$range);
+ let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
+ let llvmBuilder = baseLlvmBuilder # setRangeRetAttrCode # baseLlvmBuilderCoda;
+ let mlirBuilder = baseMlirBuilder # importRangeRetAttrCode # baseMlirBuilderCoda;
+
+ // Backwards-compatibility builder for an unspecified range.
+ let builders = [
+ OpBuilder<(ins "Type":$resultType), [{
+ build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{});
+ }]>
+ ];
+}
+
//===----------------------------------------------------------------------===//
// Lane index and range
-def NVVM_LaneIdOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.laneid">;
-def NVVM_WarpSizeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.warpsize">;
+def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">;
+def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">;
//===----------------------------------------------------------------------===//
// Thread index and range
-def NVVM_ThreadIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.x">;
-def NVVM_ThreadIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.y">;
-def NVVM_ThreadIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.z">;
-def NVVM_BlockDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.x">;
-def NVVM_BlockDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.y">;
-def NVVM_BlockDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.z">;
+def NVVM_ThreadIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">;
+def NVVM_ThreadIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">;
+def NVVM_ThreadIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">;
+def NVVM_BlockDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">;
+def NVVM_BlockDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">;
+def NVVM_BlockDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">;
//===----------------------------------------------------------------------===//
// Block index and range
-def NVVM_BlockIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.x">;
-def NVVM_BlockIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.y">;
-def NVVM_BlockIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.z">;
-def NVVM_GridDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.x">;
-def NVVM_GridDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.y">;
-def NVVM_GridDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.z">;
+def NVVM_BlockIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">;
+def NVVM_BlockIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">;
+def NVVM_BlockIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">;
+def NVVM_GridDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">;
+def NVVM_GridDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">;
+def NVVM_GridDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">;
//===----------------------------------------------------------------------===//
// CTA Cluster index and range
-def NVVM_ClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.x">;
-def NVVM_ClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.y">;
-def NVVM_ClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.z">;
-def NVVM_ClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.x">;
-def NVVM_ClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.y">;
-def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">;
+def NVVM_ClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x">;
+def NVVM_ClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">;
+def NVVM_ClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">;
+def NVVM_ClusterDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">;
+def NVVM_ClusterDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">;
+def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">;
//===----------------------------------------------------------------------===//
// CTA index and range within Cluster
-def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">;
-def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
-def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
-def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
-def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
-def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
+def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x">;
+def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
+def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
+def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
+def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
+def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
//===----------------------------------------------------------------------===//
// CTA index and across Cluster dimensions
-def NVVM_ClusterId : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctarank">;
-def NVVM_ClusterDim : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctarank">;
+def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank">;
+def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
//===----------------------------------------------------------------------===//
// Clock registers
@@ -197,11 +212,11 @@ def ReduxKindMin : I32EnumAttrCase<"MIN", 4, "min">;
def ReduxKindOr : I32EnumAttrCase<"OR", 5, "or">;
def ReduxKindUmax : I32EnumAttrCase<"UMAX", 6, "umax">;
def ReduxKindUmin : I32EnumAttrCase<"UMIN", 7, "umin">;
-def ReduxKindXor : I32EnumAttrCase<"XOR", 8, "xor">;
+def ReduxKindXor : I32EnumAttrCase<"XOR", 8, "xor">;
/// Enum attribute of the different kinds.
def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind",
- [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr,
+ [ReduxKindAdd, ReduxKindAnd, ReduxKindMax, ReduxKindMin, ReduxKindOr,
ReduxKindUmax, ReduxKindUmin, ReduxKindXor]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
@@ -221,7 +236,7 @@ def NVVM_ReduxOp :
}];
let assemblyFormat = [{
$kind $val `,` $mask_and_clamp attr-dict `:` type($val) `->` type($res)
- }];
+ }];
}
//===----------------------------------------------------------------------===//
@@ -308,7 +323,7 @@ def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.
let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)";
}
-def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
+def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">,
Arguments<(ins LLVM_AnyPointer:$addr, I32:$txcount, PtxPredicate:$predicate)> {
let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
let extraClassDefinition = [{
@@ -316,16 +331,16 @@ def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_t
}];
}
-def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,
- Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {
+def NVVM_MBarrierArriveExpectTxSharedOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx.shared">,
+ Arguments<(ins LLVM_PointerShared:$addr, I32:$txcount, PtxPredicate:$predicate)> {
let assemblyFormat = "$addr `,` $txcount (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)";
let extraClassDefinition = [{
std::string $cppClass::getPtx() { return std::string("mbarrier.arrive.expect_tx.shared.b64 _, [%0], %1;"); }
}];
}
-def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
- Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {
+def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity">,
+ Arguments<(ins LLVM_AnyPointer:$addr, I32:$phase, I32:$ticks)> {
let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
let extraClassDefinition = [{
std::string $cppClass::getPtx() {
@@ -338,13 +353,13 @@ def NVVM_MBarrierTryWaitParityOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity"
"bra.uni LAB_WAIT; \n\t"
"DONE: \n\t"
"}"
- );
+ );
}
}];
}
-def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,
- Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {
+def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.parity.shared">,
+ Arguments<(ins LLVM_PointerShared:$addr, I32:$phase, I32:$ticks)> {
let assemblyFormat = "$addr `,` $phase `,` $ticks attr-dict `:` type(operands)";
let extraClassDefinition = [{
std::string $cppClass::getPtx() {
@@ -357,7 +372,7 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p
"bra.uni LAB_WAIT; \n\t"
"DONE: \n\t"
"}"
- );
+ );
}
}];
}
@@ -392,7 +407,7 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
}
def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
- let arguments = (ins
+ let arguments = (ins
Optional<I32>:$barrierId,
Optional<I32>:$numberOfThreads);
string llvmBuilder = [{
@@ -401,7 +416,7 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
{$barrierId, $numberOfThreads});
} else if($barrierId) {
createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_n,
- {$barrierId});
+ {$barrierId});
} else {
createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier0);
}
@@ -410,27 +425,27 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict";
}
-def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
+def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
{
let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads);
let description = [{
- Thread that executes this op announces their arrival at the barrier with
+ Thread that executes this op announces their arrival at the barrier with
given id and continue their execution.
- The default barrier id is 0 that is similar to `nvvm.barrier` Op. When
- `barrierId` is not present, the default barrier id is used.
+ The default barrier id is 0 that is similar to `nvvm.barrier` Op. When
+ `barrierId` is not present, the default barrier id is used.
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
}];
-
+
let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
let extraClassDefinition = [{
std::string $cppClass::getPtx() {
std::string ptx = "bar.arrive ";
- if (getBarrierId()) { ptx += "%0, %1;"; }
+ if (getBarrierId()) { ptx += "%0, %1;"; }
else { ptx += "0, %0;"; }
return ptx;
}
@@ -553,7 +568,7 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
-
+
let assemblyFormat = "attr-dict";
let extraClassDefinition = [{
std::string $cppClass::getPtx() {
@@ -671,9 +686,9 @@ def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
-
+
let assemblyFormat = "attr-dict";
- let extraClassDefinition = [{
+ let extraClassDefinition = [{
std::string $cppClass::getPtx() {
return std::string("fence.mbarrier_init.release.cluster;");
}
@@ -749,13 +764,13 @@ def NVVM_SyncWarpOp :
}
-def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
+def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
[DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>
-{
+{
let results = (outs I1:$pred);
- let assemblyFormat = "attr-dict `->` type(results)";
- let extraClassDefinition = [{
- std::string $cppClass::getPtx() {
+ let assemblyFormat = "attr-dict `->` type(results)";
+ let extraClassDefinition = [{
+ std::string $cppClass::getPtx() {
return std::string(
"{ \n"
".reg .u32 rx; \n"
@@ -764,7 +779,7 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync",
" elect.sync rx | px, 0xFFFFFFFF;\n"
"@px mov.pred %0, 1; \n"
"}\n"
- );
+ );
}
}];
}
@@ -776,16 +791,16 @@ def LoadCacheModifierLU : I32EnumAttrCase<"LU", 3, "lu">;
def LoadCacheModifierCV : I32EnumAttrCase<"CV", 4, "cv">;
/// Enum attribute of the different kinds.
-def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind",
+def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind",
"NVVM load cache modifier kind",
- [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS,
+ [LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS,
LoadCacheModifierLU, LoadCacheModifierCV]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
let description = [{
Enum attribute of the different kinds of cache operators for load instructions.
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#id62)
}];
}
@@ -811,7 +826,7 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;
else if($modifier == NVVM::LoadCacheModifierKind::CA)
id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
- else
+ else
llvm_unreachable("unsupported cache modifier");
break;
default:
@@ -824,21 +839,21 @@ def NVVM_CpAsyncOp : NVVM_PTXBuilder_Op<"cp.async.shared.global">,
let extraClassDeclaration = [{
bool hasIntrinsic() { if(getCpSize()) return false; return true; }
- void getAsmValues(RewriterBase &rewriter,
+ void getAsmValues(RewriterBase &rewriter,
llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues) {
asmValues.push_back({getDst(), PTXRegisterMod::Read});
asmValues.push_back({getSrc(), PTXRegisterMod::Read});
asmValues.push_back({makeConstantI32(rewriter, getSize()), PTXRegisterMod::Read});
asmValues.push_back({getCpSize(), PTXRegisterMod::Read});
- }
+ }
}];
- let extraClassDefinition = [{
- std::string $cppClass::getPtx() {
+ let extraClassDefinition = [{
+ std::string $cppClass::getPtx() {
if(getModifier() == NVVM::LoadCacheModifierKind::CG)
return std::string("cp.async.cg.shared.global [%0], [%1], %2, %3;\n");
if(getModifier() == NVVM::LoadCacheModifierKind::CA)
return std::string("cp.async.ca.shared.global [%0], [%1], %2, %3;\n");
- llvm_unreachable("unsupported cache modifier");
+ llvm_unreachable("unsupported cache modifier");
}
}];
}
@@ -1526,9 +1541,9 @@ def NVVM_WMMAMmaOp : NVVM_Op<"wmma.mma">,
let hasVerifier = 1;
}
-def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
- Arguments<(ins LLVM_PointerShared:$ptr,
- Variadic<I32>:$sources,
+def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
+ Arguments<(ins LLVM_PointerShared:$ptr,
+ Variadic<I32>:$sources,
MMALayoutAttr:$layout)> {
let summary = "cooperative matrix store";
let description = [{
@@ -1537,7 +1552,7 @@ def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
}];
-
+
let assemblyFormat = "$ptr `,` $sources attr-dict `:` type(operands)";
let extraClassDefinition = [{
std::string $cppClass::getPtx() {
@@ -1757,25 +1772,25 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">,
}
def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
- Arguments<(ins
- ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group,
+ Arguments<(ins
+ ConfinedAttr<I32Attr, [IntMinValue<0>]>:$group,
OptionalAttr<UnitAttr>:$read)> {
let assemblyFormat = "$group attr-dict";
let description = [{
Op waits for completion of the most recent bulk async-groups.
The `$group` operand tells waiting has to be done until for $group or fewer
- of the most recent bulk async-groups. If `$group` is 0, the op wait until
+ of the most recent bulk async-groups. If `$group` is 0, the op wait until
all the most recent bulk async-groups have completed.
- The `$read` indicates that the waiting has to be done until all the bulk
- async operations in the specified bulk async-group have completed reading
+ The `$read` indicates that the waiting has to be done until all the bulk
+ async operations in the specified bulk async-group have completed reading
from their source locations.
[For more information, see PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
}];
-
+
string llvmBuilder = [{
auto intId = op.getRead() ?
llvm::Intrinsic::nvvm_cp_async_bulk_wait_group_read :
@@ -1784,53 +1799,53 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
}];
}
-def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
- NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global",
- [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
+def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
+ NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global",
+ [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>,
AttrSizedOperandSegments]>,
Arguments<(ins LLVM_PointerShared:$dstMem,
LLVM_AnyPointer:$tmaDescriptor,
Variadic<I32>:$coordinates,
- LLVM_PointerShared:$mbar,
+ LLVM_PointerShared:$mbar,
Variadic<I16>:$im2colOffsets,
Optional<I16>:$multicastMask,
Optional<I64>:$l2CacheHint,
...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/107659
More information about the llvm-branch-commits
mailing list