[Mlir-commits] [mlir] dd099e9 - [MLIR][NVVM] Fix links in OP definition (#125865)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Feb 5 07:18:09 PST 2025
Author: Guray Ozen
Date: 2025-02-05T16:18:04+01:00
New Revision: dd099e9cc24ad60e988ccec1eaee3299a0c7c277
URL: https://github.com/llvm/llvm-project/commit/dd099e9cc24ad60e988ccec1eaee3299a0c7c277
DIFF: https://github.com/llvm/llvm-project/commit/dd099e9cc24ad60e988ccec1eaee3299a0c7c277.diff
LOG: [MLIR][NVVM] Fix links in OP definition (#125865)
Added:
Modified:
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index c501b5e7c10015..3d378751e798f9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -477,8 +477,7 @@ def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
The default barrier id is 0 that is similar to `nvvm.barrier` Op. When
`barrierId` is not present, the default barrier id is used.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
}];
let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
@@ -504,8 +503,7 @@ def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
}];
string llvmBuilder = [{
@@ -531,8 +529,7 @@ def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
ordering and visibility guarantees provided for the memory accesses performed prior to
`cluster.arrive`.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
}];
string llvmBuilder = [{
@@ -553,8 +550,7 @@ def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
of the cluster to perform `cluster.arrive`. The `aligned` attribute, when provided,
generates the .aligned version of the PTX instruction.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
}];
string llvmBuilder = [{
@@ -606,8 +602,8 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
let description = [{
Fence operation with proxy to establish an ordering between memory accesses
that may happen through
different proxies.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
let assemblyFormat = "attr-dict";
@@ -657,8 +653,8 @@ def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
value for the `size` operand is 128 and must be an immediate. Generic Addressing
is used unconditionally, and the address specified by the operand `addr` must
fall within the `.global` state space. Otherwise, the behavior is undefined
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
@@ -685,8 +681,8 @@ def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
operation can form a release sequence that synchronizes with an acquire
sequence that contains the fence.proxy.acquire proxy fence operation
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
@@ -724,8 +720,8 @@ def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
let arguments = (ins );
let description = [{
Fence operation that applies on the prior nvvm.mbarrier.init
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
}];
let assemblyFormat = "attr-dict";
@@ -768,8 +764,8 @@ def NVVM_ShflOp :
the source. The `mask_and_clamp` contains two packed values specifying
a mask for logically splitting warps into sub-segments and an upper bound
for clamping the source lane index.
- [For more information, refer PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync)
}];
string llvmBuilder = [{
auto intId = getShflIntrinsicId(
@@ -814,8 +810,7 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync">
of this Op. The predicate result is set to `True` for the
leader thread, and `False` for all other threads.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync)
}];
let results = (outs I1:$pred);
@@ -899,8 +894,8 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
The `addr` operand specifies the address of the mbarrier object
in generic address space. The `noinc` attr impacts how the
mbarrier's state is updated.
- [For more information, refer PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
}];
let assemblyFormat = "$addr attr-dict `:` type(operands)";
@@ -923,8 +918,9 @@ def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.share
track all prior cp.async operations initiated by the executing thread.
The `addr` operand specifies the address of the mbarrier object in
shared memory. The `noinc` attr impacts how the mbarrier's state
- is updated. [For more information, refer PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
+ is updated.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
}];
let assemblyFormat = "$addr attr-dict `:` type(operands)";
@@ -982,8 +978,8 @@ def NVVM_CvtFloatToTF32Op : NVVM_Op<"cvt.float.to.tf32"> {
The `relu` attribute, when set, lowers to the '.relu' variant of
the cvt instruction. The `rnd` and `sat` attributes specify the
the rounding and saturation modes respectively.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt)
}];
let hasVerifier = 1;
@@ -1633,8 +1629,8 @@ def NVVM_StMatrixOp: NVVM_PTXBuilder_Op<"stmatrix">,
let description = [{
Collectively store one or more matrices across all threads in a warp to the
location indicated by the address operand $ptr in shared memory.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix)
}];
let assemblyFormat = "$ptr `,` $sources attr-dict `:` type(operands)";
@@ -1846,8 +1842,7 @@ def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">,
This Op commits all prior initiated but uncommitted cp.async.bulk
instructions into a cp.async.bulk-group.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group)
}];
string llvmBuilder = [{
@@ -1871,8 +1866,7 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">,
async operations in the specified bulk async-group have completed reading
from their source locations.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group)
}];
string llvmBuilder = [{
@@ -1917,8 +1911,7 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
The `l2CacheHint` operand is optional, and it is used to specify cache
eviction policy that may be used during the memory access.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
}];
let assemblyFormat = [{
@@ -2034,8 +2027,7 @@ def NVVM_CpAsyncBulkTensorPrefetchOp :
The `l2CacheHint` operand is optional, and it is used to specify cache
eviction policy that may be used during the memory access.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor)
}];
let assemblyFormat = [{
@@ -2134,8 +2126,7 @@ def NVVM_CpAsyncBulkTensorReduceOp :
The `l2CacheHint` operand is optional, and it is used to specify cache
eviction policy that may be used during the memory access.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor)
}];
let assemblyFormat = [{
@@ -2194,8 +2185,8 @@ def NVVM_CpAsyncBulkGlobalToSharedClusterOp :
The `l2CacheHint` operand is optional, and it is used to specify cache
eviction policy that may be used during the memory access.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
}];
let arguments = (ins
@@ -2252,8 +2243,7 @@ def NVVM_CpAsyncBulkSharedCTAToSharedClusterOp :
Initiates an asynchronous copy operation from Shared CTA memory to Shared
cluster memory.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
}];
let arguments = (ins
@@ -2283,8 +2273,8 @@ def NVVM_CpAsyncBulkSharedCTAToGlobalOp :
The `l2CacheHint` operand is optional, and it is used to specify cache
eviction policy that may be used during the memory access.
- [For more information, see PTX ISA]
- (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk)
}];
let arguments = (ins
@@ -2524,6 +2514,8 @@ def NVVM_GriddepcontrolWaitOp : NVVM_IntrOp<"griddepcontrol.wait", [], 0> {
Causes the executing thread to wait until all prerequisite grids in flight
have completed and all the memory operations from the prerequisite grids
are performed and made visible to the current grid.
+
+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol)
}];
}
@@ -2536,6 +2528,8 @@ def NVVM_GriddepcontrolLaunchDependentsOp
Signals that specific dependents the runtime system designated to react to
this instruction can be scheduled as soon as all other CTAs in the grid
issue the same instruction or have completed.
+
+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol)
}];
}
More information about the Mlir-commits
mailing list