[Mlir-commits] [mlir] [MLIR][NVVM] Add Ops for tcgen05 cp and shift (PR #127798)
Guray Ozen
llvmlistbot at llvm.org
Thu Feb 20 04:27:05 PST 2025
================
@@ -2810,6 +2810,113 @@ def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
}];
}
+def NVVM_Tcgen05ShiftOp : NVVM_Op<"tcgen05.shift"> {
+ let summary = "Tcgen05 shift operation";
+ let description = [{
+ The `tcgen05.shift` op is an asynchronous instruction that initiates
+ the shifting of 32-byte elements downward across all rows,
+ except the last, by one row. The operand `taddr` specifies the base
+ address of the matrix in Tensor Memory whose rows are to be shifted down.
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-shift)
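+
+ Usage (a sketch; the `!llvm.ptr<6>` operand type shown here is an
+ assumption based on `LLVM_PointerTensor` addressing Tensor Memory):
+ nvvm.tcgen05.shift %taddr {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<6>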
+ }];
+
+ let arguments = (ins LLVM_PointerTensor:$taddr,
+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+ let assemblyFormat = "$taddr attr-dict `:` type(operands)";
+
+ string llvmBuilder = [{
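+ // Select the cg1 or cg2 variant of the tcgen05 shift-down intrinsic
+ // based on the `group` attribute.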
+ auto id = ($group == NVVM::Tcgen05GroupKind::CTA_1) ?
+ llvm::Intrinsic::nvvm_tcgen05_shift_down_cg1 :
+ llvm::Intrinsic::nvvm_tcgen05_shift_down_cg2;
+ createIntrinsicCall(builder, id, {$taddr});
+ }];
+}
+
+def Shape128x256b : I32EnumAttrCase<"SHAPE_128x256b", 0, "shape_128x256b">;
+def Shape4x256b : I32EnumAttrCase<"SHAPE_4x256b", 1, "shape_4x256b">;
+def Shape128x128b : I32EnumAttrCase<"SHAPE_128x128b", 2, "shape_128x128b">;
+def Shape64x128b : I32EnumAttrCase<"SHAPE_64x128b", 3, "shape_64x128b">;
+def Shape32x128b : I32EnumAttrCase<"SHAPE_32x128b", 4, "shape_32x128b">;
+
+def Tcgen05CpShape : I32EnumAttr<"Tcgen05CpShape", "tcgen05 cp shapes",
+ [Shape128x256b, Shape4x256b, Shape128x128b, Shape64x128b, Shape32x128b]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
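+ // Only the enum class is generated here; the dialect attribute wrapper
+ // is defined separately below as Tcgen05CpShapeAttr.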
+}
+def Tcgen05CpShapeAttr : EnumAttr<NVVM_Dialect, Tcgen05CpShape, "tcgen05_cp_shape"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05CpMulticastNone : I32EnumAttrCase<"NONE", 0, "none">;
+def Tcgen05CpMulticastWarpx2_02_13 : I32EnumAttrCase<"WARPX2_02_13", 1, "warpx2_02_13">;
+def Tcgen05CpMulticastWarpx2_01_23 : I32EnumAttrCase<"WARPX2_01_23", 2, "warpx2_01_23">;
+def Tcgen05CpMulticastWarpx4 : I32EnumAttrCase<"WARPX4", 3, "warpx4">;
+
+def Tcgen05CpMulticast : I32EnumAttr<"Tcgen05CpMulticast", "tcgen05 cp multicast",
+ [Tcgen05CpMulticastNone, Tcgen05CpMulticastWarpx2_02_13,
+ Tcgen05CpMulticastWarpx2_01_23, Tcgen05CpMulticastWarpx4]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+def Tcgen05CpMulticastAttr : EnumAttr<NVVM_Dialect, Tcgen05CpMulticast, "tcgen05_cp_multicast"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def FormatB6x16_P32 : I32EnumAttrCase<"B6x16_P32", 0, "b6x16_p32">;
+def FormatB4x16_P64 : I32EnumAttrCase<"B4x16_P64", 1, "b4x16_p64">;
+
+def Tcgen05CpSrcFormat : I32EnumAttr<"Tcgen05CpSrcFormat", "tcgen05 cp source format",
+ [FormatB6x16_P32, FormatB4x16_P64]> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+def Tcgen05CpSrcFormatAttr : EnumAttr<NVVM_Dialect, Tcgen05CpSrcFormat, "tcgen05_cp_src_fmt"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05CpOp : NVVM_Op<"tcgen05.cp"> {
+ let summary = "Tcgen05 copy operation";
+ let description = [{
+ The `tcgen05.cp` instruction initiates an asynchronous copy operation
+ from shared memory to the location in Tensor Memory specified by the
+ address operand `taddr`. The 64-bit register operand `smem_desc`
+ specifies the matrix descriptor representing the source matrix in
+ shared memory to be copied.
+
+ Usage:
+ nvvm.tcgen05.cp %taddr, %smem_desc {
+ group = #nvvm.tcgen05_group<cta_2>,
+ shape = #nvvm.tcgen05_cp_shape<shape_64x128b>,
+ multicast = #nvvm.tcgen05_cp_multicast<warpx2_01_23>,
+ srcFormat = #nvvm.tcgen05_cp_src_fmt<b6x16_p32>
+ }
+ [For more information refer to the PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-cp)
----------------
grypp wrote:
```suggestion
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-cp)
```
https://github.com/llvm/llvm-project/pull/127798