[Mlir-commits] [mlir] [mlir][nvgpu] NVGPU Tutorials (PR #87065)
llvmlistbot at llvm.org
Fri Mar 29 06:15:51 PDT 2024
github-actions[bot] wrote:
:warning: The Python code formatter darker found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
darker --check --diff -r d06ba376833553f38b5cbc606e479ed7936e5f5b...5a2e21c9c071a8ba2b7a4cbbf68dc65889256efd mlir/test/Examples/nvgpu/Ch0.py mlir/test/Examples/nvgpu/Ch1.py mlir/test/Examples/nvgpu/Ch2.py mlir/test/Examples/nvgpu/Ch3.py mlir/test/Examples/nvgpu/Ch4.py mlir/test/Examples/nvgpu/tools/nvdsl.py mlir/test/Examples/nvgpu/tools/nvgpucompiler.py
``````````
</details>
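If you want darker to apply these fixes in place rather than just report them, dropping the `--check --diff` flags rewrites the files directly. A minimal sketch, assuming `darker` is available on your PATH (e.g. installed with `pip install darker`):

``````````bash
# Apply the suggested formatting in place (same revision range and file list
# as the check command above). Without --check/--diff, darker modifies files.
# Assumes darker is installed, e.g.: pip install darker
darker -r d06ba376833553f38b5cbc606e479ed7936e5f5b...5a2e21c9c071a8ba2b7a4cbbf68dc65889256efd mlir/test/Examples/nvgpu/Ch0.py mlir/test/Examples/nvgpu/Ch1.py mlir/test/Examples/nvgpu/Ch2.py mlir/test/Examples/nvgpu/Ch3.py mlir/test/Examples/nvgpu/Ch4.py mlir/test/Examples/nvgpu/tools/nvdsl.py mlir/test/Examples/nvgpu/tools/nvgpucompiler.py
``````````

Re-running the original `--check --diff` command afterwards should then exit cleanly with no diff.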
<details>
<summary>
View the diff from darker here.
</summary>
``````````diff
--- Ch2.py 2024-03-29 13:09:45.000000 +0000
+++ Ch2.py 2024-03-29 13:15:27.117517 +0000
@@ -3,11 +3,11 @@
# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
# ===----------------------------------------------------------------------===//
#
-# This program demonstrates 2D Saxpy. It is same as Chapter 1,
+# This program demonstrates 2D Saxpy. It is same as Chapter 1,
# but it loads data using TMA (Tensor Memory Accelerator)
#
# This chapter introduces demonstrates:
# 1. Create and initialize asynchronous transactional barrier (mbarrier)
# 2. Execute Tensor Memory Accelerator (TMA) Load
--- Ch4.py 2024-03-29 13:09:45.000000 +0000
+++ Ch4.py 2024-03-29 13:15:27.222321 +0000
@@ -245,10 +245,11 @@
x_tma.create_descriptor(x_dev)
y_tma.create_descriptor(y_dev)
grid = [(M // TILE_M), (N // TILE_N), 1]
block = [128, 1, 1]
+
@NVDSL.mlir_gpu_launch(grid=grid, block=block, smem=229440)
def gemm_multistage_kernel():
# Initialize mbarriers and prefetch TMA descriptors
mbar_group = bootstrap(x_tma, y_tma)
--- tools/nvdsl.py 2024-03-29 13:09:45.000000 +0000
+++ tools/nvdsl.py 2024-03-29 13:15:27.359993 +0000
@@ -12,11 +12,13 @@
MLIR_DYNAMIC = -9223372036854775808
def const(value: int, ty=None):
ty = T.index() if ty is None else ty
- if isinstance(value, ir.Value) and (value.type.isinstance(value.type) or T.bool().isinstance(value.type)):
+ if isinstance(value, ir.Value) and (
+ value.type.isinstance(value.type) or T.bool().isinstance(value.type)
+ ):
return value
return arith.constant(ty, value)
def get_type_size(ty):
@@ -28,10 +30,11 @@
if ir.FloatType.isinstance(ty):
return ir.FloatType(ty).width // 8
if ir.IntegerType.isinstance(ty):
return ir.IntegerType(ty).width // 8
raise NotImplementedError(ty)
+
def get_mlir_func_obj_ty(inputArgs):
args = []
c_int_p = ctypes.c_int * 1
c_float_p = ctypes.c_float * 1
@@ -131,23 +134,24 @@
return ir.Type.parse(parse_str)
def create_descriptor(self, device_ptr):
tma_descriptor_ty = self.tensormap_descriptor_ty
device_unranked_memref = memref.CastOp(
- ir.UnrankedMemRefType.get(self.memref_ty.element_type,
- self.memref_ty.memory_space),
+ ir.UnrankedMemRefType.get(
+ self.memref_ty.element_type, self.memref_ty.memory_space
+ ),
device_ptr,
)
self.tma_descriptor = nvgpu.TmaCreateDescriptorOp(
- tma_descriptor_ty, device_unranked_memref,
- map(const, self.tma_shape))
+ tma_descriptor_ty, device_unranked_memref, map(const, self.tma_shape)
+ )
return self.tma_descriptor.result
def prefetch(self, predicate=None):
nvgpu.tma_prefetch_descriptor(self.tma_descriptor, predicate=predicate)
- def load(self, dest, mbarrier: Mbarriers, coords=[0,0], predicate=None):
+ def load(self, dest, mbarrier: Mbarriers, coords=[0, 0], predicate=None):
coord_ops = [const(c) for c in coords]
nvgpu.TmaAsyncLoadOp(
dest,
mbarrier.mbar_group_op,
self.tma_descriptor,
@@ -178,35 +182,39 @@
def op(self):
return nvgpu.warpgroup_mma_init_accumulator(self.acc_ty)
class Matrix:
-
def __init__(self, smem, tma_descriptor: TMA, M, N):
self.tma_descriptor = tma_descriptor
self.smem = smem
self.M = M
self.N = N
@property
def wgmma_ty(self):
- return ir.Type.parse("!nvgpu.warpgroup.descriptor<tensor=memref<" +
- str(self.M) + "x" +
- str(self.N) + "x" +
- str(self.tma_descriptor.memref_ty.element_type) +
- ", #gpu.address_space<workgroup>>>")
+ return ir.Type.parse(
+ "!nvgpu.warpgroup.descriptor<tensor=memref<"
+ + str(self.M)
+ + "x"
+ + str(self.N)
+ + "x"
+ + str(self.tma_descriptor.memref_ty.element_type)
+ + ", #gpu.address_space<workgroup>>>"
+ )
def matmul(lhs, rhs, acc):
wgmma_desc_lhs = nvgpu.warpgroup_generate_descriptor(
- lhs.wgmma_ty, lhs.smem, lhs.tma_descriptor.tma_descriptor)
+ lhs.wgmma_ty, lhs.smem, lhs.tma_descriptor.tma_descriptor
+ )
wgmma_desc_rhs = nvgpu.warpgroup_generate_descriptor(
- rhs.wgmma_ty, rhs.smem, rhs.tma_descriptor.tma_descriptor)
- return nvgpu.WarpgroupMmaOp(acc.type,
- wgmma_desc_lhs,
- wgmma_desc_rhs,
- acc,
- transposeB=True)
+ rhs.wgmma_ty, rhs.smem, rhs.tma_descriptor.tma_descriptor
+ )
+ return nvgpu.WarpgroupMmaOp(
+ acc.type, wgmma_desc_lhs, wgmma_desc_rhs, acc, transposeB=True
+ )
+
def get_dynamic_shared_memory(shape=None, ty=None, offset: int = 0):
smem_space_str = "#gpu.address_space<workgroup>"
smem_space = ir.Attribute.parse(smem_space_str)
dynamic_smem = gpu.dynamic_shared_memory(
@@ -222,10 +230,11 @@
dynamic_smem,
const(offset),
[],
)
+
@staticmethod
def get_mlir_ty(arg):
def get_mlir_ty_from_np(dtype):
if dtype == np.float16:
return T.f16()
@@ -236,10 +245,11 @@
if dtype == np.int32:
return T.i32()
if dtype == np.int64:
return T.i64()
raise NotImplementedError(dtype)
+
if isinstance(arg, bool):
return T.bool()
elif isinstance(arg, int):
return T.index()
elif isinstance(arg, float):
@@ -248,10 +258,11 @@
descriptor = rt.get_ranked_memref_descriptor(arg)
dtype = get_mlir_ty_from_np(arg.dtype)
shape = descriptor.shape
return memref.MemRefType.get(shape, dtype)
raise NotImplementedError(arg)
+
class NVDSL:
@staticmethod
def mlir_gpu_launch(grid=(1, 1, 1), block=(1, 1, 1), smem=0):
def decorator(func):
``````````
</details>
https://github.com/llvm/llvm-project/pull/87065