[Mlir-commits] [mlir] [mlir][nvgpu] NVGPU Tutorials (PR #87065)
llvmlistbot at llvm.org
Fri Mar 29 06:15:51 PDT 2024
github-actions[bot] wrote:
:warning: The Python code formatter darker found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
darker --check --diff -r d06ba376833553f38b5cbc606e479ed7936e5f5b...5a2e21c9c071a8ba2b7a4cbbf68dc65889256efd mlir/test/Examples/nvgpu/Ch0.py mlir/test/Examples/nvgpu/Ch1.py mlir/test/Examples/nvgpu/Ch2.py mlir/test/Examples/nvgpu/Ch3.py mlir/test/Examples/nvgpu/Ch4.py mlir/test/Examples/nvgpu/tools/nvdsl.py mlir/test/Examples/nvgpu/tools/nvgpucompiler.py
``````````
</details>
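If you want darker to apply these fixes in place rather than just report them, dropping the `--check --diff` flags rewrites the files directly. A minimal sketch, assuming `darker` is available on your PATH (e.g. installed with `pip install darker`):

``````````bash
# Apply the suggested formatting in place (same revision range and file list
# as the check command above). Without --check/--diff, darker modifies files.
# Assumes darker is installed, e.g.: pip install darker
darker -r d06ba376833553f38b5cbc606e479ed7936e5f5b...5a2e21c9c071a8ba2b7a4cbbf68dc65889256efd mlir/test/Examples/nvgpu/Ch0.py mlir/test/Examples/nvgpu/Ch1.py mlir/test/Examples/nvgpu/Ch2.py mlir/test/Examples/nvgpu/Ch3.py mlir/test/Examples/nvgpu/Ch4.py mlir/test/Examples/nvgpu/tools/nvdsl.py mlir/test/Examples/nvgpu/tools/nvgpucompiler.py
``````````

Re-running the original `--check --diff` command afterwards should then exit cleanly with no diff.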
<details>
<summary>
View the diff from darker here.
</summary>
``````````diff
--- Ch2.py 2024-03-29 13:09:45.000000 +0000
+++ Ch2.py 2024-03-29 13:15:27.117517 +0000
@@ -3,11 +3,11 @@
# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
# ===----------------------------------------------------------------------===//
#
-# This program demonstrates 2D Saxpy. It is same as Chapter 1,
+# This program demonstrates 2D Saxpy. It is same as Chapter 1,
# but it loads data using TMA (Tensor Memory Accelerator)
#
# This chapter introduces demonstrates:
# 1. Create and initialize asynchronous transactional barrier (mbarrier)
# 2. Execute Tensor Memory Accelerator (TMA) Load
--- Ch4.py 2024-03-29 13:09:45.000000 +0000
+++ Ch4.py 2024-03-29 13:15:27.222321 +0000
@@ -245,10 +245,11 @@
x_tma.create_descriptor(x_dev)
y_tma.create_descriptor(y_dev)
grid = [(M // TILE_M), (N // TILE_N), 1]
block = [128, 1, 1]
+
@NVDSL.mlir_gpu_launch(grid=grid, block=block, smem=229440)
def gemm_multistage_kernel():
# Initialize mbarriers and prefetch TMA descriptors
mbar_group = bootstrap(x_tma, y_tma)
--- tools/nvdsl.py 2024-03-29 13:09:45.000000 +0000
+++ tools/nvdsl.py 2024-03-29 13:15:27.359993 +0000
@@ -12,11 +12,13 @@
MLIR_DYNAMIC = -9223372036854775808
def const(value: int, ty=None):
ty = T.index() if ty is None else ty
- if isinstance(value, ir.Value) and (value.type.isinstance(value.type) or T.bool().isinstance(value.type)):
+ if isinstance(value, ir.Value) and (
+ value.type.isinstance(value.type) or T.bool().isinstance(value.type)
+ ):
return value
return arith.constant(ty, value)
def get_type_size(ty):
@@ -28,10 +30,11 @@
if ir.FloatType.isinstance(ty):
return ir.FloatType(ty).width // 8
if ir.IntegerType.isinstance(ty):
return ir.IntegerType(ty).width // 8
raise NotImplementedError(ty)
+
def get_mlir_func_obj_ty(inputArgs):
args = []
c_int_p = ctypes.c_int * 1
c_float_p = ctypes.c_float * 1
@@ -131,23 +134,24 @@
return ir.Type.parse(parse_str)
def create_descriptor(self, device_ptr):
tma_descriptor_ty = self.tensormap_descriptor_ty
device_unranked_memref = memref.CastOp(
- ir.UnrankedMemRefType.get(self.memref_ty.element_type,
- self.memref_ty.memory_space),
+ ir.UnrankedMemRefType.get(
+ self.memref_ty.element_type, self.memref_ty.memory_space
+ ),
device_ptr,
)
self.tma_descriptor = nvgpu.TmaCreateDescriptorOp(
- tma_descriptor_ty, device_unranked_memref,
- map(const, self.tma_shape))
+ tma_descriptor_ty, device_unranked_memref, map(const, self.tma_shape)
+ )
return self.tma_descriptor.result
def prefetch(self, predicate=None):
nvgpu.tma_prefetch_descriptor(self.tma_descriptor, predicate=predicate)
- def load(self, dest, mbarrier: Mbarriers, coords=[0,0], predicate=None):
+ def load(self, dest, mbarrier: Mbarriers, coords=[0, 0], predicate=None):
coord_ops = [const(c) for c in coords]
nvgpu.TmaAsyncLoadOp(
dest,
mbarrier.mbar_group_op,
self.tma_descriptor,
@@ -178,35 +182,39 @@
def op(self):
return nvgpu.warpgroup_mma_init_accumulator(self.acc_ty)
class Matrix:
-
def __init__(self, smem, tma_descriptor: TMA, M, N):
self.tma_descriptor = tma_descriptor
self.smem = smem
self.M = M
self.N = N
@property
def wgmma_ty(self):
- return ir.Type.parse("!nvgpu.warpgroup.descriptor<tensor=memref<" +
- str(self.M) + "x" +
- str(self.N) + "x" +
- str(self.tma_descriptor.memref_ty.element_type) +
- ", #gpu.address_space<workgroup>>>")
+ return ir.Type.parse(
+ "!nvgpu.warpgroup.descriptor<tensor=memref<"
+ + str(self.M)
+ + "x"
+ + str(self.N)
+ + "x"
+ + str(self.tma_descriptor.memref_ty.element_type)
+ + ", #gpu.address_space<workgroup>>>"
+ )
def matmul(lhs, rhs, acc):
wgmma_desc_lhs = nvgpu.warpgroup_generate_descriptor(
- lhs.wgmma_ty, lhs.smem, lhs.tma_descriptor.tma_descriptor)
+ lhs.wgmma_ty, lhs.smem, lhs.tma_descriptor.tma_descriptor
+ )
wgmma_desc_rhs = nvgpu.warpgroup_generate_descriptor(
- rhs.wgmma_ty, rhs.smem, rhs.tma_descriptor.tma_descriptor)
- return nvgpu.WarpgroupMmaOp(acc.type,
- wgmma_desc_lhs,
- wgmma_desc_rhs,
- acc,
- transposeB=True)
+ rhs.wgmma_ty, rhs.smem, rhs.tma_descriptor.tma_descriptor
+ )
+ return nvgpu.WarpgroupMmaOp(
+ acc.type, wgmma_desc_lhs, wgmma_desc_rhs, acc, transposeB=True
+ )
+
def get_dynamic_shared_memory(shape=None, ty=None, offset: int = 0):
smem_space_str = "#gpu.address_space<workgroup>"
smem_space = ir.Attribute.parse(smem_space_str)
dynamic_smem = gpu.dynamic_shared_memory(
@@ -222,10 +230,11 @@
dynamic_smem,
const(offset),
[],
)
+
@staticmethod
def get_mlir_ty(arg):
def get_mlir_ty_from_np(dtype):
if dtype == np.float16:
return T.f16()
@@ -236,10 +245,11 @@
if dtype == np.int32:
return T.i32()
if dtype == np.int64:
return T.i64()
raise NotImplementedError(dtype)
+
if isinstance(arg, bool):
return T.bool()
elif isinstance(arg, int):
return T.index()
elif isinstance(arg, float):
@@ -248,10 +258,11 @@
descriptor = rt.get_ranked_memref_descriptor(arg)
dtype = get_mlir_ty_from_np(arg.dtype)
shape = descriptor.shape
return memref.MemRefType.get(shape, dtype)
raise NotImplementedError(arg)
+
class NVDSL:
@staticmethod
def mlir_gpu_launch(grid=(1, 1, 1), block=(1, 1, 1), smem=0):
def decorator(func):
``````````
</details>
https://github.com/llvm/llvm-project/pull/87065