[Mlir-commits] [mlir] Skip address space checks for memrefs between launchOp and kernel func (PR #102925)
Petr Kurapov
llvmlistbot at llvm.org
Mon Aug 12 09:03:10 PDT 2024
https://github.com/kurapov-peter created https://github.com/llvm/llvm-project/pull/102925
Hi! I've been trying to use the new signature lowering (https://github.com/llvm/llvm-project/pull/101664 and https://github.com/llvm/llvm-project/pull/102621) with gpu kernel outlining, gpu binary generation, and an OpenCL runtime. The `gpu-to-llvm-spv` pass can handle memrefs with address-space attributes; however, when lowering an arbitrary function (say, one that accepts a tensor and then goes through bufferization) and then passing its argument on to a kernel, the address space on the host-side type is missing.
Consider the following example function that accepts some device-host shared memory:
```mlir
func.func @foo(%mem : memref<5xf32>) {
  gpu.launch_func @gpu_kernels::@kernel args(%mem : memref<5xf32>)
  return
}
gpu.module @gpu_kernels {
  gpu.func @kernel(%arg0 : memref<5xf32, #gpu.address_space<global>>) kernel {
    gpu.return
  }
}
}
```
The correct address space for the kernel argument is `1`, i.e. global (an OpenCL requirement), but that attribute makes no sense on the host side, where `0` (the default) is the right one (say, we rely on some runtime mechanism to deliver the data to the device). The two types don't match, so verification fails the type check even though the code is valid.
The easiest workaround, which I discussed with @victor-eds, is to allow this discrepancy on the verifier side. The check could be made even more specific by inspecting the target to ensure this really is the intended case.
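To make the relaxation concrete, here is an illustrative sketch (not part of the patch, and simplified like the example above) of what the verifier accepts and still rejects under this change:

```mlir
// Accepted: a launch operand with no address space attribute matches a kernel
// argument in the global address space, provided shape, layout, and element
// type still agree.
gpu.launch_func @gpu_kernels::@kernel args(%mem : memref<5xf32>)
// ... against: gpu.func @kernel(%arg0 : memref<5xf32, #gpu.address_space<global>>)

// Still rejected: every other combination, e.g. a launch operand that already
// carries a different address space such as #gpu.address_space<workgroup>, a
// kernel argument in a non-global space, or a shape/element-type mismatch
// like memref<4xf32> vs memref<5xf32>.
```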
From 5d86ec11b9d9a8012ca5671fc8d59e61738edcd0 Mon Sep 17 00:00:00 2001
From: Petr Kurapov <petr.a.kurapov at intel.com>
Date: Mon, 12 Aug 2024 11:43:07 +0000
Subject: [PATCH] Skip address space checks for memrefs between launchOp and
kernel func
---
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 23 ++++++++++++++++++++++-
mlir/test/Dialect/GPU/ops.mlir | 14 ++++++++++++++
2 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index a1f87a637a6141..8c3391c8d92936 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -401,8 +401,29 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                << expectedNumArguments;
 
     auto functionType = kernelGPUFunction.getFunctionType();
+    auto typesMatch = [&](Type launchOpArgType, Type gpuFuncArgType) {
+      auto launchOpMemref = dyn_cast<MemRefType>(launchOpArgType);
+      auto kernelMemref = dyn_cast<MemRefType>(gpuFuncArgType);
+      // Allow address space incompatibility for OpenCL kernels: `gpu.launch`'s
+      // argument memref without address space attribute will match a kernel
+      // function's memref argument with address space `Global`.
+      if (launchOpMemref && kernelMemref) {
+        auto launchAS = llvm::dyn_cast_or_null<gpu::AddressSpaceAttr>(
+            launchOpMemref.getMemorySpace());
+        auto kernelAS = llvm::dyn_cast_or_null<gpu::AddressSpaceAttr>(
+            kernelMemref.getMemorySpace());
+        if (!launchAS && kernelAS &&
+            kernelAS.getValue() == gpu::AddressSpace::Global)
+          return launchOpMemref.getShape() == kernelMemref.getShape() &&
+                 launchOpMemref.getLayout() == kernelMemref.getLayout() &&
+                 launchOpMemref.getElementType() ==
+                     kernelMemref.getElementType();
+      }
+      return launchOpArgType == gpuFuncArgType;
+    };
     for (unsigned i = 0; i < expectedNumArguments; ++i) {
-      if (launchOp.getKernelOperand(i).getType() != functionType.getInput(i)) {
+      if (!typesMatch(launchOp.getKernelOperand(i).getType(),
+                      functionType.getInput(i))) {
         return launchOp.emitOpError("type of function argument ")
                << i << " does not match";
       }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index ba7897f4e80cb5..fdfd9fcc8b1853 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -441,3 +441,17 @@ gpu.module @module_with_two_target [#nvvm.target, #rocdl.target<chip = "gfx90a">
 gpu.module @module_with_offload_handler <#gpu.select_object<0>> [#nvvm.target] {
 }
+
+// Check kernel memref args are valid even if the address space differs
+module attributes {gpu.container_module} {
+  func.func @foo(%mem : memref<5xf32>) {
+    %c0 = arith.constant 0 : i32
+    gpu.launch_func @gpu_kernels::@kernel blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%mem : memref<5xf32>)
+    return
+  }
+  gpu.module @gpu_kernels {
+    gpu.func @kernel(%arg0 : memref<5xf32, #gpu.address_space<global>>) kernel {
+      gpu.return
+    }
+  }
+}