[Mlir-commits] [mlir] [mlir] gpu async region pass: do not add async if gpu.alloc is (PR #90876)

Thu May 2 09:55:17 PDT 2024

https://github.com/silee2 created https://github.com/llvm/llvm-project/pull/90876

host_shared.

gpu.alloc host_shared is not an async operation. Update gpu async region pass to handle this case correctly.
Also gpu.dealloc working on a host_shared memory (if it can be analyzed) is lowered to non async operation.
Currently only GPU to SPIR-V to SYCL runtime is impacted by this change.
SYCL runtime wrapper and GPU to SYCL integration test are updated accordingly.

>From 620be23ef0b4cb0693e0085e3ff42a5ab9a77b0f Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 2 May 2024 16:48:29 +0000
Subject: [PATCH] [mlir] gpu async region pass: do not add async if gpu.alloc
 is host_shared

---
 .../GPUCommon/GPUToLLVMConversion.cpp         | 21 ++++++++++++++++---
 .../GPU/Transforms/AsyncRegionRewriter.cpp    | 11 ++++++++++
 .../ExecutionEngine/SyclRuntimeWrappers.cpp   |  8 ++++++-
 .../lower-alloc-to-gpu-runtime-calls.mlir     | 10 ++++-----
 mlir/test/Dialect/GPU/async-region.mlir       |  9 ++++++++
 .../GPU/SYCL/gpu-addf32-to-spirv.mlir         | 14 +++++--------
 .../GPU/SYCL/gpu-addi64-to-spirv.mlir         | 14 +++++--------
 .../GPU/SYCL/gpu-reluf32-to-spirv.mlir        | 17 ++++++---------
 8 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3a4fc7d8063f40..66d9ce7cfc5117 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -873,18 +873,33 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DeallocOp deallocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
+  bool isHostShared = false;
+  auto *op = deallocOp.getOperation();
+  for (auto operand : op->getOperands()) {
+    gpu::AllocOp allocOp = operand.getDefiningOp<gpu::AllocOp>();
+    if (!allocOp)
+      continue;
+    if (allocOp.getHostShared())
+      isHostShared = true;
+  }
+
   if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
+      (!isHostShared && failed(isAsyncWithOneDependency(rewriter, deallocOp))))
     return failure();
 
   Location loc = deallocOp.getLoc();
 
   Value pointer =
       MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
-  Value stream = adaptor.getAsyncDependencies().front();
+  Value stream = isHostShared
+                     ? rewriter.create<mlir::LLVM::ZeroOp>(loc, llvmPointerType)
+                     : adaptor.getAsyncDependencies().front();
   deallocCallBuilder.create(loc, rewriter, {pointer, stream});
 
-  rewriter.replaceOp(deallocOp, {stream});
+  if (isHostShared)
+    rewriter.eraseOp(deallocOp);
+  else
+    rewriter.replaceOp(deallocOp, {stream});
   return success();
 }
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
index b2fa3a99c53fc3..700a22822c5ab9 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -73,6 +73,17 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
   LogicalResult visit(Operation *op) {
     if (isa<gpu::LaunchOp>(op))
       return op->emitOpError("replace with gpu.launch_func first");
+    // gpu.alloc() host_shared cannot be done async
+    if (isa<gpu::AllocOp>(op)) {
+      gpu::AllocOp allocOp = dyn_cast<gpu::AllocOp>(op);
+      if (allocOp.getHostShared())
+        return success();
+    }
+    if (isa<gpu::DeallocOp>(op)) {
+      gpu::AllocOp allocOp = op->getOperand(0).getDefiningOp<gpu::AllocOp>();
+      if (allocOp.getHostShared())
+        return success();
+    }
     if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
       if (currentToken)
         waitOp.addAsyncDependency(currentToken);
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
index c250340c38fc77..6653850fa08bb8 100644
--- a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -90,7 +90,13 @@ static void *allocDeviceMemory(sycl::queue *queue, size_t size, bool isShared) {
 }
 
 static void deallocDeviceMemory(sycl::queue *queue, void *ptr) {
-  sycl::free(ptr, *queue);
+  if (queue == nullptr) {
+    queue = new sycl::queue(getDefaultContext(), getDefaultDevice());
+    sycl::free(ptr, *queue);
+    delete queue;
+  } else {
+    sycl::free(ptr, *queue);
+  }
 }
 
 static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index ae8b7aaac7fd94..74661a0b89b1d1 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -29,12 +29,10 @@ module attributes {gpu.container_module} {
     // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant 
     // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[nullptr]], %[[isHostShared]])
     %0 = gpu.alloc host_shared (%size) : memref<?xf32>
-    // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate()
-    %1 = gpu.wait async
-    %2 = gpu.dealloc async [%1] %0 : memref<?xf32>
-    // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
-    // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
-    gpu.wait [%2]
+    // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
+    // CHECK: %[[stream:.*]] = llvm.mlir.zero : !llvm.ptr
+    // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
+    gpu.dealloc %0 : memref<?xf32>
     return
   }
 }
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
index 00832c45de1146..d68d4d27c97922 100644
--- a/mlir/test/Dialect/GPU/async-region.mlir
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -189,4 +189,13 @@ module attributes {gpu.container_module} {
     gpu.wait
     return
   }
+
+  // CHECK-LABEL:func @alloc_host_shared()
+  func.func @alloc_host_shared() {
+    // CHECK: %[[m:.*]] = gpu.alloc  host_shared () : memref<7xf32>
+    %0 = gpu.alloc host_shared() : memref<7xf32>
+    // CHECK: gpu.dealloc  %[[m]] : memref<7xf32>
+    gpu.dealloc %0 : memref<7xf32>
+    return
+  }
 }
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
index c0e2903aee2d12..6d01ffba29c347 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -25,16 +25,12 @@ module @add attributes {gpu.container_module} {
     %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
     memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
     %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
-    %2 = gpu.wait async
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
-    gpu.wait [%3]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
     %alloc = memref.alloc() : memref<2x2x2xf32>
     memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
-    %4 = gpu.wait async
-    %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
-    %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
-    %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
-    gpu.wait [%7]
+    gpu.dealloc %memref_2 : memref<2x2x2xf32>
+    gpu.dealloc %memref_0 : memref<2x2x2xf32>
+    gpu.dealloc %mem : memref<2x2x2xf32>
     return %alloc : memref<2x2x2xf32>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
index 4ac1533b75d203..b9c740971ccd2b 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -25,16 +25,12 @@ module @add attributes {gpu.container_module} {
     %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
     memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
     %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
-    %2 = gpu.wait async
-    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
-    gpu.wait [%3]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
     %alloc = memref.alloc() : memref<3x3xi64>
     memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
-    %4 = gpu.wait async
-    %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
-    %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
-    %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
-    gpu.wait [%7]
+    gpu.dealloc %memref_2 : memref<3x3xi64>
+    gpu.dealloc %memref_0 : memref<3x3xi64>
+    gpu.dealloc %mem : memref<3x3xi64>
     return %alloc : memref<3x3xi64>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
index 162a793305e972..770980f810c686 100644
--- a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
+++ b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},func.func(gpu-async-region),convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_sycl_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -40,19 +40,14 @@ module @relu attributes {gpu.container_module} {
     %memref = gpu.alloc host_shared () : memref<4x5xf32>
     memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
     %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
-    %2 = gpu.wait async
-    %3 = gpu.launch_func async [%2]  @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
-    gpu.wait [%3]
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
     %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
-    %4 = gpu.wait async
-    %5 = gpu.launch_func async [%4]  @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
-    gpu.wait [%5]
+    gpu.launch_func @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
     %alloc = memref.alloc() : memref<4x5xf32>
     memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
-    %6 = gpu.wait async
-    %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
-    %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
-    %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+    gpu.dealloc %memref_1 : memref<4x5xf32>
+    gpu.dealloc %memref_0 : memref<4x5xi1>
+    gpu.dealloc %memref : memref<4x5xf32>
     return %alloc : memref<4x5xf32>
   }
   gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {