[clang] [MLIR] Pass hostShared flag in gpu.alloc op to runtime wrappers (PR #66401)
Nishant Patel via cfe-commits
cfe-commits at lists.llvm.org
Tue Sep 26 10:35:00 PDT 2023
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/66401
>From 4a3211750ede9d375dedc676e2cfb8bc3924aa0b Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel at intel.com>
Date: Wed, 13 Sep 2023 20:58:25 +0000
Subject: [PATCH 1/4] Support lowering of hostShared in gpu.alloc op
---
.../Conversion/GPUCommon/GPUToLLVMConversion.cpp | 15 ++++++++++-----
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 3 ++-
mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp | 3 ++-
.../lower-alloc-to-gpu-runtime-calls.mlir | 3 ++-
.../test/Conversion/GPUCommon/typed-pointers.mlir | 3 ++-
5 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index eddf3e9a47d0bc8..428d5d1d4b0e944 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
"mgpuMemAlloc",
llvmPointerType /* void * */,
{llvmIntPtrType /* intptr_t sizeBytes */,
- llvmPointerType /* void *stream */}};
+ llvmPointerType /* void *stream */,
+ llvmInt64Type /* bool isHostShared */}};
FunctionCallBuilder deallocCallBuilder = {
"mgpuMemFree",
llvmVoidType,
@@ -786,9 +787,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::AllocOp allocOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
- if (adaptor.getHostShared())
- return rewriter.notifyMatchFailure(
- allocOp, "host_shared allocation is not supported");
MemRefType memRefType = allocOp.getType();
@@ -799,6 +797,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
auto loc = allocOp.getLoc();
+ bool isShared = allocOp.getHostShared();
+
// Get shape of the memref as values: static sizes are constant
// values and dynamic sizes are passed to 'alloc' as operands.
SmallVector<Value, 4> shape;
@@ -811,8 +811,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
// descriptor.
Type elementPtrType = this->getElementPtrType(memRefType);
auto stream = adaptor.getAsyncDependencies().front();
+
+ auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
Value allocatedPtr =
- allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+ allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+ .getResult();
if (!getTypeConverter()->useOpaquePointers())
allocatedPtr =
rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1dba677ebe66365..a0172f85a67a5c0 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -210,7 +210,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+ bool /*isHostShared*/) {
ScopedContext scopedContext;
CUdeviceptr ptr;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..292159536f5522f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
}
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+ bool /*isHostShared*/) {
void *ptr;
HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
return ptr;
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990ef5..f365dcb02daf4c2 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
%0 = gpu.wait async
// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
- // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+ // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
// CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c567819..e27162c7dbc1902 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
%0 = gpu.wait async
// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
- // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+ // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
// CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
>From 4752f1b1d569aea737cea6b92a34f5d2ef218122 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel at intel.com>
Date: Fri, 15 Sep 2023 18:10:03 +0000
Subject: [PATCH 2/4] gpu host_shared allocation cannot be async
---
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 428d5d1d4b0e944..9d3d26002ec7f2c 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -791,14 +791,19 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
MemRefType memRefType = allocOp.getType();
if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
- !isConvertibleAndHasIdentityMaps(memRefType) ||
- failed(isAsyncWithOneDependency(rewriter, allocOp)))
+ !isConvertibleAndHasIdentityMaps(memRefType))
return failure();
auto loc = allocOp.getLoc();
bool isShared = allocOp.getHostShared();
+ if (isShared && allocOp.getAsyncToken())
+ return rewriter.notifyMatchFailure(
+ allocOp, "Host Shared allocation cannot be done async");
+ else if (!isShared && failed(isAsyncWithOneDependency(rewriter, allocOp)))
+ return failure();
+
// Get shape of the memref as values: static sizes are constant
// values and dynamic sizes are passed to 'alloc' as operands.
SmallVector<Value, 4> shape;
>From 49043d9f2b7284fb8a6c82f37f3cec83882baa70 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel at intel.com>
Date: Mon, 25 Sep 2023 21:16:58 +0000
Subject: [PATCH 3/4] Address PR feedback
---
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 9d3d26002ec7f2c..bcbd5d67de34dcd 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -168,7 +168,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
llvmPointerType /* void * */,
{llvmIntPtrType /* intptr_t sizeBytes */,
llvmPointerType /* void *stream */,
- llvmInt64Type /* bool isHostShared */}};
+ llvmInt8Type /* bool isHostShared */}};
FunctionCallBuilder deallocCallBuilder = {
"mgpuMemFree",
llvmVoidType,
@@ -818,7 +818,7 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
auto stream = adaptor.getAsyncDependencies().front();
auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
- loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+ loc, llvmInt8Type, rewriter.getI64IntegerAttr(isShared));
Value allocatedPtr =
allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
>From 44f3978dae0b3e73e41566cbab42c2825164f62f Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel at intel.com>
Date: Mon, 25 Sep 2023 21:24:44 +0000
Subject: [PATCH 4/4] Use llvmInt8TypeI8 for bool
---
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index bcbd5d67de34dcd..f72500d7592d049 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -818,7 +818,7 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
auto stream = adaptor.getAsyncDependencies().front();
auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
- loc, llvmInt8Type, rewriter.getI64IntegerAttr(isShared));
+ loc, llvmInt8Type, rewriter.getI8IntegerAttr(isShared));
Value allocatedPtr =
allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
More information about the cfe-commits
mailing list