[Mlir-commits] [mlir] affcfcc - [mlir][nvgpu] Add initial support for `mbarrier`
Guray Ozen
llvmlistbot at llvm.org
Tue Jul 11 08:35:32 PDT 2023
Author: Guray Ozen
Date: 2023-07-11T17:35:27+02:00
New Revision: affcfccd3c1c51388c74b1083d0967332be3909d
URL: https://github.com/llvm/llvm-project/commit/affcfccd3c1c51388c74b1083d0967332be3909d
DIFF: https://github.com/llvm/llvm-project/commit/affcfccd3c1c51388c74b1083d0967332be3909d.diff
LOG: [mlir][nvgpu] Add initial support for `mbarrier`
`mbarrier` is a barrier created in shared memory that supports different flavors of synchronizing threads other than `__syncthreads`, for more information see below.
https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier
This work adds initial Ops wrt `mbarrier` to nvgpu dialect.
First, it introduces two types:
`mbarrier.barrier` that is barrier object in shared memory
`mbarrier.barrier.token` that is token
It introduces following Ops:
`mbarrier.create` creates `mbarrier.barrier`
`mbarrier.init` initializes `mbarrier.barrier`
`mbarrier.arrive` performs an arrive-on operation on `mbarrier.barrier` and returns `mbarrier.barrier.token`
`mbarrier.arrive.nocomplete` performs a non-blocking arrive-on operation on `mbarrier.barrier` and returns `mbarrier.barrier.token`
`mbarrier.test_wait` waits on `mbarrier.barrier` and `mbarrier.barrier.token`
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D154090
Added:
Modified:
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 5dd37306990698..ad17d1874ef879 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -425,21 +425,22 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_Op<"mbarrier.try_wait.parity.share
def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
Results<(outs LLVM_Type:$res)>,
- Arguments<(ins LLVM_i64ptr_any:$addr, LLVM_Type:$token)> {
+ Arguments<(ins LLVM_i64ptr_any:$addr, LLVM_Type:$state)> {
string llvmBuilder = [{
- $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $token});
+ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state});
}];
- let assemblyFormat = "$addr `,` $token attr-dict `:` type(operands) `->` type($res)";
+ let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";
}
def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">,
Results<(outs LLVM_Type:$res)>,
- Arguments<(ins LLVM_i64ptr_shared:$addr, LLVM_Type:$token)> {
+ Arguments<(ins LLVM_i64ptr_shared:$addr, LLVM_Type:$state)> {
string llvmBuilder = [{
- $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $token});
+ $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state});
}];
- let assemblyFormat = "$addr `,` $token attr-dict `:` type(operands) `->` type($res)";
+ let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";
}
+
//===----------------------------------------------------------------------===//
// NVVM synchronization op definitions
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 41571fc0be060f..9e783d0c928e16 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -44,6 +44,11 @@ def NVGPU_Dialect : Dialect {
/// is a gpu::AddressSpaceAttr attribute with value 'workgroup`.
static bool hasSharedMemoryAddressSpace(MemRefType type);
+ /// Return true if the given Attribute has an integer address
+ /// space that matches the NVVM shared memory address space or
+ /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`.
+ static bool isSharedMemoryAddressSpace(Attribute type);
+
/// Defines the MemRef memory space attribute numeric value that indicates
/// a memref is located in global memory. This should correspond to the
/// value used in NVVM.
@@ -77,6 +82,24 @@ def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
}];
}
+def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> {
+ let summary = "mbarrier barrier type";
+ let description = [{
+ This is the type for a mbarrier object in shared memory that is used
+ to synchronize a variable number of threads.
+
+ The mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
+ can be initiated and invalidated.
+
+ See for more details:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object
+ }];
+ let parameters = (ins "Attribute":$memorySpace);
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+
//===----------------------------------------------------------------------===//
// NVGPU Op Definitions
//===----------------------------------------------------------------------===//
@@ -355,4 +378,95 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
}];
}
+def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
+ let summary = "Creates a `nvgpu.mbarrier` object.";
+ let description = [{
+ The Op generates an `mbarrier` object, which is a barrier created in
+ shared memory and supports various synchronization behaviors for threads.
+
+ The `mbarrier` object has the following type and alignment requirements:
+ Type: .b64, Alignment: 8, Memory space: .shared
+
+ Example:
+ ```mlir
+ %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins);
+ let results = (outs NVGPU_MBarrier:$barrier);
+ let assemblyFormat = [{
+ attr-dict `->` type($barrier)
+ }];
+}
+
+def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
+ let summary = "Initialize the `nvgpu.mbarrier`.";
+ let description = [{
+ The Op initializes the `mbarrier` object with the given number of threads.
+
+ Example:
+ ```mlir
+ %num_threads = gpu.block_dim x
+ %barrier = nvgpu.mbarrier.create -> !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrier:$barrier, Index:$count);
+ let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier)";
+}
+
+def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
+ let summary = "Checks if the `nvgpu.mbarrier` has completed its current phase.";
+ let description = [{
+ Checks whether the mbarrier object has completed the phase. It is is a
+ non-blocking instruction which tests for the completion of the phase.
+
+ Example:
+ ```mlir
+ %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrier:$barrier, NVGPU_MBarrierToken:$token);
+ let results = (outs I1:$waitComplete);
+ let assemblyFormat = "$barrier `,` $token attr-dict `:` type($barrier) `,` type($token)";
+}
+
+def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
+ let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive`.";
+ let description = [{
+ The Op performs arrive-on operation on the `mbarrier` object and returns a
+ `nvgpu.mbarrier.token`.
+
+ For more information, see
+ https://docs.nvidia.com/cuda/parallel-thread-execution/#arrive-on-operation-on-mbarrier-object
+
+ Example:
+ ```mlir
+ %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrier:$barrier);
+ let results = (outs NVGPU_MBarrierToken:$token);
+let assemblyFormat = "$barrier attr-dict `:` type($barrier) `->` type($token)";
+}
+
+def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
+ let summary = "Performs arrive operation on the `nvgpu.mbarrier.arrive.nocomplete` as non-blocking.";
+ let description = [{
+ The Op performs arrive-on operation on the `mbarrier` object and returns a
+ `nvgpu.mbarrier.token`.
+
+ The Op does not cause the `nvgpu.mbarrier` to complete its current phase.
+
+ Example:
+ ```mlir
+ %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
+ ```
+ }];
+ let arguments = (ins NVGPU_MBarrier:$barrier,
+ Index:$count);
+ let results = (outs NVGPU_MBarrierToken:$token);
+ let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier) `->` type($token)";
+}
+
#endif // NVGPU
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 5694e7c28de67f..b8adef23b06e6a 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
@@ -337,23 +338,74 @@ struct MmaSyncOptoNVVM : public ConvertOpToLLVMPattern<nvgpu::MmaSyncOp> {
}
};
+/// Returns whether mbarrier object has shared memory address space.
+static bool isMbarrierShared(nvgpu::MBarrierType barrierType) {
+ return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(
+ barrierType.getMemorySpace()));
+}
+
+/// Returns whether memory space attribute of the mbarrier object.
+static Attribute getMbarrierMemorySpace(RewriterBase &rewriter,
+ nvgpu::MBarrierType barrierType) {
+ Attribute memorySpace = {};
+ if (isMbarrierShared(barrierType)) {
+ memorySpace = rewriter.getI64IntegerAttr(
+ nvgpu::NVGPUDialect::kSharedMemoryAddressSpace);
+ }
+ return memorySpace;
+}
+
+/// Returns memref type of the mbarrier object. The type is defined in the
+/// MBarrierType.
+static MemRefType createMBarrierMemrefType(RewriterBase &rewriter,
+ nvgpu::MBarrierType barrierType) {
+ Attribute memorySpace = getMbarrierMemorySpace(rewriter, barrierType);
+ MemRefLayoutAttrInterface layout;
+ return MemRefType::get({1}, rewriter.getI64Type(), layout, memorySpace);
+}
+
+/// Returns the base pointer of the mbarrier object.
+static Value getMbarrierPtr(ConversionPatternRewriter &rewriter,
+ LLVMTypeConverter &typeConverter,
+ TypedValue<nvgpu::MBarrierType> barrier,
+ Value barrierMemref) {
+ MemRefType memrefType = createMBarrierMemrefType(rewriter, barrier.getType());
+ MemRefDescriptor memRefDescriptor(barrierMemref);
+ return memRefDescriptor.bufferPtr(rewriter, barrier.getLoc(), typeConverter,
+ memrefType);
+}
+
struct ConvertNVGPUToNVVMPass
: public impl::ConvertNVGPUToNVVMPassBase<ConvertNVGPUToNVVMPass> {
using Base::Base;
+ void getDependentDialects(DialectRegistry ®istry) const override {
+ registry
+ .insert<memref::MemRefDialect, LLVM::LLVMDialect, NVVM::NVVMDialect>();
+ }
+
void runOnOperation() override {
LowerToLLVMOptions options(&getContext());
options.useOpaquePointers = useOpaquePointers;
RewritePatternSet patterns(&getContext());
LLVMTypeConverter converter(&getContext(), options);
- /// device-side async tokens cannot be materialized in nvvm. We just convert
- /// them to a dummy i32 type in order to easily drop them during conversion.
+ IRRewriter rewriter(&getContext());
+ /// device-side async tokens cannot be materialized in nvvm. We just
+ /// convert them to a dummy i32 type in order to easily drop them during
+ /// conversion.
converter.addConversion([&](nvgpu::DeviceAsyncTokenType type) -> Type {
return converter.convertType(IntegerType::get(type.getContext(), 32));
});
+ converter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
+ return converter.convertType(IntegerType::get(type.getContext(), 64));
+ });
+ converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
+ return converter.convertType(createMBarrierMemrefType(rewriter, type));
+ });
populateNVGPUToNVVMConversionPatterns(converter, patterns);
LLVMConversionTarget target(getContext());
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
+ target.addLegalDialect<::mlir::memref::MemRefDialect>();
target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns))))
@@ -651,11 +703,164 @@ struct NVGPUAsyncWaitLowering
}
};
+/// Creates mbarrier object in shared memory
+struct NVGPUMBarrierCreateLowering
+ : public ConvertOpToLLVMPattern<nvgpu::MBarrierCreateOp> {
+ using ConvertOpToLLVMPattern<nvgpu::MBarrierCreateOp>::ConvertOpToLLVMPattern;
+
+ template <typename moduleT>
+ memref::GlobalOp generateGlobalBarrier(ConversionPatternRewriter &rewriter,
+ Operation *funcOp, moduleT moduleOp,
+ MemRefType barrierType) const {
+ SymbolTable symbolTable(moduleOp);
+ OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPoint(&moduleOp.front());
+ auto global = rewriter.create<memref::GlobalOp>(
+ funcOp->getLoc(), "__mbarrier",
+ /*sym_visibility=*/rewriter.getStringAttr("private"),
+ /*type=*/barrierType,
+ /*initial_value=*/ElementsAttr(),
+ /*constant=*/false,
+ /*alignment=*/rewriter.getI64IntegerAttr(8));
+ symbolTable.insert(global);
+ return global;
+ }
+
+ LogicalResult
+ matchAndRewrite(nvgpu::MBarrierCreateOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Operation *funcOp = op->getParentOp();
+ Operation *mOp = funcOp->getParentOp();
+ MemRefType barrierType =
+ createMBarrierMemrefType(rewriter, op.getBarrier().getType());
+
+ memref::GlobalOp global;
+ if (auto moduleOp = dyn_cast<gpu::GPUModuleOp>(mOp))
+ global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType);
+ else if (auto moduleOp = dyn_cast<ModuleOp>(mOp))
+ global = generateGlobalBarrier(rewriter, funcOp, moduleOp, barrierType);
+
+ rewriter.setInsertionPoint(op);
+ rewriter.replaceOpWithNewOp<memref::GetGlobalOp>(op, barrierType,
+ global.getName());
+ return success();
+ }
+};
+
+/// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init`
+struct NVGPUMBarrierInitLowering
+ : public ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp> {
+ using ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp>::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ rewriter.setInsertionPoint(op);
+ Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+ op.getBarrier(), adaptor.getBarrier());
+
+ Value count = adaptor.getCount();
+ if (!adaptor.getCount().getType().isInteger(32)) {
+ count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
+ rewriter.getI32Type(), count);
+ }
+
+ if (isMbarrierShared(op.getBarrier().getType())) {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
+ count);
+ } else {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierInitOp>(op, barrier, count);
+ }
+ return success();
+ }
+};
+
+/// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive`
+struct NVGPUMBarrierArriveLowering
+ : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp> {
+ using ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp>::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+ op.getBarrier(), adaptor.getBarrier());
+ Type tokenType = getTypeConverter()->convertType(
+ nvgpu::MBarrierTokenType::get(op->getContext()));
+ if (isMbarrierShared(op.getBarrier().getType())) {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType,
+ barrier);
+ } else {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType,
+ barrier);
+ }
+ return success();
+ }
+};
+
+/// Lowers `nvgpu.mbarrier.arrive.nocomplete` to
+/// `nvvm.mbarrier.arrive.nocomplete`
+struct NVGPUMBarrierArriveNoCompleteLowering
+ : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveNoCompleteOp> {
+ using ConvertOpToLLVMPattern<
+ nvgpu::MBarrierArriveNoCompleteOp>::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+ op.getBarrier(), adaptor.getBarrier());
+ Type tokenType = getTypeConverter()->convertType(
+ nvgpu::MBarrierTokenType::get(op->getContext()));
+ Value count = adaptor.getCount();
+ if (!adaptor.getCount().getType().isInteger(32)) {
+ count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
+ rewriter.getI32Type(), count);
+ }
+ if (isMbarrierShared(op.getBarrier().getType())) {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
+ op, tokenType, barrier, count);
+ } else {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>(
+ op, tokenType, barrier, count);
+ }
+ return success();
+ }
+};
+
+/// Lowers `nvgpu.mbarrier.test.wait` to `nvvm.mbarrier.test.wait`
+struct NVGPUMBarrierTestWaitLowering
+ : public ConvertOpToLLVMPattern<nvgpu::MBarrierTestWaitOp> {
+ using ConvertOpToLLVMPattern<
+ nvgpu::MBarrierTestWaitOp>::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+ op.getBarrier(), adaptor.getBarrier());
+ Type retType = rewriter.getI1Type();
+ if (isMbarrierShared(op.getBarrier().getType())) {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>(
+ op, retType, barrier, adaptor.getToken());
+ } else {
+ rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(
+ op, retType, barrier, adaptor.getToken());
+ }
+ return success();
+ }
+};
+
} // namespace
void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns) {
- patterns.add<MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
- NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
- NVGPUMmaSparseSyncLowering>(converter);
+ patterns.add<
+ NVGPUMBarrierCreateLowering, // nvgpu.mbarrier.create
+ NVGPUMBarrierInitLowering, // nvgpu.mbarrier.init
+ NVGPUMBarrierArriveLowering, // nvgpu.mbarrier.arrive
+ NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.no_complete
+ NVGPUMBarrierTestWaitLowering, // nvgpu.try_wait_parity
+ MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
+ NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
+ NVGPUMmaSparseSyncLowering>(converter);
}
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 08384debaaf7c0..c3a62f468749ee 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -34,8 +34,7 @@ void nvgpu::NVGPUDialect::initialize() {
>();
}
-bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
- Attribute memorySpace = type.getMemorySpace();
+bool nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
if (!memorySpace)
return false;
if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
@@ -45,6 +44,11 @@ bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
return false;
}
+bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
+ Attribute memorySpace = type.getMemorySpace();
+ return isSharedMemoryAddressSpace(memorySpace);
+}
+
//===----------------------------------------------------------------------===//
// NVGPU_DeviceAsyncCopyOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 41369da08b8dcf..7a7f65f3d945bd 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -503,3 +503,58 @@ func.func @mma_sp_sync_i8_16864(%arg0: vector<4x4xi8>,
(vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
return %d : vector<2x2xi32>
}
+
+// -----
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!tokenType = !nvgpu.mbarrier.token
+
+// CHECK-LABEL: func @mbarrier
+func.func @mbarrier() {
+ %num_threads = arith.constant 128 : index
+
+ // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier : memref<1xi64, 3>
+ %barrier = nvgpu.mbarrier.create -> !barrierType
+
+ // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+ nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+
+ // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]]
+ %token = nvgpu.mbarrier.arrive %barrier : !barrierType -> !tokenType
+
+ // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]]
+ %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType
+
+ func.return
+}
+
+// -----
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!tokenType = !nvgpu.mbarrier.token
+
+// CHECK-LABEL: func @mbarrier_nocomplete
+func.func @mbarrier_nocomplete() {
+ %num_threads = arith.constant 128 : index
+ %count = arith.constant 12 : index
+
+ // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier : memref<1xi64, 3>
+ %barrier = nvgpu.mbarrier.create -> !barrierType
+
+ // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
+ nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+
+ // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]]
+ %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !barrierType -> !tokenType
+
+ // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]]
+ %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType
+
+ func.return
+}
More information about the Mlir-commits
mailing list