[Mlir-commits] [mlir] 17649a7 - [MLIR][NVGPU] Introduce `nvgpu.mbarrier.group` for multiple mbarrier use (#65951)
llvmlistbot at llvm.org
Fri Sep 22 08:09:48 PDT 2023
Author: Guray Ozen
Date: 2023-09-22T17:09:43+02:00
New Revision: 17649a7726d3ce1ddba2bbf3ef73af03ea204753
URL: https://github.com/llvm/llvm-project/commit/17649a7726d3ce1ddba2bbf3ef73af03ea204753
DIFF: https://github.com/llvm/llvm-project/commit/17649a7726d3ce1ddba2bbf3ef73af03ea204753.diff
LOG: [MLIR][NVGPU] Introduce `nvgpu.mbarrier.group` for multiple mbarrier use (#65951)
A common practice is to create multiple `mbarrier` objects; see the
example below. This is particularly valuable in scenarios like
software pipelining for GEMM, where we need to generate multiple
barriers and dynamically index, use, and wait on them in a loop.
This PR generalizes the `nvgpu.mbarrier.barrier` type into
`nvgpu.mbarrier.group`. All `mbarrier`-related Ops now use this type and
take an index into the group, so they can manage multiple barriers
seamlessly.
Keeping `num_barriers` static (e.g. `num_barriers = 4`) lets us place the
mbarrier objects in static shared memory. We could make the value
dynamic, but that would require dynamic shared memory and complicate the
codegen.
```
%barriers = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c0], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c1], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c2], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c3], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
...
scf.for %i = %c0 to %n step %c1 {
nvgpu.mbarrier.try_wait %barriers[ (i % 4) ] ...
// ... Do work once mbarrier is ready
nvgpu.mbarrier.arrive.expect_tx %barriers[ ((i + 3) % 4) ] ...
}
```
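Because `num_barriers` is a static type parameter, the whole group can be backed
by a single statically sized shared-memory buffer (a `memref<4xi64, 3>` for
`num_barriers = 4`, per `getMBarrierMemrefType` in the diff below), and each
indexed access lowers to a `llvm.getelementptr` off that buffer's base pointer.
A minimal sketch of the post-patch IR, with made-up names and constants, just to
show the indexed form:
```
!group4 = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 4>

func.func @init_barrier_group() {
  %num_threads = arith.constant 128 : index
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  // One create covers all four barriers; it lowers to a single
  // memref.get_global of a memref<4xi64, 3> in shared memory.
  %barriers = nvgpu.mbarrier.create -> !group4
  // The slot index can be any `index` value, including a loop induction
  // variable; the lowering turns it into a GEP into the shared buffer.
  scf.for %j = %c0 to %c4 step %c1 {
    nvgpu.mbarrier.init %barriers[%j], %num_threads : !group4
  }
  func.return
}
```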
This gives an mbarrier usage pattern like the one below:
```
expect_tx[0]
expect_tx[1]
expect_tx[2]
Loop:
try_wait mbarrier[0], expect_tx[3]
try_wait mbarrier[1], expect_tx[0]
try_wait mbarrier[2], expect_tx[1]
try_wait mbarrier[3], expect_tx[2]
...
```
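Written with the actual post-patch syntax, the steady-state loop looks roughly
like the sketch below. It is not part of the patch: it continues inside the same
function as the sketch above, the trip count and tx count are placeholders, and
phase tracking across loop iterations is elided for brevity.
```
  %n       = arith.constant 128 : index      // loop trip count (placeholder)
  %phase   = arith.constant 0 : index        // phase tracking elided for brevity
  %ticks   = arith.constant 10000000 : index // retry budget, as in the tests
  %txcount = arith.constant 256 : index      // expected bytes per stage (placeholder)
  scf.for %i = %c0 to %n step %c1 {
    // Wait on the barrier owned by this pipeline stage.
    %waitId = arith.remui %i, %c4 : index
    nvgpu.mbarrier.try_wait.parity %barriers[%waitId], %phase, %ticks : !group4
    // ... do work once the data guarded by %barriers[%waitId] is ready ...

    // Re-arm the barrier three stages ahead with the expected byte count.
    %c3 = arith.constant 3 : index
    %ahead = arith.addi %i, %c3 : index
    %armId = arith.remui %ahead, %c4 : index
    nvgpu.mbarrier.arrive.expect_tx %barriers[%armId], %txcount : !group4
  }
```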
Added:
Modified:
mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
mlir/test/Dialect/NVGPU/tmaload-transform.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h b/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h
index 8c5667cd417f0d4..4b8d5c5fe2a893d 100644
--- a/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h
+++ b/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h
@@ -23,15 +23,15 @@ class Pass;
#include "mlir/Conversion/Passes.h.inc"
namespace nvgpu {
-class MBarrierType;
+class MBarrierGroupType;
/// Returns the memory space attribute of the mbarrier object.
Attribute getMbarrierMemorySpace(MLIRContext *context,
- MBarrierType barrierType);
+ MBarrierGroupType barrierType);
/// Return the memref type that can be used to represent an mbarrier object.
MemRefType getMBarrierMemrefType(MLIRContext *context,
- MBarrierType barrierType);
+ MBarrierGroupType barrierType);
} // namespace nvgpu
void populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 90381648dac6acc..5fcf08c6d3e1d7a 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -135,20 +135,26 @@ def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken",
}];
}
-def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> {
+def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> {
let summary = "mbarrier barrier type";
let description = [{
- This is the type for a mbarrier object in shared memory that is used
- to synchronize a variable number of threads.
+ This is the type for one or more mbarrier object in shared memory that is
+ used to synchronize a variable number of threads.
- The mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
- can be initiated and invalidated.
+ If `num_barriers` is not set, the number of mbarrier objects is 1.
- See for more details:
- https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object
+ A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object
+ can be initiated and invalidated.
+
+ [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object)
}];
- let parameters = (ins "Attribute":$memorySpace);
+ let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers);
let assemblyFormat = "`<` struct(params) `>`";
+ let builders = [
+ TypeBuilder<(ins "Attribute":$memorySpace), [{
+ return $_get($_ctxt, memorySpace, 1);
+ }]>
+ ];
}
def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
@@ -486,7 +492,7 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
let summary = "Creates a `nvgpu.mbarrier` object.";
let description = [{
- The Op generates an `mbarrier` object, which is a barrier created in
+ The Op generates one or more `mbarrier` object, which is a barrier created in
shared memory and supports various synchronization behaviors for threads.
The `mbarrier` object has the following type and alignment requirements:
@@ -498,9 +504,9 @@ def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> {
```
}];
let arguments = (ins);
- let results = (outs NVGPU_MBarrier:$barrier);
+ let results = (outs NVGPU_MBarrierGroup:$barriers);
let assemblyFormat = [{
- attr-dict `->` type($barrier)
+ attr-dict `->` type($barriers)
}];
}
@@ -516,8 +522,8 @@ def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> {
nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
```
}];
- let arguments = (ins NVGPU_MBarrier:$barrier, Index:$count);
- let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier)";
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers)";
}
def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
@@ -531,9 +537,9 @@ def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> {
%isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token
```
}];
- let arguments = (ins NVGPU_MBarrier:$barrier, NVGPU_MBarrierToken:$token);
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId);
let results = (outs I1:$waitComplete);
- let assemblyFormat = "$barrier `,` $token attr-dict `:` type($barrier) `,` type($token)";
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)";
}
def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
@@ -550,9 +556,9 @@ def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> {
%token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
```
}];
- let arguments = (ins NVGPU_MBarrier:$barrier);
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId);
let results = (outs NVGPU_MBarrierToken:$token);
-let assemblyFormat = "$barrier attr-dict `:` type($barrier) `->` type($token)";
+let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)";
}
def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> {
@@ -568,10 +574,10 @@ def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []
%token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token
```
}];
- let arguments = (ins NVGPU_MBarrier:$barrier,
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId,
Index:$count);
let results = (outs NVGPU_MBarrierToken:$token);
- let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier) `->` type($token)";
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)";
}
def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
@@ -591,9 +597,8 @@ def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> {
nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
```
}];
- let arguments = (ins NVGPU_MBarrier:$barrier,
- Index:$txcount);
- let assemblyFormat = "$barrier `,` $txcount attr-dict `:` type($barrier)";
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount attr-dict `:` type($barriers)";
}
def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
@@ -610,8 +615,8 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
```
}];
- let arguments = (ins NVGPU_MBarrier:$barrier, Index:$phase, Index:$ticks);
- let assemblyFormat = "$barrier `,` $phase `,` $ticks attr-dict `:` type($barrier)";
+ let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$phase, Index:$ticks, Index:$mbarId);
+ let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phase `,` $ticks attr-dict `:` type($barriers)";
}
def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
@@ -626,12 +631,13 @@ def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
The Op uses `$barrier` mbarrier based completion mechanism.
}];
let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
- NVGPU_MBarrier:$barrier,
+ NVGPU_MBarrierGroup:$barriers,
NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
- Variadic<Index>:$coordinates);
+ Variadic<Index>:$coordinates,
+ Index:$mbarId);
let assemblyFormat = [{
- $tensorMapDescriptor `[` $coordinates `]` `,` $barrier `to` $dst
- attr-dict `:` type($tensorMapDescriptor) `,` type($barrier) `->` type($dst)
+ $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` `to` $dst
+ attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) `->` type($dst)
}];
let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index f74aa05c0c4c4ff..4d1f6641af6dca3 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -18,8 +18,10 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -218,14 +220,14 @@ static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter,
}
/// Returns whether mbarrier object has shared memory address space.
-static bool isMbarrierShared(nvgpu::MBarrierType barrierType) {
+static bool isMbarrierShared(nvgpu::MBarrierGroupType barrierType) {
return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(
barrierType.getMemorySpace()));
}
/// Returns the memory space attribute of the mbarrier object.
Attribute nvgpu::getMbarrierMemorySpace(MLIRContext *context,
- nvgpu::MBarrierType barrierType) {
+ nvgpu::MBarrierGroupType barrierType) {
Attribute memorySpace = {};
if (isMbarrierShared(barrierType)) {
memorySpace =
@@ -236,25 +238,13 @@ Attribute nvgpu::getMbarrierMemorySpace(MLIRContext *context,
}
/// Returns memref type of the mbarrier object. The type is defined in the
-/// MBarrierType.
+/// MBarrierGroupType.
MemRefType nvgpu::getMBarrierMemrefType(MLIRContext *context,
- nvgpu::MBarrierType barrierType) {
+ nvgpu::MBarrierGroupType barrierType) {
Attribute memorySpace = nvgpu::getMbarrierMemorySpace(context, barrierType);
MemRefLayoutAttrInterface layout;
- return MemRefType::get({1}, IntegerType::get(context, 64), layout,
- memorySpace);
-}
-
-/// Returns the base pointer of the mbarrier object.
-static Value getMbarrierPtr(ConversionPatternRewriter &rewriter,
- const LLVMTypeConverter &typeConverter,
- TypedValue<nvgpu::MBarrierType> barrier,
- Value barrierMemref) {
- MemRefType memrefType =
- nvgpu::getMBarrierMemrefType(rewriter.getContext(), barrier.getType());
- MemRefDescriptor memRefDescriptor(barrierMemref);
- return memRefDescriptor.bufferPtr(rewriter, barrier.getLoc(), typeConverter,
- memrefType);
+ return MemRefType::get({barrierType.getNumBarriers()},
+ IntegerType::get(context, 64), layout, memorySpace);
}
namespace {
@@ -441,7 +431,7 @@ struct ConvertNVGPUToNVVMPass
[&](nvgpu::WarpgroupMatrixDescriptorType type) -> Type {
return converter.convertType(IntegerType::get(type.getContext(), 64));
});
- converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
+ converter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type {
return converter.convertType(
nvgpu::getMBarrierMemrefType(rewriter.getContext(), type));
});
@@ -779,7 +769,7 @@ struct NVGPUMBarrierCreateLowering
ConversionPatternRewriter &rewriter) const override {
Operation *funcOp = op->getParentOp();
MemRefType barrierType = nvgpu::getMBarrierMemrefType(
- rewriter.getContext(), op.getBarrier().getType());
+ rewriter.getContext(), op.getBarriers().getType());
memref::GlobalOp global;
if (auto moduleOp = funcOp->getParentOfType<gpu::GPUModuleOp>())
@@ -794,21 +784,37 @@ struct NVGPUMBarrierCreateLowering
}
};
+/// Base class for lowering mbarrier operations to nvvm intrinsics.
+template <typename SourceOp>
+struct MBarrierBasePattern : public ConvertOpToLLVMPattern<SourceOp> {
+public:
+ using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
+ /// Returns the base pointer of the mbarrier object.
+ Value getMbarrierPtr(Operation *op, nvgpu::MBarrierGroupType mbarType,
+ Value memrefDesc, Value mbarId,
+ ConversionPatternRewriter &rewriter) const {
+ MemRefType mbarrierMemrefType =
+ nvgpu::getMBarrierMemrefType(rewriter.getContext(), mbarType);
+ return ConvertToLLVMPattern::getStridedElementPtr(
+ op->getLoc(), mbarrierMemrefType, memrefDesc, {mbarId}, rewriter);
+ return memrefDesc;
+ }
+};
+
/// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init`
struct NVGPUMBarrierInitLowering
- : public ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp> {
- using ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp>::ConvertOpToLLVMPattern;
+ : public MBarrierBasePattern<nvgpu::MBarrierInitOp> {
+ using MBarrierBasePattern<nvgpu::MBarrierInitOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ nvgpu::MBarrierGroupType mbarrierType = op.getBarriers().getType();
rewriter.setInsertionPoint(op);
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
-
+ Value barrier = getMbarrierPtr(op, mbarrierType, adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
-
- if (isMbarrierShared(op.getBarrier().getType())) {
+ if (isMbarrierShared(mbarrierType)) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
count);
} else {
@@ -820,16 +826,17 @@ struct NVGPUMBarrierInitLowering
/// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive`
struct NVGPUMBarrierArriveLowering
- : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp> {
- using ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp>::ConvertOpToLLVMPattern;
+ : public MBarrierBasePattern<nvgpu::MBarrierArriveOp> {
+ using MBarrierBasePattern<nvgpu::MBarrierArriveOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
+ Value barrier =
+ getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
Type tokenType = getTypeConverter()->convertType(
nvgpu::MBarrierTokenType::get(op->getContext()));
- if (isMbarrierShared(op.getBarrier().getType())) {
+ if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType,
barrier);
} else {
@@ -843,19 +850,19 @@ struct NVGPUMBarrierArriveLowering
/// Lowers `nvgpu.mbarrier.arrive.nocomplete` to
/// `nvvm.mbarrier.arrive.nocomplete`
struct NVGPUMBarrierArriveNoCompleteLowering
- : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveNoCompleteOp> {
- using ConvertOpToLLVMPattern<
- nvgpu::MBarrierArriveNoCompleteOp>::ConvertOpToLLVMPattern;
-
+ : public MBarrierBasePattern<nvgpu::MBarrierArriveNoCompleteOp> {
+ using MBarrierBasePattern<
+ nvgpu::MBarrierArriveNoCompleteOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
+ Value barrier =
+ getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
Type tokenType = getTypeConverter()->convertType(
nvgpu::MBarrierTokenType::get(op->getContext()));
Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
- if (isMbarrierShared(op.getBarrier().getType())) {
+ if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
op, tokenType, barrier, count);
} else {
@@ -868,17 +875,16 @@ struct NVGPUMBarrierArriveNoCompleteLowering
/// Lowers `nvgpu.mbarrier.test.wait` to `nvvm.mbarrier.test.wait`
struct NVGPUMBarrierTestWaitLowering
- : public ConvertOpToLLVMPattern<nvgpu::MBarrierTestWaitOp> {
- using ConvertOpToLLVMPattern<
- nvgpu::MBarrierTestWaitOp>::ConvertOpToLLVMPattern;
-
+ : public MBarrierBasePattern<nvgpu::MBarrierTestWaitOp> {
+ using MBarrierBasePattern<nvgpu::MBarrierTestWaitOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
+ Value barrier =
+ getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
Type retType = rewriter.getI1Type();
- if (isMbarrierShared(op.getBarrier().getType())) {
+ if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>(
op, retType, barrier, adaptor.getToken());
} else {
@@ -890,18 +896,18 @@ struct NVGPUMBarrierTestWaitLowering
};
struct NVGPUMBarrierArriveExpectTxLowering
- : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveExpectTxOp> {
- using ConvertOpToLLVMPattern<
- nvgpu::MBarrierArriveExpectTxOp>::ConvertOpToLLVMPattern;
-
+ : public MBarrierBasePattern<nvgpu::MBarrierArriveExpectTxOp> {
+ using MBarrierBasePattern<
+ nvgpu::MBarrierArriveExpectTxOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::MBarrierArriveExpectTxOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
+ Value barrier =
+ getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
Value txcount = truncToI32(rewriter, op->getLoc(), adaptor.getTxcount());
- if (isMbarrierShared(op.getBarrier().getType())) {
+ if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxSharedOp>(
op, barrier, txcount);
return success();
@@ -914,19 +920,19 @@ struct NVGPUMBarrierArriveExpectTxLowering
};
struct NVGPUMBarrierTryWaitParityLowering
- : public ConvertOpToLLVMPattern<nvgpu::MBarrierTryWaitParityOp> {
- using ConvertOpToLLVMPattern<
- nvgpu::MBarrierTryWaitParityOp>::ConvertOpToLLVMPattern;
-
+ : public MBarrierBasePattern<nvgpu::MBarrierTryWaitParityOp> {
+ using MBarrierBasePattern<
+ nvgpu::MBarrierTryWaitParityOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::MBarrierTryWaitParityOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
+ Value barrier =
+ getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
Value ticks = truncToI32(rewriter, op->getLoc(), adaptor.getTicks());
Value phase = truncToI32(rewriter, op->getLoc(), adaptor.getPhase());
- if (isMbarrierShared(op.getBarrier().getType())) {
+ if (isMbarrierShared(op.getBarriers().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierTryWaitParitySharedOp>(
op, barrier, phase, ticks);
return success();
@@ -939,16 +945,17 @@ struct NVGPUMBarrierTryWaitParityLowering
};
struct NVGPUTmaAsyncLoadOpLowering
- : public ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp> {
- using ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp>::ConvertOpToLLVMPattern;
+ : public MBarrierBasePattern<nvgpu::TmaAsyncLoadOp> {
+ using MBarrierBasePattern<nvgpu::TmaAsyncLoadOp>::MBarrierBasePattern;
LogicalResult
matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto srcMemrefType = cast<MemRefType>(op.getDst().getType());
Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType,
adaptor.getDst(), {}, rewriter);
- Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
- op.getBarrier(), adaptor.getBarrier());
+ Value barrier =
+ getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(),
+ adaptor.getMbarId(), rewriter);
SmallVector<Value> coords = adaptor.getCoordinates();
for (auto [index, value] : llvm::enumerate(coords)) {
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index 680c21ab74fe020..94d7d565ff1a905 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -70,7 +70,7 @@ void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
LLVM::LLVMStructType::getLiteral(type.getContext(), structBody);
return llvmTypeConverter.convertType(convertedType);
});
- llvmTypeConverter.addConversion([&](nvgpu::MBarrierType type) -> Type {
+ llvmTypeConverter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type {
return llvmTypeConverter.convertType(
getMBarrierMemrefType(type.getContext(), type));
});
@@ -818,7 +818,7 @@ struct HopperBuilder {
HopperBuilder(RewriterBase &rewriter, Location loc)
: rewriter(rewriter), loc(loc) {}
- TypedValue<nvgpu::MBarrierType>
+ TypedValue<nvgpu::MBarrierGroupType>
buildAndInitBarrierInSharedMemory(OpFoldResult numThreads);
/// Create tma descriptor op to initiate transfer from global to shared
@@ -832,9 +832,9 @@ struct HopperBuilder {
OpFoldResult
buildTmaAsyncLoad(TypedValue<nvgpu::TensorMapDescriptorType> globalDesc,
TypedValue<MemRefType> sharedMemref,
- TypedValue<nvgpu::MBarrierType> barrier,
+ TypedValue<nvgpu::MBarrierGroupType> barrier,
SmallVectorImpl<Operation *> &loadOps);
- void buildBarrierArriveTx(TypedValue<nvgpu::MBarrierType> barrier,
+ void buildBarrierArriveTx(TypedValue<nvgpu::MBarrierGroupType> barrier,
ArrayRef<OpFoldResult> sizes);
/// If threadIdx.x == 0 does TMA request + wait, else just wait.
@@ -843,9 +843,9 @@ struct HopperBuilder {
SmallVector<Operation *> buildPredicateLoadsOnThread0(
ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors,
ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
- TypedValue<nvgpu::MBarrierType> barrier);
+ TypedValue<nvgpu::MBarrierGroupType> barrier);
- void buildTryWaitParity(TypedValue<nvgpu::MBarrierType> barrier);
+ void buildTryWaitParity(TypedValue<nvgpu::MBarrierGroupType> barrier);
RewriterBase &rewriter;
Location loc;
@@ -854,7 +854,7 @@ struct HopperBuilder {
SmallVector<Operation *> HopperBuilder::buildPredicateLoadsOnThread0(
ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors,
ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
- TypedValue<nvgpu::MBarrierType> barrier) {
+ TypedValue<nvgpu::MBarrierGroupType> barrier) {
SmallVector<Operation *> loadOps;
Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
Value tidx = rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x);
@@ -895,15 +895,18 @@ static Attribute getSharedAddressSpaceAttribute(OpBuilder &b) {
// return b.getI64IntegerAttr(static_cast<int64_t>(kSharedMemorySpace));
}
-TypedValue<nvgpu::MBarrierType>
+TypedValue<nvgpu::MBarrierGroupType>
HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) {
auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
Value barrier = rewriter.create<nvgpu::MBarrierCreateOp>(
- loc, nvgpu::MBarrierType::get(rewriter.getContext(), sharedMemorySpace));
+ loc,
+ nvgpu::MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace));
+ Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
rewriter.create<nvgpu::MBarrierInitOp>(
- loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads));
+ loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads),
+ zero);
rewriter.create<gpu::BarrierOp>(loc);
- return cast<TypedValue<nvgpu::MBarrierType>>(barrier);
+ return cast<TypedValue<nvgpu::MBarrierGroupType>>(barrier);
}
TypedValue<nvgpu::TensorMapDescriptorType>
@@ -938,12 +941,12 @@ HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
OpFoldResult HopperBuilder::buildTmaAsyncLoad(
TypedValue<nvgpu::TensorMapDescriptorType> globalDesc,
TypedValue<MemRefType> sharedMemref,
- TypedValue<nvgpu::MBarrierType> barrier,
+ TypedValue<nvgpu::MBarrierGroupType> barrier,
SmallVectorImpl<Operation *> &loadOps) {
MLIRContext *ctx = rewriter.getContext();
Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
Operation *loadOp = rewriter.create<nvgpu::TmaAsyncLoadOp>(
- loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero});
+ loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}, zero);
loadOps.push_back(loadOp);
auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref);
SmallVector<AffineExpr> symbols(mixedSizes.size());
@@ -957,7 +960,7 @@ OpFoldResult HopperBuilder::buildTmaAsyncLoad(
}
void HopperBuilder::buildBarrierArriveTx(
- TypedValue<nvgpu::MBarrierType> barrier,
+ TypedValue<nvgpu::MBarrierGroupType> barrier,
ArrayRef<OpFoldResult> mixedSizes) {
assert(!mixedSizes.empty() && "expecte non-empty sizes");
MLIRContext *ctx = rewriter.getContext();
@@ -967,19 +970,21 @@ void HopperBuilder::buildBarrierArriveTx(
OpFoldResult size =
affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, mixedSizes);
Value sizeVal = getValueOrCreateConstantIndexOp(rewriter, loc, size);
- rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal);
+ Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+ rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal, zero);
}
void HopperBuilder::buildTryWaitParity(
- TypedValue<nvgpu::MBarrierType> barrier) {
+ TypedValue<nvgpu::MBarrierGroupType> barrier) {
Value parity = rewriter.create<arith::ConstantIndexOp>(loc, 0);
// 10M is an arbitrary, not too small or too big number to specify the number
// of ticks before retry.
// TODO: hoist this in a default dialect constant.
Value ticksBeforeRetry =
rewriter.create<arith::ConstantIndexOp>(loc, 10000000);
+ Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
rewriter.create<nvgpu::MBarrierTryWaitParityOp>(loc, barrier, parity,
- ticksBeforeRetry);
+ ticksBeforeRetry, zero);
}
//===----------------------------------------------------------------------===//
@@ -1013,7 +1018,7 @@ SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) {
ArrayRef<OpFoldResult>{launchOp.getBlockSizeX(), launchOp.getBlockSizeY(),
launchOp.getBlockSizeZ()});
- TypedValue<nvgpu::MBarrierType> barrier =
+ TypedValue<nvgpu::MBarrierGroupType> barrier =
buildAndInitBarrierInSharedMemory(numThreads);
SmallVector<TypedValue<MemRefType>> shmems;
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index f011007e040ce9c..8c2f8dbbd5ad9a3 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -470,28 +470,34 @@ func.func @mma_sp_sync_i8_16864(%arg0: vector<4x4xi8>,
return %d : vector<2x2xi32>
}
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
// CHECK-LABEL: func @mbarrier
func.func @mbarrier() {
%num_threads = arith.constant 128 : index
+ // CHECK: %[[c0:.+]] = arith.constant 0 : index
+ // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64
+ %c0 = arith.constant 0 : index
// CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3>
%barrier = nvgpu.mbarrier.create -> !barrierType
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
- // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
- nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+ nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
- // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]]
- %token = nvgpu.mbarrier.arrive %barrier : !barrierType -> !tokenType
+ %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType
- // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]]
- %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType
+ %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType
func.return
}
@@ -500,63 +506,96 @@ func.func @mbarrier() {
func.func @mbarrier_nocomplete() {
%num_threads = arith.constant 128 : index
%count = arith.constant 12 : index
+ // CHECK: %[[c0:.+]] = arith.constant 0 : index
+ // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64
+ %c0 = arith.constant 0 : index
// CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3>
%barrier = nvgpu.mbarrier.create -> !barrierType
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
- // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
- nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+ nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
- // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]]
- %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !barrierType -> !tokenType
+ %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType
- // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]]
- %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType
+ %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType
func.return
}
+// CHECK-LABEL: func @mbarrier_wait
+func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, %token : !tokenType) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %n = arith.constant 100 : index
+
+ %numBarriers = arith.constant 5 : index
+
+ scf.for %i = %c0 to %n step %c1 {
+// CHECK: %[[c5:.+]] = arith.constant 5 : index
+// CHECK: scf.for %[[i:.*]] =
+// CHECK: %[[S2:.+]] = arith.remui %[[i]], %[[c5]] : index
+// CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64
+// CHECK: %[[S4:.+]] = llvm.extractvalue %0[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
+ %mbarId = arith.remui %i, %numBarriers : index
+ %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, !tokenType
+ }
+ return
+}
+
// CHECK-LABEL: func @mbarrier_txcount
func.func @mbarrier_txcount() {
- %num_threads = arith.constant 128 : index
+ %num_threads = arith.constant 128 : index
+ // CHECK: %[[c0:.+]] = arith.constant 0 : index
+ // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64
+ %c0 = arith.constant 0 : index
// CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3>
%barrier = nvgpu.mbarrier.create -> !barrierType
// CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
- // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.init.shared %[[barPtr]]
- nvgpu.mbarrier.init %barrier, %num_threads : !barrierType
+ nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType
- %c0 = arith.constant 0 : index
%tidxreg = nvvm.read.ptx.sreg.tid.x : i32
%tidx = arith.index_cast %tidxreg : i32 to index
%cnd = arith.cmpi eq, %tidx, %c0 : index
scf.if %cnd {
%txcount = arith.constant 256 : index
- // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]]
- nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType
+ nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !barrierType
scf.yield
} else {
%txcount = arith.constant 0 : index
- // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]]
- nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType
+ nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !barrierType
scf.yield
}
%phase = arith.constant 0 : index
%ticks = arith.constant 10000000 : index
- // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+ // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64
// CHECK: nvvm.mbarrier.try_wait.parity.shared %[[barPtr3]]
- nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType
+ nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase, %ticks : !barrierType
func.return
}
@@ -567,7 +606,7 @@ func.func @mbarrier_txcount() {
!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = interleave_16b>
!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
-!mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
%buffer1d: memref<128xf32,3>,
%buffer2d: memref<32x32xf32,3>,
@@ -575,18 +614,19 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d
%buffer4d: memref<2x2x32x32xf32,3>,
%buffer5d: memref<2x2x2x32x32xf32,3>,
%mbarrier: !mbarrier) {
+ %c0 = arith.constant 0 : index
%crd0 = arith.constant 0 : index
%crd1 = arith.constant 0 : index
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}]
- nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3>
+ nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier[%c0] to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3>
func.return
}
@@ -621,12 +661,12 @@ module @mymodule {
%rhsShmem3 = memref.subview %rhsShmem2[1,0,0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16,3> to memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3>
%rhsShmem = memref.subview %rhsShmem3[0,0,0][1, 64, 128][1, 1, 1] : memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3> to !shmemrhs
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global
- nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+ nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
// CHECK: %[[desc:.+]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[c8192:.+]] = llvm.mlir.constant(8192 : index) : i64
// CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32
- nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+ nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
return
}
}
diff --git a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
index 646008b64f794f4..30f8c45709bcd98 100644
--- a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
+++ b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
@@ -34,7 +34,7 @@ func.func @main() {
%out_1 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, #gpu.address_space<workgroup>>
// CHECK: %[[B:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>
- // CHECK: nvgpu.mbarrier.init %[[B]], %{{.*}} : <memorySpace = #gpu.address_space<workgroup>
+ // CHECK: nvgpu.mbarrier.init %[[B]][%{{.*}}], %{{.*}} : <memorySpace = #gpu.address_space<workgroup>
// CHECK: gpu.barrier
//
// CHECK: %[[c0:.*]] = arith.constant 0 : index
@@ -44,27 +44,27 @@ func.func @main() {
// CHECK: scf.if %[[CMP]] {
//
// CHECK: %[[c0_7:.*]] = arith.constant 0 : index
- // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]] to %[[G1]]
+ // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]][%{{.*}}] to %[[G1]]
// CHECK-SAME: : <tensor = memref<64x8xf32, #gpu.address_space<workgroup>>,
// CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>
// CHECK-SAME: -> memref<64x8xf32, #gpu.address_space<workgroup>>
//
// CHECK: %[[c0_8:.*]] = arith.constant 0 : index
- // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]] to %[[G2]]
+ // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]][%{{.*}}] to %[[G2]]
// CHECK-SAME: : <tensor = memref<8x128xf32, #gpu.address_space<workgroup>>,
// CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>
// CHECK-SAME: -> memref<8x128xf32, #gpu.address_space<workgroup>>
//
// CHECK: %[[c6144:.*]] = arith.constant 6144 : index
- // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]], %[[c6144]] : <memorySpace = #gpu.address_space<workgroup>
+ // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c6144]] : <memorySpace = #gpu.address_space<workgroup>
// CHECK: } else {
// CHECK: %[[c0_7:.*]] = arith.constant 0 : index
- // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]], %[[c0_7]] : <memorySpace = #gpu.address_space<workgroup>
+ // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c0_7]] : <memorySpace = #gpu.address_space<workgroup>
// CHECK: }
//
// CHECK: %[[c0_6:.*]] = arith.constant 0 : index
// CHECK: %[[c10000000:.*]] = arith.constant 10000000 : index
- // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]], %[[c0_6]], %[[c10000000]] : <memorySpace = #gpu.address_space<workgroup>
+ // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]][%{{.*}}], %[[c0_6]], %[[c10000000]] : <memorySpace = #gpu.address_space<workgroup>
/// Both copies are matched and end up in the same async group.
linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, #gpu.address_space<workgroup>>)