[Mlir-commits] [flang] [mlir] [MLIR][NVVM] Split nvvm.barrier into nvvm.barrier and nvvm.barrier.reduction (PR #199404)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Sun May 24 01:26:20 PDT 2026
https://github.com/xys-syx created https://github.com/llvm/llvm-project/pull/199404
This PR is the second step of the three-PR plan following the direction suggested in #192203 to clean up `nvvm.barrier`:
- Splits the reduction form into its own op, `nvvm.barrier.reduction`, so that each op has a single clear meaning and the structure is reflected directly in the IR.
- `barrierId` is modeled as `DefaultValuedAttr<I32Attr, "0">` rather than an `Optional<I32>` SSA value. PTX requires the barrier id to be a 4-bit immediate (`0..15`), so an attribute better matches the underlying constraint.
- Both ops now share a range verifier that rejects `barrierId` values outside `[0, 15]`.
A follow-up PR will add the `aligned` attribute on top of this split.
(I am not able to merge the PR myself; could someone please merge it for me once it is approved?)
From cf7c18457065c851a98046139a1187579505b590 Mon Sep 17 00:00:00 2001
From: Yuansui Xu <xuyuansui at outlook.com>
Date: Sun, 24 May 2026 02:54:17 -0500
Subject: [PATCH] split NVVM_BarrierOp
---
.../Optimizer/Builder/CUDAIntrinsicCall.cpp | 21 ++---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 78 ++++++++++++-------
.../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 44 +++++++++--
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 73 ++++++++---------
.../Conversion/GPUToNVVM/gpu-to-nvvm.mlir | 17 +---
mlir/test/Dialect/LLVMIR/invalid.mlir | 16 ++++
mlir/test/Dialect/LLVMIR/nvvm.mlir | 32 ++++++--
mlir/test/Target/LLVMIR/nvvm/barrier.mlir | 35 +++++----
mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 7 --
mlir/test/python/dialects/nvvm.py | 18 ++---
10 files changed, 205 insertions(+), 136 deletions(-)
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index bc95d7d2893a7..f7402b1730b7d 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1331,7 +1331,8 @@ CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
// SYNCTHREADS
void CUDAIntrinsicLibrary::genSyncThreads(
llvm::ArrayRef<fir::ExtendedValue> args) {
- mlir::NVVM::BarrierOp::create(builder, loc);
+ mlir::NVVM::BarrierOp::create(builder, loc, /*barrierId=*/0,
+ /*numberOfThreads=*/mlir::Value{});
}
// SYNCTHREADS_AND
@@ -1339,12 +1340,12 @@ mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
- return mlir::NVVM::BarrierOp::create(
- builder, loc, resultType, {}, {},
+ return mlir::NVVM::BarrierReductionOp::create(
+ builder, loc, resultType, /*barrierId=*/0,
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::AND),
arg)
- .getResult(0);
+ .getResult();
}
// SYNCTHREADS_COUNT
@@ -1352,12 +1353,12 @@ mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
- return mlir::NVVM::BarrierOp::create(
- builder, loc, resultType, {}, {},
+ return mlir::NVVM::BarrierReductionOp::create(
+ builder, loc, resultType, /*barrierId=*/0,
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::POPC),
arg)
- .getResult(0);
+ .getResult();
}
// SYNCTHREADS_OR
@@ -1365,12 +1366,12 @@ mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
- return mlir::NVVM::BarrierOp::create(
- builder, loc, resultType, {}, {},
+ return mlir::NVVM::BarrierReductionOp::create(
+ builder, loc, resultType, /*barrierId=*/0,
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::OR),
arg)
- .getResult(0);
+ .getResult();
}
// SYNCWARP
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 030c33526b16a..80a7559fb0444 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1123,24 +1123,23 @@ def BarrierReductionAttr
let assemblyFormat = "`<` $value `>`";
}
-def NVVM_BarrierOp : NVVM_SingleResultIntrinsicOp<"barrier",
- [AttrSizedOperandSegments, InferTypeOpAdaptorWithIsCompatible]> {
+def NVVM_BarrierOp : NVVM_VoidIntrinsicOp<"barrier"> {
let summary = "CTA Barrier Synchronization Op";
let description = [{
The `nvvm.barrier` operation performs barrier synchronization and communication
within a CTA (Cooperative Thread Array). It causes executing threads to wait for
all non-exited threads participating in the barrier to arrive.
- The operation takes two optional operands:
+ The operation takes the following operands and attributes:
- `barrierId`: Specifies a logical barrier resource with value 0 through 15.
Each CTA instance has sixteen barriers numbered 0..15. Defaults to 0 if not specified.
- `numberOfThreads`: Specifies the number of threads participating in the barrier.
When specified, the value must be a multiple of the warp size. If not specified,
all threads in the CTA participate in the barrier.
- - `reductionOp`: specifies the reduction operation (`popc`, `and`, `or`).
- - `reductionPredicate`: specifies the predicate to be used with the
- `reductionOp`.
+
+ Reduction variants of the barrier instruction are modeled by the
+ `nvvm.barrier.reduction` op.
The barrier operation guarantees that when the barrier completes, prior memory
accesses requested by participating threads are performed relative to all threads
@@ -1157,32 +1156,57 @@ def NVVM_BarrierOp : NVVM_SingleResultIntrinsicOp<"barrier",
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
}];
- let arguments = (ins Optional<I32>:$barrierId, Optional<I32>:$numberOfThreads,
- OptionalAttr<BarrierReductionAttr>:$reductionOp,
- Optional<I32>:$reductionPredicate);
- string llvmBuilder = [{
- auto [id, args] = NVVM::BarrierOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- if ($reductionOp)
- $res = builder.CreateZExt(createIntrinsicCall(builder, id, args),
- builder.getInt32Ty());
- else
- createIntrinsicCall(builder, id, args);
- }];
- let results = (outs Optional<I32>:$res);
+ let arguments = (ins
+ DefaultValuedAttr<I32Attr, "0">:$barrierId,
+ Optional<I32>:$numberOfThreads);
+
+ let assemblyFormat =
+ "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? "
+ "attr-dict";
let hasVerifier = 1;
+}
+
+def NVVM_BarrierReductionOp :
+ NVVM_IntrinsicLoweringOp<"barrier.reduction"> {
+ let summary = "CTA Barrier Reduction Op";
+ let description = [{
+ The `nvvm.barrier.reduction` operation performs barrier synchronization with a
+ reduction across the per-thread predicates contributed by participating threads
+ in a CTA.
+
+ - `barrierId`: Specifies a logical barrier resource with value 0 through 15.
+ Defaults to 0.
+ - `reductionOp`: The reduction kind (`popc`, `and`, `or`) applied across the
+ per-thread predicates.
+ - `reductionPredicate`: The per-thread i32 predicate. It is compared against
+ zero to form the i1 value fed into the reduction.
+
+ The result is the i32 reduction value computed across all threads
+ participating in the barrier. This op always lowers to the aligned form of
+ the `@llvm.nvvm.barrier.cta.red.*` intrinsic family.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+ }];
+
+ let arguments = (ins
+ DefaultValuedAttr<I32Attr, "0">:$barrierId,
+ BarrierReductionAttr:$reductionOp,
+ I32:$reductionPredicate);
+ let results = (outs I32:$res);
let assemblyFormat =
- "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? "
- "(qualified($reductionOp)^ $reductionPredicate)? (`->` type($res)^)? attr-dict";
+ "(`id` `=` $barrierId^)? qualified($reductionOp) $reductionPredicate "
+ "`->` type($res) attr-dict";
- let builders = [OpBuilder<(ins), [{
- return build($_builder, $_state, TypeRange{}, Value{}, Value{}, {}, Value{});
- }]>,
- OpBuilder<(ins "Value":$barrierId), [{
- return build($_builder, $_state, TypeRange{}, barrierId, Value{}, {}, Value{});
- }]>];
+ string llvmBuilder = [{
+ auto [id, args] = NVVM::BarrierReductionOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ $res = builder.CreateZExt(createIntrinsicCall(builder, id, args),
+ builder.getInt32Ty());
+ }];
+
+ let hasVerifier = 1;
}
def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index dab68fd734236..e03b37a1c0c61 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -412,6 +412,36 @@ struct AssertOpToAssertfailLowering
}
};
+/// Follows the SSA chain `insertvalue[0] -> load -> addressof -> global`
+/// to recover the integer initializer behind field [0] of `namedBarrier`,
+/// since `nvvm.barrier` carries `barrierId` as an `IntegerAttr`.
+static FailureOr<uint32_t>
+extractStaticNamedBarrierId(Operation *contextOp, Value namedBarrier) {
+ auto insertOp = namedBarrier.getDefiningOp<LLVM::InsertValueOp>();
+ while (insertOp && insertOp.getPosition() != ArrayRef<int64_t>{0})
+ insertOp = insertOp.getContainer().getDefiningOp<LLVM::InsertValueOp>();
+ if (!insertOp)
+ return failure();
+ auto loadOp = insertOp.getValue().getDefiningOp<LLVM::LoadOp>();
+ if (!loadOp)
+ return failure();
+ auto addrOf = loadOp.getAddr().getDefiningOp<LLVM::AddressOfOp>();
+ if (!addrOf)
+ return failure();
+ Operation *symbolTableOp =
+ contextOp->getParentWithTrait<OpTrait::SymbolTable>();
+ if (!symbolTableOp)
+ return failure();
+ auto globalOp = dyn_cast_or_null<LLVM::GlobalOp>(
+ SymbolTable::lookupSymbolIn(symbolTableOp, addrOf.getGlobalNameAttr()));
+ if (!globalOp)
+ return failure();
+ auto initAttr = dyn_cast_or_null<IntegerAttr>(globalOp.getValueAttr());
+ if (!initAttr)
+ return failure();
+ return static_cast<uint32_t>(initAttr.getInt());
+}
+
struct GPUBarrierOpToNVVMLowering final
: public ConvertOpToLLVMPattern<gpu::BarrierOp> {
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
@@ -421,12 +451,15 @@ struct GPUBarrierOpToNVVMLowering final
ConversionPatternRewriter &rewriter) const override {
if (Value namedBarrier = adaptor.getNamedBarrier()) {
Location loc = op.getLoc();
- Value barrierId =
- LLVM::ExtractValueOp::create(rewriter, loc, namedBarrier, 0);
+ FailureOr<uint32_t> barrierId =
+ extractStaticNamedBarrierId(op, namedBarrier);
+ if (failed(barrierId))
+ return rewriter.notifyMatchFailure(
+ op, "could not recover the static barrier id behind the named "
+ "barrier handle");
Value numberOfThreads =
LLVM::ExtractValueOp::create(rewriter, loc, namedBarrier, 1);
- NVVM::BarrierOp::create(rewriter, loc, barrierId, numberOfThreads,
- NVVM::BarrierReductionAttr{}, Value{});
+ NVVM::BarrierOp::create(rewriter, loc, *barrierId, numberOfThreads);
rewriter.eraseOp(op);
return success();
}
@@ -434,7 +467,8 @@ struct GPUBarrierOpToNVVMLowering final
gpu::BarrierScope scope = op.getScope();
switch (scope) {
case gpu::BarrierScope::Workgroup:
- rewriter.replaceOpWithNewOp<NVVM::BarrierOp>(op);
+ rewriter.replaceOpWithNewOp<NVVM::BarrierOp>(op, /*barrierId=*/0,
+ /*numberOfThreads=*/Value{});
return success();
case gpu::BarrierScope::Subgroup: {
// Emit __syncwarp(0xFFFFFFFF) for full-warp sync.
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index aa9e05013eaed..29f969874c591 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -2926,32 +2926,22 @@ LogicalResult NVVM::SetMaxRegisterOp::verify() {
return success();
}
-LogicalResult NVVM::BarrierOp::verify() {
- if (getNumberOfThreads() && !getBarrierId())
- return emitOpError(
- "barrier id is missing, it should be set between 0 to 15");
-
- if (getBarrierId() && (getReductionOp() || getReductionPredicate()))
- return emitOpError("reduction are only available when id is 0");
-
- if ((getReductionOp() && !getReductionPredicate()) ||
- (!getReductionOp() && getReductionPredicate()))
- return emitOpError("reduction predicate and reduction operation must be "
- "specified together");
-
+/// Common verifier for `nvvm.barrier` and `nvvm.barrier.reduction`: PTX
+/// restricts the logical barrier resource id to the 4-bit range 0..15.
+template <typename BarrierLikeOp>
+static LogicalResult verifyBarrierIdRange(BarrierLikeOp op) {
+ if (op.getBarrierId() > 15)
+ return op.emitOpError("barrier id must be in the range [0, 15], got ")
+ << op.getBarrierId();
return success();
}
-LogicalResult BarrierOp::inferReturnTypes(
- MLIRContext *context, std::optional<Location> location,
- BarrierOp::Adaptor adaptor, SmallVectorImpl<Type> &inferredReturnTypes) {
- if (adaptor.getReductionOp())
- inferredReturnTypes.push_back(IntegerType::get(context, 32));
- return success();
+LogicalResult NVVM::BarrierOp::verify() {
+ return verifyBarrierIdRange(*this);
}
-bool BarrierOp::isCompatibleReturnTypes(TypeRange l, TypeRange r) {
- return isCompatibleReturnTypesOptionalResult(l, r);
+LogicalResult NVVM::BarrierReductionOp::verify() {
+ return verifyBarrierIdRange(*this);
}
LogicalResult NVVM::Tcgen05CpOp::verify() {
@@ -3456,32 +3446,37 @@ void SubFOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
mlir::NVVM::IDArgPair NVVM::BarrierOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::BarrierOp>(op);
- llvm::Value *barrierId = thisOp.getBarrierId()
- ? mt.lookupValue(thisOp.getBarrierId())
- : builder.getInt32(0);
+ llvm::SmallVector<llvm::Value *> args = {
+ builder.getInt32(thisOp.getBarrierId())};
llvm::Intrinsic::ID id;
- llvm::SmallVector<llvm::Value *> args = {barrierId};
if (thisOp.getNumberOfThreads()) {
id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_count;
args.push_back(mt.lookupValue(thisOp.getNumberOfThreads()));
- } else if (thisOp.getReductionOp()) {
- switch (*thisOp.getReductionOp()) {
- case NVVM::BarrierReduction::AND:
- id = llvm::Intrinsic::nvvm_barrier_cta_red_and_aligned_all;
- break;
- case NVVM::BarrierReduction::OR:
- id = llvm::Intrinsic::nvvm_barrier_cta_red_or_aligned_all;
- break;
- case NVVM::BarrierReduction::POPC:
- id = llvm::Intrinsic::nvvm_barrier_cta_red_popc_aligned_all;
- break;
- }
- args.push_back(builder.CreateICmpNE(
- mt.lookupValue(thisOp.getReductionPredicate()), builder.getInt32(0)));
} else {
id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all;
}
+ return {id, std::move(args)};
+}
+mlir::NVVM::IDArgPair NVVM::BarrierReductionOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::BarrierReductionOp>(op);
+ llvm::Intrinsic::ID id;
+ switch (thisOp.getReductionOp()) {
+ case NVVM::BarrierReduction::AND:
+ id = llvm::Intrinsic::nvvm_barrier_cta_red_and_aligned_all;
+ break;
+ case NVVM::BarrierReduction::OR:
+ id = llvm::Intrinsic::nvvm_barrier_cta_red_or_aligned_all;
+ break;
+ case NVVM::BarrierReduction::POPC:
+ id = llvm::Intrinsic::nvvm_barrier_cta_red_popc_aligned_all;
+ break;
+ }
+ llvm::SmallVector<llvm::Value *> args = {
+ builder.getInt32(thisOp.getBarrierId()),
+ builder.CreateICmpNE(mt.lookupValue(thisOp.getReductionPredicate()),
+ builder.getInt32(0))};
return {id, std::move(args)};
}
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index b96069ac41a44..6086b8fae84c1 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -218,27 +218,14 @@ gpu.module @test_module_5 {
// CHECK: %[[DESC4:.*]] = llvm.insertvalue %[[ID1]], %[[DESC3]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[DESC5:.*]] = llvm.insertvalue %[[THREADS1]], %[[DESC4]][1] : !llvm.struct<(i32, i32)>
%nb1 = gpu.initialize_named_barrier %c2 : i32 -> !gpu.named_barrier
- // CHECK: %[[BARRIER_ID0:.*]] = llvm.extractvalue %[[DESC2]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[BARRIER_THREADS0:.*]] = llvm.extractvalue %[[DESC2]][1] : !llvm.struct<(i32, i32)>
- // CHECK: nvvm.barrier id = %[[BARRIER_ID0]] number_of_threads = %[[BARRIER_THREADS0]]
+ // CHECK: nvvm.barrier id = 1 number_of_threads = %[[BARRIER_THREADS0]]
gpu.barrier named(%nb0 : !gpu.named_barrier)
- // CHECK: %[[BARRIER_ID1:.*]] = llvm.extractvalue %[[DESC5]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[BARRIER_THREADS1:.*]] = llvm.extractvalue %[[DESC5]][1] : !llvm.struct<(i32, i32)>
- // CHECK: nvvm.barrier id = %[[BARRIER_ID1]] number_of_threads = %[[BARRIER_THREADS1]]
+ // CHECK: nvvm.barrier id = 2 number_of_threads = %[[BARRIER_THREADS1]]
gpu.barrier named(%nb1 : !gpu.named_barrier)
func.return
}
-
- // CHECK-LABEL: func @gpu_named_barrier_arg
- // CHECK-SAME: (%[[NB:.*]]: !llvm.struct<(i32, i32)>)
- func.func @gpu_named_barrier_arg(%nb : !gpu.named_barrier) {
- // CHECK: %[[BARRIER_ID:.*]] = llvm.extractvalue %[[NB]][0] : !llvm.struct<(i32, i32)>
- // CHECK: %[[BARRIER_THREADS:.*]] = llvm.extractvalue %[[NB]][1] : !llvm.struct<(i32, i32)>
- // CHECK: nvvm.barrier id = %[[BARRIER_ID]] number_of_threads = %[[BARRIER_THREADS]]
- gpu.barrier named(%nb : !gpu.named_barrier)
- func.return
- }
-
// CHECK: llvm.mlir.global internal constant @[[$NB0]](1 : i32) {addr_space = 0 : i32} : i32
// CHECK: llvm.mlir.global internal constant @[[$NB1]](2 : i32) {addr_space = 0 : i32} : i32
}
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index e80094df1eed2..3b89e6a508ec7 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -2133,3 +2133,19 @@ func.func @nvvm_read_sreg_clock64_wrong_type() {
%0 = nvvm.read.ptx.sreg.clock64 : i32
return
}
+
+// -----
+
+func.func @nvvm_barrier_id_out_of_range() {
+ // expected-error@+1 {{'nvvm.barrier' op barrier id must be in the range [0, 15], got 16}}
+ nvvm.barrier id = 16
+ return
+}
+
+// -----
+
+func.func @nvvm_barrier_reduction_id_out_of_range(%pred : i32) {
+ // expected-error@+1 {{'nvvm.barrier.reduction' op barrier id must be in the range [0, 15], got 42}}
+ %0 = nvvm.barrier.reduction id = 42 #nvvm.reduction<and> %pred -> i32
+ return
+}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index e3a98cc9cfc34..7dd43e137a17b 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -37,14 +37,30 @@ func.func @nvvm_rcp(%arg0: f32) -> f32 {
}
// CHECK-LABEL: @llvm_nvvm_barrier
-// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
-llvm.func @llvm_nvvm_barrier(%barId : i32, %numberOfThreads : i32) {
- // CHECK: nvvm.barrier
- nvvm.barrier
- // CHECK: nvvm.barrier id = %[[barId]]
- nvvm.barrier id = %barId
- // CHECK: nvvm.barrier id = %[[barId]] number_of_threads = %[[numberOfThreads]]
- nvvm.barrier id = %barId number_of_threads = %numberOfThreads
+// CHECK-SAME: (%[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier(%numberOfThreads : i32) {
+ // CHECK: nvvm.barrier{{ *$}}
+ nvvm.barrier
+ // CHECK: nvvm.barrier id = 5
+ nvvm.barrier id = 5
+ // CHECK: nvvm.barrier id = 5 number_of_threads = %[[numberOfThreads]]
+ nvvm.barrier id = 5 number_of_threads = %numberOfThreads
+ // CHECK: nvvm.barrier number_of_threads = %[[numberOfThreads]]
+ nvvm.barrier number_of_threads = %numberOfThreads
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_barrier_reduction
+// CHECK-SAME: (%[[pred:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_reduction(%pred : i32) {
+ // CHECK: nvvm.barrier.reduction #nvvm.reduction<and> %[[pred]] -> i32
+ %0 = nvvm.barrier.reduction #nvvm.reduction<and> %pred -> i32
+ // CHECK: nvvm.barrier.reduction #nvvm.reduction<or> %[[pred]] -> i32
+ %1 = nvvm.barrier.reduction #nvvm.reduction<or> %pred -> i32
+ // CHECK: nvvm.barrier.reduction #nvvm.reduction<popc> %[[pred]] -> i32
+ %2 = nvvm.barrier.reduction #nvvm.reduction<popc> %pred -> i32
+ // CHECK: nvvm.barrier.reduction id = 3 #nvvm.reduction<and> %[[pred]] -> i32
+ %3 = nvvm.barrier.reduction id = 3 #nvvm.reduction<and> %pred -> i32
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvm/barrier.mlir b/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
index 7e654eb8dc572..9b918ad019330 100644
--- a/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
@@ -2,29 +2,36 @@
// RUN: mlir-opt %s | mlir-opt | FileCheck %s
// LLVM-LABEL: @llvm_nvvm_barrier(
-// LLVM-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]], i32 %[[redOperand:.*]])
-llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32, %redOperand : i32) {
+// LLVM-SAME: i32 %[[numThreads:.*]], i32 %[[redOperand:.*]])
+llvm.func @llvm_nvvm_barrier(%numberOfThreads : i32, %redOperand : i32) {
// LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
// CHECK: nvvm.barrier
nvvm.barrier
- // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %[[barId]])
- // CHECK: nvvm.barrier id = %{{.*}}
- nvvm.barrier id = %barID
- // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %[[barId]], i32 %[[numThreads]])
- // CHECK: nvvm.barrier id = %{{.*}} number_of_threads = %{{.*}}
- nvvm.barrier id = %barID number_of_threads = %numberOfThreads
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 5)
+ // CHECK: nvvm.barrier id = 5
+ nvvm.barrier id = 5
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 5, i32 %[[numThreads]])
+ // CHECK: nvvm.barrier id = 5 number_of_threads = %{{.*}}
+ nvvm.barrier id = 5 number_of_threads = %numberOfThreads
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 0, i32 %[[numThreads]])
+ // CHECK: nvvm.barrier number_of_threads = %{{.*}}
+ nvvm.barrier number_of_threads = %numberOfThreads
// LLVM: %[[redOperandCmp1:.*]] = icmp ne i32 %[[redOperand]], 0
// LLVM: %{{.*}} = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 0, i1 %[[redOperandCmp1]])
- // CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<and> %{{.*}} -> i32
- %0 = nvvm.barrier #nvvm.reduction<and> %redOperand -> i32
+ // CHECK: %{{.*}} = nvvm.barrier.reduction #nvvm.reduction<and> %{{.*}} -> i32
+ %0 = nvvm.barrier.reduction #nvvm.reduction<and> %redOperand -> i32
// LLVM: %[[redOperandCmp2:.*]] = icmp ne i32 %[[redOperand]], 0
// LLVM: %{{.*}} = call i1 @llvm.nvvm.barrier.cta.red.or.aligned.all(i32 0, i1 %[[redOperandCmp2]])
- // CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<or> %{{.*}} -> i32
- %1 = nvvm.barrier #nvvm.reduction<or> %redOperand -> i32
+ // CHECK: %{{.*}} = nvvm.barrier.reduction #nvvm.reduction<or> %{{.*}} -> i32
+ %1 = nvvm.barrier.reduction #nvvm.reduction<or> %redOperand -> i32
// LLVM: %[[redOperandCmp3:.*]] = icmp ne i32 %[[redOperand]], 0
// LLVM: %{{.*}} = call i32 @llvm.nvvm.barrier.cta.red.popc.aligned.all(i32 0, i1 %[[redOperandCmp3]])
- // CHECK: %{{.*}} = nvvm.barrier #nvvm.reduction<popc> %{{.*}} -> i32
- %2 = nvvm.barrier #nvvm.reduction<popc> %redOperand -> i32
+ // CHECK: %{{.*}} = nvvm.barrier.reduction #nvvm.reduction<popc> %{{.*}} -> i32
+ %2 = nvvm.barrier.reduction #nvvm.reduction<popc> %redOperand -> i32
+ // LLVM: %[[redOperandCmp4:.*]] = icmp ne i32 %[[redOperand]], 0
+ // LLVM: %{{.*}} = call i1 @llvm.nvvm.barrier.cta.red.and.aligned.all(i32 3, i1 %[[redOperandCmp4]])
+ // CHECK: %{{.*}} = nvvm.barrier.reduction id = 3 #nvvm.reduction<and> %{{.*}} -> i32
+ %3 = nvvm.barrier.reduction id = 3 #nvvm.reduction<and> %redOperand -> i32
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 6e96e918d5f0d..82e7373a40baa 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -1,12 +1,5 @@
// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
-llvm.func @kernel_func(%numberOfThreads : i32) {
- // expected-error @below {{'nvvm.barrier' op barrier id is missing, it should be set between 0 to 15}}
- nvvm.barrier number_of_threads = %numberOfThreads
-}
-
-// -----
-
// expected-error @below {{'"nvvm.minctasm"' attribute must be integer constant}}
llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.minctasm = "foo"} {
llvm.return
diff --git a/mlir/test/python/dialects/nvvm.py b/mlir/test/python/dialects/nvvm.py
index d727a39e956e0..bb8dd68deebad 100644
--- a/mlir/test/python/dialects/nvvm.py
+++ b/mlir/test/python/dialects/nvvm.py
@@ -130,12 +130,8 @@ def test_barriers():
@func.FuncOp.from_py_func(i32, i32, f32)
def barriers(mask, vi32, vf32):
- c0 = arith.constant(T.i32(), 0)
cffff = arith.constant(T.i32(), 0xFFFF)
- nvvm.barrier(
- barrier_id=c0,
- number_of_threads=cffff,
- )
+ nvvm.barrier(number_of_threads=cffff)
pred = arith.constant(T.i32(), 1)
for reduction in (
@@ -143,7 +139,8 @@ def barriers(mask, vi32, vf32):
nvvm.BarrierReduction.OR,
nvvm.BarrierReduction.POPC,
):
- pred = nvvm.barrier(
+ pred = nvvm.barrier_reduction(
+ res=T.i32(),
reduction_op=reduction,
reduction_predicate=pred,
)
@@ -163,13 +160,12 @@ def barriers(mask, vi32, vf32):
# CHECK-LABEL: func.func @barriers(
# CHECK: %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: f32) -> i32 {
-# CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
# CHECK: %[[CONSTANT_1:.*]] = arith.constant 65535 : i32
-# CHECK: nvvm.barrier id = %[[CONSTANT_0]] number_of_threads = %[[CONSTANT_1]]
+# CHECK: nvvm.barrier number_of_threads = %[[CONSTANT_1]]
# CHECK: %[[PRED:.*]] = arith.constant 1 : i32
-# CHECK: %[[BARRIER_1:.*]] = nvvm.barrier #nvvm.reduction<and> %[[PRED]] -> i32
-# CHECK: %[[BARRIER_2:.*]] = nvvm.barrier #nvvm.reduction<or> %[[BARRIER_1]] -> i32
-# CHECK: %[[BARRIER_3:.*]] = nvvm.barrier #nvvm.reduction<popc> %[[BARRIER_2]] -> i32
+# CHECK: %[[BARRIER_1:.*]] = nvvm.barrier.reduction #nvvm.reduction<and> %[[PRED]] -> i32
+# CHECK: %[[BARRIER_2:.*]] = nvvm.barrier.reduction #nvvm.reduction<or> %[[BARRIER_1]] -> i32
+# CHECK: %[[BARRIER_3:.*]] = nvvm.barrier.reduction #nvvm.reduction<popc> %[[BARRIER_2]] -> i32
# CHECK: nvvm.barrier
# CHECK: nvvm.bar.warp.sync %[[ARG0]] : i32
# CHECK: nvvm.cluster.arrive
More information about the Mlir-commits
mailing list