[flang-commits] [flang] [mlir] [MLIR][NVVM] Split nvvm.barrier into nvvm.barrier and nvvm.barrier.reduction (PR #199404)
via flang-commits
flang-commits at lists.llvm.org
Sun May 24 01:26:56 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: xys-syx
<details>
<summary>Changes</summary>
This PR is the second step of the three-PR plan following the direction suggested in #<!-- -->192203 to clean up `nvvm.barrier`:
- Splits the reduction form into its own op, `nvvm.barrier.reduction`, so that each op has a single clear meaning and the structure is reflected directly in the IR.
- `barrierId` is modeled as `DefaultValuedAttr<I32Attr, "0">` rather than an `Optional<I32>` SSA value. PTX requires the barrier id to be a 4-bit immediate (`0..15`), so an attribute better matches the underlying constraint.
- Both ops now share a range verifier that rejects `barrierId` values outside `[0, 15]`.
A follow-up PR will add the `aligned` attribute on top of this split.
(I am not able to merge the PR myself; please help me merge it.)
---
Patch is 26.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199404.diff
10 Files Affected:
- (modified) flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp (+11-10)
- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+51-27)
- (modified) mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp (+39-5)
- (modified) mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (+34-39)
- (modified) mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir (+2-15)
- (modified) mlir/test/Dialect/LLVMIR/invalid.mlir (+16)
- (modified) mlir/test/Dialect/LLVMIR/nvvm.mlir (+24-8)
- (modified) mlir/test/Target/LLVMIR/nvvm/barrier.mlir (+21-14)
- (modified) mlir/test/Target/LLVMIR/nvvmir-invalid.mlir (-7)
- (modified) mlir/test/python/dialects/nvvm.py (+7-11)
``````````diff
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index bc95d7d2893a7..f7402b1730b7d 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1331,7 +1331,8 @@ CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
// SYNCTHREADS
void CUDAIntrinsicLibrary::genSyncThreads(
llvm::ArrayRef<fir::ExtendedValue> args) {
- mlir::NVVM::BarrierOp::create(builder, loc);
+ mlir::NVVM::BarrierOp::create(builder, loc, /*barrierId=*/0,
+ /*numberOfThreads=*/mlir::Value{});
}
// SYNCTHREADS_AND
@@ -1339,12 +1340,12 @@ mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
- return mlir::NVVM::BarrierOp::create(
- builder, loc, resultType, {}, {},
+ return mlir::NVVM::BarrierReductionOp::create(
+ builder, loc, resultType, /*barrierId=*/0,
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::AND),
arg)
- .getResult(0);
+ .getResult();
}
// SYNCTHREADS_COUNT
@@ -1352,12 +1353,12 @@ mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
- return mlir::NVVM::BarrierOp::create(
- builder, loc, resultType, {}, {},
+ return mlir::NVVM::BarrierReductionOp::create(
+ builder, loc, resultType, /*barrierId=*/0,
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::POPC),
arg)
- .getResult(0);
+ .getResult();
}
// SYNCTHREADS_OR
@@ -1365,12 +1366,12 @@ mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
- return mlir::NVVM::BarrierOp::create(
- builder, loc, resultType, {}, {},
+ return mlir::NVVM::BarrierReductionOp::create(
+ builder, loc, resultType, /*barrierId=*/0,
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::OR),
arg)
- .getResult(0);
+ .getResult();
}
// SYNCWARP
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 030c33526b16a..80a7559fb0444 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1123,24 +1123,23 @@ def BarrierReductionAttr
let assemblyFormat = "`<` $value `>`";
}
-def NVVM_BarrierOp : NVVM_SingleResultIntrinsicOp<"barrier",
- [AttrSizedOperandSegments, InferTypeOpAdaptorWithIsCompatible]> {
+def NVVM_BarrierOp : NVVM_VoidIntrinsicOp<"barrier"> {
let summary = "CTA Barrier Synchronization Op";
let description = [{
The `nvvm.barrier` operation performs barrier synchronization and communication
within a CTA (Cooperative Thread Array). It causes executing threads to wait for
all non-exited threads participating in the barrier to arrive.
- The operation takes two optional operands:
+ The operation takes the following operands and attributes:
- `barrierId`: Specifies a logical barrier resource with value 0 through 15.
Each CTA instance has sixteen barriers numbered 0..15. Defaults to 0 if not specified.
- `numberOfThreads`: Specifies the number of threads participating in the barrier.
When specified, the value must be a multiple of the warp size. If not specified,
all threads in the CTA participate in the barrier.
- - `reductionOp`: specifies the reduction operation (`popc`, `and`, `or`).
- - `reductionPredicate`: specifies the predicate to be used with the
- `reductionOp`.
+
+ Reduction variants of the barrier instruction are modeled by the
+ `nvvm.barrier.reduction` op.
The barrier operation guarantees that when the barrier completes, prior memory
accesses requested by participating threads are performed relative to all threads
@@ -1157,32 +1156,57 @@ def NVVM_BarrierOp : NVVM_SingleResultIntrinsicOp<"barrier",
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
}];
- let arguments = (ins Optional<I32>:$barrierId, Optional<I32>:$numberOfThreads,
- OptionalAttr<BarrierReductionAttr>:$reductionOp,
- Optional<I32>:$reductionPredicate);
- string llvmBuilder = [{
- auto [id, args] = NVVM::BarrierOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- if ($reductionOp)
- $res = builder.CreateZExt(createIntrinsicCall(builder, id, args),
- builder.getInt32Ty());
- else
- createIntrinsicCall(builder, id, args);
- }];
- let results = (outs Optional<I32>:$res);
+ let arguments = (ins
+ DefaultValuedAttr<I32Attr, "0">:$barrierId,
+ Optional<I32>:$numberOfThreads);
+
+ let assemblyFormat =
+ "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? "
+ "attr-dict";
let hasVerifier = 1;
+}
+
+def NVVM_BarrierReductionOp :
+ NVVM_IntrinsicLoweringOp<"barrier.reduction"> {
+ let summary = "CTA Barrier Reduction Op";
+ let description = [{
+ The `nvvm.barrier.reduction` operation performs barrier synchronization with a
+ reduction across the per-thread predicates contributed by participating threads
+ in a CTA.
+
+ - `barrierId`: Specifies a logical barrier resource with value 0 through 15.
+ Defaults to 0.
+ - `reductionOp`: The reduction kind (`popc`, `and`, `or`) applied across the
+ per-thread predicates.
+ - `reductionPredicate`: The per-thread i32 predicate. It is compared against
+ zero to form the i1 value fed into the reduction.
+
+ The result is the i32 reduction value computed across all threads
+ participating in the barrier. This op always lowers to the aligned form of
+ the `@llvm.nvvm.barrier.cta.red.*` intrinsic family.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+ }];
+
+ let arguments = (ins
+ DefaultValuedAttr<I32Attr, "0">:$barrierId,
+ BarrierReductionAttr:$reductionOp,
+ I32:$reductionPredicate);
+ let results = (outs I32:$res);
let assemblyFormat =
- "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? "
- "(qualified($reductionOp)^ $reductionPredicate)? (`->` type($res)^)? attr-dict";
+ "(`id` `=` $barrierId^)? qualified($reductionOp) $reductionPredicate "
+ "`->` type($res) attr-dict";
- let builders = [OpBuilder<(ins), [{
- return build($_builder, $_state, TypeRange{}, Value{}, Value{}, {}, Value{});
- }]>,
- OpBuilder<(ins "Value":$barrierId), [{
- return build($_builder, $_state, TypeRange{}, barrierId, Value{}, {}, Value{});
- }]>];
+ string llvmBuilder = [{
+ auto [id, args] = NVVM::BarrierReductionOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ $res = builder.CreateZExt(createIntrinsicCall(builder, id, args),
+ builder.getInt32Ty());
+ }];
+
+ let hasVerifier = 1;
}
def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive">
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index dab68fd734236..e03b37a1c0c61 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -412,6 +412,36 @@ struct AssertOpToAssertfailLowering
}
};
+/// Follows the SSA chain `insertvalue[0] -> load -> addressof -> global`
+/// to recover the integer initializer behind field [0] of `namedBarrier`,
+/// since `nvvm.barrier` carries `barrierId` as an `IntegerAttr`.
+static FailureOr<uint32_t>
+extractStaticNamedBarrierId(Operation *contextOp, Value namedBarrier) {
+ auto insertOp = namedBarrier.getDefiningOp<LLVM::InsertValueOp>();
+ while (insertOp && insertOp.getPosition() != ArrayRef<int64_t>{0})
+ insertOp = insertOp.getContainer().getDefiningOp<LLVM::InsertValueOp>();
+ if (!insertOp)
+ return failure();
+ auto loadOp = insertOp.getValue().getDefiningOp<LLVM::LoadOp>();
+ if (!loadOp)
+ return failure();
+ auto addrOf = loadOp.getAddr().getDefiningOp<LLVM::AddressOfOp>();
+ if (!addrOf)
+ return failure();
+ Operation *symbolTableOp =
+ contextOp->getParentWithTrait<OpTrait::SymbolTable>();
+ if (!symbolTableOp)
+ return failure();
+ auto globalOp = dyn_cast_or_null<LLVM::GlobalOp>(
+ SymbolTable::lookupSymbolIn(symbolTableOp, addrOf.getGlobalNameAttr()));
+ if (!globalOp)
+ return failure();
+ auto initAttr = dyn_cast_or_null<IntegerAttr>(globalOp.getValueAttr());
+ if (!initAttr)
+ return failure();
+ return static_cast<uint32_t>(initAttr.getInt());
+}
+
struct GPUBarrierOpToNVVMLowering final
: public ConvertOpToLLVMPattern<gpu::BarrierOp> {
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
@@ -421,12 +451,15 @@ struct GPUBarrierOpToNVVMLowering final
ConversionPatternRewriter &rewriter) const override {
if (Value namedBarrier = adaptor.getNamedBarrier()) {
Location loc = op.getLoc();
- Value barrierId =
- LLVM::ExtractValueOp::create(rewriter, loc, namedBarrier, 0);
+ FailureOr<uint32_t> barrierId =
+ extractStaticNamedBarrierId(op, namedBarrier);
+ if (failed(barrierId))
+ return rewriter.notifyMatchFailure(
+ op, "could not recover the static barrier id behind the named "
+ "barrier handle");
Value numberOfThreads =
LLVM::ExtractValueOp::create(rewriter, loc, namedBarrier, 1);
- NVVM::BarrierOp::create(rewriter, loc, barrierId, numberOfThreads,
- NVVM::BarrierReductionAttr{}, Value{});
+ NVVM::BarrierOp::create(rewriter, loc, *barrierId, numberOfThreads);
rewriter.eraseOp(op);
return success();
}
@@ -434,7 +467,8 @@ struct GPUBarrierOpToNVVMLowering final
gpu::BarrierScope scope = op.getScope();
switch (scope) {
case gpu::BarrierScope::Workgroup:
- rewriter.replaceOpWithNewOp<NVVM::BarrierOp>(op);
+ rewriter.replaceOpWithNewOp<NVVM::BarrierOp>(op, /*barrierId=*/0,
+ /*numberOfThreads=*/Value{});
return success();
case gpu::BarrierScope::Subgroup: {
// Emit __syncwarp(0xFFFFFFFF) for full-warp sync.
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index aa9e05013eaed..29f969874c591 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -2926,32 +2926,22 @@ LogicalResult NVVM::SetMaxRegisterOp::verify() {
return success();
}
-LogicalResult NVVM::BarrierOp::verify() {
- if (getNumberOfThreads() && !getBarrierId())
- return emitOpError(
- "barrier id is missing, it should be set between 0 to 15");
-
- if (getBarrierId() && (getReductionOp() || getReductionPredicate()))
- return emitOpError("reduction are only available when id is 0");
-
- if ((getReductionOp() && !getReductionPredicate()) ||
- (!getReductionOp() && getReductionPredicate()))
- return emitOpError("reduction predicate and reduction operation must be "
- "specified together");
-
+/// Common verifier for `nvvm.barrier` and `nvvm.barrier.reduction`: PTX
+/// restricts the logical barrier resource id to the 4-bit range 0..15.
+template <typename BarrierLikeOp>
+static LogicalResult verifyBarrierIdRange(BarrierLikeOp op) {
+ if (op.getBarrierId() > 15)
+ return op.emitOpError("barrier id must be in the range [0, 15], got ")
+ << op.getBarrierId();
return success();
}
-LogicalResult BarrierOp::inferReturnTypes(
- MLIRContext *context, std::optional<Location> location,
- BarrierOp::Adaptor adaptor, SmallVectorImpl<Type> &inferredReturnTypes) {
- if (adaptor.getReductionOp())
- inferredReturnTypes.push_back(IntegerType::get(context, 32));
- return success();
+LogicalResult NVVM::BarrierOp::verify() {
+ return verifyBarrierIdRange(*this);
}
-bool BarrierOp::isCompatibleReturnTypes(TypeRange l, TypeRange r) {
- return isCompatibleReturnTypesOptionalResult(l, r);
+LogicalResult NVVM::BarrierReductionOp::verify() {
+ return verifyBarrierIdRange(*this);
}
LogicalResult NVVM::Tcgen05CpOp::verify() {
@@ -3456,32 +3446,37 @@ void SubFOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
mlir::NVVM::IDArgPair NVVM::BarrierOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::BarrierOp>(op);
- llvm::Value *barrierId = thisOp.getBarrierId()
- ? mt.lookupValue(thisOp.getBarrierId())
- : builder.getInt32(0);
+ llvm::SmallVector<llvm::Value *> args = {
+ builder.getInt32(thisOp.getBarrierId())};
llvm::Intrinsic::ID id;
- llvm::SmallVector<llvm::Value *> args = {barrierId};
if (thisOp.getNumberOfThreads()) {
id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_count;
args.push_back(mt.lookupValue(thisOp.getNumberOfThreads()));
- } else if (thisOp.getReductionOp()) {
- switch (*thisOp.getReductionOp()) {
- case NVVM::BarrierReduction::AND:
- id = llvm::Intrinsic::nvvm_barrier_cta_red_and_aligned_all;
- break;
- case NVVM::BarrierReduction::OR:
- id = llvm::Intrinsic::nvvm_barrier_cta_red_or_aligned_all;
- break;
- case NVVM::BarrierReduction::POPC:
- id = llvm::Intrinsic::nvvm_barrier_cta_red_popc_aligned_all;
- break;
- }
- args.push_back(builder.CreateICmpNE(
- mt.lookupValue(thisOp.getReductionPredicate()), builder.getInt32(0)));
} else {
id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all;
}
+ return {id, std::move(args)};
+}
+mlir::NVVM::IDArgPair NVVM::BarrierReductionOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::BarrierReductionOp>(op);
+ llvm::Intrinsic::ID id;
+ switch (thisOp.getReductionOp()) {
+ case NVVM::BarrierReduction::AND:
+ id = llvm::Intrinsic::nvvm_barrier_cta_red_and_aligned_all;
+ break;
+ case NVVM::BarrierReduction::OR:
+ id = llvm::Intrinsic::nvvm_barrier_cta_red_or_aligned_all;
+ break;
+ case NVVM::BarrierReduction::POPC:
+ id = llvm::Intrinsic::nvvm_barrier_cta_red_popc_aligned_all;
+ break;
+ }
+ llvm::SmallVector<llvm::Value *> args = {
+ builder.getInt32(thisOp.getBarrierId()),
+ builder.CreateICmpNE(mt.lookupValue(thisOp.getReductionPredicate()),
+ builder.getInt32(0))};
return {id, std::move(args)};
}
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index b96069ac41a44..6086b8fae84c1 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -218,27 +218,14 @@ gpu.module @test_module_5 {
// CHECK: %[[DESC4:.*]] = llvm.insertvalue %[[ID1]], %[[DESC3]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[DESC5:.*]] = llvm.insertvalue %[[THREADS1]], %[[DESC4]][1] : !llvm.struct<(i32, i32)>
%nb1 = gpu.initialize_named_barrier %c2 : i32 -> !gpu.named_barrier
- // CHECK: %[[BARRIER_ID0:.*]] = llvm.extractvalue %[[DESC2]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[BARRIER_THREADS0:.*]] = llvm.extractvalue %[[DESC2]][1] : !llvm.struct<(i32, i32)>
- // CHECK: nvvm.barrier id = %[[BARRIER_ID0]] number_of_threads = %[[BARRIER_THREADS0]]
+ // CHECK: nvvm.barrier id = 1 number_of_threads = %[[BARRIER_THREADS0]]
gpu.barrier named(%nb0 : !gpu.named_barrier)
- // CHECK: %[[BARRIER_ID1:.*]] = llvm.extractvalue %[[DESC5]][0] : !llvm.struct<(i32, i32)>
// CHECK: %[[BARRIER_THREADS1:.*]] = llvm.extractvalue %[[DESC5]][1] : !llvm.struct<(i32, i32)>
- // CHECK: nvvm.barrier id = %[[BARRIER_ID1]] number_of_threads = %[[BARRIER_THREADS1]]
+ // CHECK: nvvm.barrier id = 2 number_of_threads = %[[BARRIER_THREADS1]]
gpu.barrier named(%nb1 : !gpu.named_barrier)
func.return
}
-
- // CHECK-LABEL: func @gpu_named_barrier_arg
- // CHECK-SAME: (%[[NB:.*]]: !llvm.struct<(i32, i32)>)
- func.func @gpu_named_barrier_arg(%nb : !gpu.named_barrier) {
- // CHECK: %[[BARRIER_ID:.*]] = llvm.extractvalue %[[NB]][0] : !llvm.struct<(i32, i32)>
- // CHECK: %[[BARRIER_THREADS:.*]] = llvm.extractvalue %[[NB]][1] : !llvm.struct<(i32, i32)>
- // CHECK: nvvm.barrier id = %[[BARRIER_ID]] number_of_threads = %[[BARRIER_THREADS]]
- gpu.barrier named(%nb : !gpu.named_barrier)
- func.return
- }
-
// CHECK: llvm.mlir.global internal constant @[[$NB0]](1 : i32) {addr_space = 0 : i32} : i32
// CHECK: llvm.mlir.global internal constant @[[$NB1]](2 : i32) {addr_space = 0 : i32} : i32
}
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index e80094df1eed2..3b89e6a508ec7 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -2133,3 +2133,19 @@ func.func @nvvm_read_sreg_clock64_wrong_type() {
%0 = nvvm.read.ptx.sreg.clock64 : i32
return
}
+
+// -----
+
+func.func @nvvm_barrier_id_out_of_range() {
+ // expected-error at +1 {{'nvvm.barrier' op barrier id must be in the range [0, 15], got 16}}
+ nvvm.barrier id = 16
+ return
+}
+
+// -----
+
+func.func @nvvm_barrier_reduction_id_out_of_range(%pred : i32) {
+ // expected-error at +1 {{'nvvm.barrier.reduction' op barrier id must be in the range [0, 15], got 42}}
+ %0 = nvvm.barrier.reduction id = 42 #nvvm.reduction<and> %pred -> i32
+ return
+}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index e3a98cc9cfc34..7dd43e137a17b 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -37,14 +37,30 @@ func.func @nvvm_rcp(%arg0: f32) -> f32 {
}
// CHECK-LABEL: @llvm_nvvm_barrier
-// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
-llvm.func @llvm_nvvm_barrier(%barId : i32, %numberOfThreads : i32) {
- // CHECK: nvvm.barrier
- nvvm.barrier
- // CHECK: nvvm.barrier id = %[[barId]]
- nvvm.barrier id = %barId
- // CHECK: nvvm.barrier id = %[[barId]] number_of_threads = %[[numberOfThreads]]
- nvvm.barrier id = %barId number_of_threads = %numberOfThreads
+// CHECK-SAME: (%[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier(%numberOfThreads : i32) {
+ // CHECK: nvvm.barrier{{ *$}}
+ nvvm.barrier
+ // CHECK: nvvm.barrier id = 5
+ nvvm.barrier id = 5
+ // CHECK: nvvm.barrier id = 5 number_of_threads = %[[numberOfThreads]]
+ nvvm.barrier id = 5 number_of_threads = %numberOfThreads
+ // CHECK: nvvm.barrier number_of_threads = %[[numberOfThreads]]
+ nvvm.barrier number_of_threads = %numberOfThreads
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_barrier_reduction
+// CHECK-SAME: (%[[pred:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_reduction(%pred : i32) {
+ // CHECK: nvvm.barrier.reduction #nvvm.reduction<and> %[[pred]] -> i32
+ %0 = nvvm.barrier.reduction #nvvm.reduction<and> %pred -> i32
+ // CHECK: nvvm.barrier.reduction #nvvm.reduction<or> %[[pred]] -> i32
+ %1 = nvvm.barrier.reduction #nvvm.reduction<or> %pred -> i32
+ // CHECK: nvvm.barrier.reduction #nvvm.reduction<popc> %[[pred]] -> i32
+ %2 = nvvm.barrier.reduction #nvvm.reduction<popc> %pred -> i32
+ // CHECK: nvvm.barrier.reduction id = 3 #nvvm.reduction<and> %[[pred]] -> i32
+ %3 = nvvm.barrier.reduction id = 3 #nvvm.reduction<and> %pred -> i32
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvm/barrier.mlir b/mlir/test/Target/LLVMI...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/199404
More information about the flang-commits
mailing list