[Mlir-commits] [mlir] [milr][gpu] Make barrier elimination address-space aware (PR #178101)
Krzysztof Drewniak
llvmlistbot at llvm.org
Tue Jan 27 09:44:59 PST 2026
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/178101
>From 1886eb02c7e1ec5b93136f7f2d3c735976a261da Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 27 Jan 2026 01:49:01 +0000
Subject: [PATCH 1/3] [milr][gpu] Make barrier elimination address-space aware
Upgrade the barrier eliminiation pass to account for the address spaces
of accessed memory when deciding which barriers to eliminiate. In particular,
a loop that only reads and writes global memory that has a workgoup-memory-fencing
barrier inside of it will now have that barrier marked for elimiination, as the
global memory traffic is not being synchronized by the barrier.
The pass is also adjusted to ignore barriers whose memory fencing list is [],
as those do not synchronize memory and therefore the logic in this pass would
potentially incorrectly remove them after proving that fact.
---
.../GPU/Transforms/EliminateBarriers.cpp | 240 +++++++++++++-----
.../test/Dialect/GPU/barrier-elimination.mlir | 174 +++++++++++++
2 files changed, 345 insertions(+), 69 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
index 025d1acf8d6ba..90fe9ebf9ae1e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
@@ -78,14 +78,121 @@ static void addAllValuelessEffects(
effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Free>());
}
+/// Looks through known "view-like" ops to find the base memref.
+static Value getBase(Value v) {
+ while (true) {
+ Operation *definingOp = v.getDefiningOp();
+ if (!definingOp)
+ break;
+
+ bool shouldContinue = TypeSwitch<Operation *, bool>(v.getDefiningOp())
+ .Case<ViewLikeOpInterface>([&](auto op) {
+ v = op.getViewSource();
+ return true;
+ })
+ .Case<memref::TransposeOp>([&](auto op) {
+ v = op.getIn();
+ return true;
+ })
+ .Default(false);
+ if (!shouldContinue)
+ break;
+ }
+ return v;
+}
+
+/// Returns `true` if accesses to the given memory space could potentially be
+/// fenced by a barrier synchoronizing on the given `fencedAddressSpaces`. If
+/// the set of address spaces is not given, it is equal to all possible address
+/// spaces. Memory spaces that are not `#gpu.address_space` are deemed to
+/// overlap with all GPU address spaces.
+static bool isAddressSpacePotentiallyFenced(
+ Attribute memorySpace,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
+ if (!fencedAddressSpaces)
+ return true;
+
+ auto gpuMemSpace = dyn_cast_if_present<gpu::AddressSpaceAttr>(memorySpace);
+ if (!gpuMemSpace)
+ return true;
+
+ // Check if this GPU address space is in the fenced set.
+ return llvm::is_contained(*fencedAddressSpaces, gpuMemSpace);
+}
+
+/// Returns `true` if the effect operates on a memref whose memory space
+/// cannot be one of the given fenced address spaces. This will both look at the
+/// address space of the effect's operand and of the base memref for that
+/// operand so as to handle casts that add and remove address space information.
+static bool effectCannotAffectAddressSpaces(
+ const MemoryEffects::EffectInstance &effect,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
+ if (!fencedAddressSpaces)
+ return false;
+
+ Value value = effect.getValue();
+ if (!value)
+ return false;
+
+ auto mightMatch = [&](Value v) {
+ auto memrefType = dyn_cast<BaseMemRefType>(v.getType());
+ if (!memrefType)
+ return true;
+ return isAddressSpacePotentiallyFenced(memrefType.getMemorySpace(),
+ fencedAddressSpaces);
+ };
+
+ if (!mightMatch(value))
+ return true;
+
+ Value base = getBase(value);
+ return !mightMatch(base);
+}
+
+/// Returns `true` if `op` is a `BarrierOp` that fences any address spaces that
+/// could overlap with the given fenced address spaces.
+static bool isBarrierWithCommonFencedMemory(
+ Operation *op,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
+ auto barrier = dyn_cast<BarrierOp>(op);
+ if (!barrier)
+ return false;
+
+ std::optional<ArrayAttr> otherFencedSpaces = barrier.getAddressSpaces();
+ // Barriers with unspecified fencing fence everything.
+ if (!otherFencedSpaces)
+ return true;
+ // while barriers that fence nothing can't close off our search.
+ if (otherFencedSpaces->empty())
+ return false;
+
+ // If we fence all memory, we've got fencing in common with anything but the
+ // non-merory barrier.
+ if (!fencedAddressSpaces)
+ return true;
+
+ // Check if there's any intersection between the two sets.
+ for (Attribute ourSpace : *fencedAddressSpaces) {
+ if (llvm::is_contained(*otherFencedSpaces, ourSpace))
+ return true;
+ }
+
+ return false;
+}
+
/// Collect the memory effects of the given op in 'effects'. Returns 'true' if
/// it could extract the effect information from the op, otherwise returns
/// 'false' and conservatively populates the list with all possible effects
-/// associated with no particular value or symbol.
-static bool
-collectEffects(Operation *op,
- SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
- bool ignoreBarriers = true) {
+/// associated with no particular value or symbol. `fencedAddressSpaces` is,
+/// if given, the set of GPU memory spaces that are being synchronized by the
+/// barrier being syrchronized - memory operations where the value being
+/// impacted is known and either it or its base value have an address space that
+/// is known to be distinct from the ones being synchronized on will not be
+/// included in the effect set.
+static bool collectEffects(
+ Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+ bool ignoreBarriers = true) {
// Skip over barriers to avoid infinite recursion (those barriers would ask
// this barrier again).
if (ignoreBarriers && isa<BarrierOp>(op))
@@ -98,14 +205,19 @@ collectEffects(Operation *op,
if (auto iface = dyn_cast<MemoryEffectOpInterface>(op)) {
SmallVector<MemoryEffects::EffectInstance> localEffects;
iface.getEffects(localEffects);
- llvm::append_range(effects, localEffects);
+ // Filter out effects that cannot affect the fenced address spaces.
+ for (const MemoryEffects::EffectInstance &effect : localEffects) {
+ if (!effectCannotAffectAddressSpaces(effect, fencedAddressSpaces))
+ effects.push_back(effect);
+ }
return true;
}
if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) {
for (auto ®ion : op->getRegions()) {
for (auto &block : region) {
for (auto &innerOp : block)
- if (!collectEffects(&innerOp, effects, ignoreBarriers))
+ if (!collectEffects(&innerOp, effects, fencedAddressSpaces,
+ ignoreBarriers))
return false;
}
}
@@ -120,22 +232,22 @@ collectEffects(Operation *op,
/// Get all effects before the given operation caused by other operations in the
/// same block. That is, this will not consider operations beyond the block.
-static bool
-getEffectsBeforeInBlock(Operation *op,
- SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
- bool stopAtBarrier) {
+static bool getEffectsBeforeInBlock(
+ Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+ bool stopAtBarrier) {
if (op == &op->getBlock()->front())
return true;
for (Operation *it = op->getPrevNode(); it != nullptr;
it = it->getPrevNode()) {
- if (isa<BarrierOp>(it)) {
+ if (isBarrierWithCommonFencedMemory(it, fencedAddressSpaces)) {
if (stopAtBarrier)
return true;
continue;
}
- if (!collectEffects(it, effects))
+ if (!collectEffects(it, effects, fencedAddressSpaces))
return false;
}
return true;
@@ -147,10 +259,10 @@ getEffectsBeforeInBlock(Operation *op,
/// set. Returns `true` if the memory effects added to `effects` are exact,
/// `false` if they are a conservative over-approximation. The latter means that
/// `effects` contain instances not associated with a specific value.
-static bool
-getEffectsBefore(Operation *op,
- SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
- bool stopAtBarrier) {
+static bool getEffectsBefore(
+ Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+ bool stopAtBarrier) {
if (!op->getBlock())
return true;
@@ -162,7 +274,7 @@ getEffectsBefore(Operation *op,
}
// Collect all effects before the op.
- getEffectsBeforeInBlock(op, effects, stopAtBarrier);
+ getEffectsBeforeInBlock(op, effects, fencedAddressSpaces, stopAtBarrier);
// Stop if reached the parallel region boundary.
if (isParallelRegionBoundary(op->getParentOp()))
@@ -171,7 +283,7 @@ getEffectsBefore(Operation *op,
Operation *parent = op->getParentOp();
// Otherwise, keep collecting above the parent operation.
if (!parent->hasTrait<OpTrait::IsIsolatedFromAbove>() &&
- !getEffectsBefore(parent, effects, stopAtBarrier))
+ !getEffectsBefore(parent, effects, fencedAddressSpaces, stopAtBarrier))
return false;
// If the op is loop-like, collect effects from the trailing operations until
@@ -191,7 +303,7 @@ getEffectsBefore(Operation *op,
if (isSequentialLoopLike(parent)) {
// Assuming loop terminators have no side effects.
return getEffectsBeforeInBlock(op->getBlock()->getTerminator(), effects,
- /*stopAtBarrier=*/true);
+ fencedAddressSpaces, /*stopAtBarrier=*/true);
}
// If the parent operation is not guaranteed to execute its (single-block)
@@ -201,7 +313,7 @@ getEffectsBefore(Operation *op,
op->getParentOp()->walk([&](Operation *in) {
if (conservative)
return WalkResult::interrupt();
- if (!collectEffects(in, effects)) {
+ if (!collectEffects(in, effects, fencedAddressSpaces)) {
conservative = true;
return WalkResult::interrupt();
}
@@ -213,21 +325,21 @@ getEffectsBefore(Operation *op,
/// Get all effects after the given operation caused by other operations in the
/// same block. That is, this will not consider operations beyond the block.
-static bool
-getEffectsAfterInBlock(Operation *op,
- SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
- bool stopAtBarrier) {
+static bool getEffectsAfterInBlock(
+ Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+ bool stopAtBarrier) {
if (op == &op->getBlock()->back())
return true;
for (Operation *it = op->getNextNode(); it != nullptr;
it = it->getNextNode()) {
- if (isa<BarrierOp>(it)) {
+ if (isBarrierWithCommonFencedMemory(it, fencedAddressSpaces)) {
if (stopAtBarrier)
return true;
continue;
}
- if (!collectEffects(it, effects))
+ if (!collectEffects(it, effects, fencedAddressSpaces))
return false;
}
return true;
@@ -239,10 +351,10 @@ getEffectsAfterInBlock(Operation *op,
/// set. Returns `true` if the memory effects added to `effects` are exact,
/// `false` if they are a conservative over-approximation. The latter means that
/// `effects` contain instances not associated with a specific value.
-static bool
-getEffectsAfter(Operation *op,
- SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
- bool stopAtBarrier) {
+static bool getEffectsAfter(
+ Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+ bool stopAtBarrier) {
if (!op->getBlock())
return true;
@@ -254,7 +366,7 @@ getEffectsAfter(Operation *op,
}
// Collect all effects after the op.
- getEffectsAfterInBlock(op, effects, stopAtBarrier);
+ getEffectsAfterInBlock(op, effects, fencedAddressSpaces, stopAtBarrier);
Operation *parent = op->getParentOp();
// Stop if reached the parallel region boundary.
@@ -264,7 +376,7 @@ getEffectsAfter(Operation *op,
// Otherwise, keep collecting below the parent operation.
// Don't look into, for example, neighboring functions
if (!parent->hasTrait<OpTrait::IsIsolatedFromAbove>() &&
- !getEffectsAfter(parent, effects, stopAtBarrier))
+ !getEffectsAfter(parent, effects, fencedAddressSpaces, stopAtBarrier))
return false;
// If the op is loop-like, collect effects from the leading operations until
@@ -282,11 +394,14 @@ getEffectsAfter(Operation *op,
// operation `op2` at iteration `i-1` and the side effects must be ordered
// appropriately.
if (isSequentialLoopLike(parent)) {
- if (isa<BarrierOp>(op->getBlock()->front()))
+ if (isBarrierWithCommonFencedMemory(&op->getBlock()->front(),
+ fencedAddressSpaces))
return true;
- bool exact = collectEffects(&op->getBlock()->front(), effects);
+ bool exact =
+ collectEffects(&op->getBlock()->front(), effects, fencedAddressSpaces);
return getEffectsAfterInBlock(&op->getBlock()->front(), effects,
+ fencedAddressSpaces,
/*stopAtBarrier=*/true) &&
exact;
}
@@ -298,7 +413,7 @@ getEffectsAfter(Operation *op,
op->getParentOp()->walk([&](Operation *in) {
if (conservative)
return WalkResult::interrupt();
- if (!collectEffects(in, effects)) {
+ if (!collectEffects(in, effects, fencedAddressSpaces)) {
conservative = true;
return WalkResult::interrupt();
}
@@ -308,35 +423,6 @@ getEffectsAfter(Operation *op,
return !conservative;
}
-/// Looks through known "view-like" ops to find the base memref.
-static Value getBase(Value v) {
- while (true) {
- Operation *definingOp = v.getDefiningOp();
- if (!definingOp)
- break;
-
- bool shouldContinue =
- TypeSwitch<Operation *, bool>(v.getDefiningOp())
- .Case<memref::CastOp, memref::SubViewOp, memref::ViewOp>(
- [&](auto op) {
- v = op.getSource();
- return true;
- })
- .Case<memref::TransposeOp>([&](auto op) {
- v = op.getIn();
- return true;
- })
- .Case<memref::CollapseShapeOp, memref::ExpandShapeOp>([&](auto op) {
- v = op.getSrc();
- return true;
- })
- .Default(false);
- if (!shouldContinue)
- break;
- }
- return v;
-}
-
/// Returns `true` if the value is defined as a function argument.
static bool isFunctionArgument(Value v) {
auto arg = dyn_cast<BlockArgument>(v);
@@ -352,8 +438,6 @@ static Value propagatesCapture(Operation *op) {
[](ViewLikeOpInterface viewLike) { return viewLike.getViewSource(); })
.Case([](CastOpInterface castLike) { return castLike->getOperand(0); })
.Case([](memref::TransposeOp transpose) { return transpose.getIn(); })
- .Case<memref::ExpandShapeOp, memref::CollapseShapeOp>(
- [](auto op) { return op.getSrc(); })
.Default(nullptr);
}
@@ -583,11 +667,29 @@ class BarrierElimination final : public OpRewritePattern<BarrierOp> {
LDBG() << "checking the necessity of: " << barrier << " "
<< barrier.getLoc();
+ std::optional<ArrayAttr> fencedMemSpaces = barrier.getAddressSpaces();
+ if (fencedMemSpaces && fencedMemSpaces->empty()) {
+ LDBG()
+ << "barrier is not used to synchronize memory accesses, retain it\n";
+ return failure();
+ }
+
+ // Convert the fenced address spaces to the proper type for passing through.
+ SmallVector<gpu::AddressSpaceAttr> fencedSpacesStorage;
+ std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedSpaces;
+ if (fencedMemSpaces) {
+ for (Attribute attr : *fencedMemSpaces)
+ fencedSpacesStorage.push_back(cast<gpu::AddressSpaceAttr>(attr));
+ fencedSpaces = fencedSpacesStorage;
+ }
+
SmallVector<MemoryEffects::EffectInstance> beforeEffects;
- getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier=*/true);
+ getEffectsBefore(barrier, beforeEffects, fencedSpaces,
+ /*stopAtBarrier=*/true);
SmallVector<MemoryEffects::EffectInstance> afterEffects;
- getEffectsAfter(barrier, afterEffects, /*stopAtBarrier=*/true);
+ getEffectsAfter(barrier, afterEffects, fencedSpaces,
+ /*stopAtBarrier=*/true);
if (!haveConflictingEffects(beforeEffects, afterEffects)) {
LDBG() << "the surrounding barriers are sufficient, removing " << barrier;
diff --git a/mlir/test/Dialect/GPU/barrier-elimination.mlir b/mlir/test/Dialect/GPU/barrier-elimination.mlir
index 7f6619adcd78f..1af973e041373 100644
--- a/mlir/test/Dialect/GPU/barrier-elimination.mlir
+++ b/mlir/test/Dialect/GPU/barrier-elimination.mlir
@@ -106,6 +106,34 @@ func.func @read_read_write_loop_trailing_sync(%arg0: memref<?xf32>, %arg1: f32)
return
}
+// CHECK-LABEL: @read_read_write_loop_trailing_sync_non_memory_barrier
+func.func @read_read_write_loop_trailing_sync_non_memory_barrier(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
+ %c0 = arith.constant 0 : index
+ %c42 = arith.constant 42 : index
+ %c1 = arith.constant 1 : index
+ scf.for %i = %c0 to %c42 step %c1 {
+ // CHECK: load
+ %0 = memref.load %arg0[%i] : memref<?xf32>
+ // This can't be removed because it's a barrier that isn't fencing memory. We
+ // don't know why it's here, so we leave it alone.
+ // CHECK: gpu.barrier memfence []
+ gpu.barrier memfence []
+ // However, this can be removed as with the previoius example.
+ // CHECK-NOT: gpu.barrier
+ gpu.barrier
+ // CHECK: load
+ %1 = memref.load %arg0[%i] : memref<?xf32>
+ %2 = arith.addf %0, %1 : f32
+ // CHECK: gpu.barrier
+ gpu.barrier
+ // CHECK: store
+ memref.store %2, %arg0[%i] : memref<?xf32>
+ // CHECK: gpu.barrier
+ gpu.barrier
+ }
+ return
+}
+
// CHECK-LABEL: @write_write_noalias
func.func @write_write_noalias(%arg0: index, %arg1: f32) -> (memref<42xf32>, memref<10xf32>)
attributes {__parallel_region_boundary_for_test} {
@@ -199,3 +227,149 @@ func.func @nested_loop_barrier_only() attributes {__parallel_region_boundary_for
}
return
}
+
+
+// CHECK-LABEL: @workgroup_barrier_global_memory
+func.func @workgroup_barrier_global_memory(
+ %global: memref<?xf32, #gpu.address_space<global>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // The barrier only fences workgroup memory, so the global memory write/read
+ // conflict doesn't matter - barrier can be removed.
+ // CHECK-NOT: gpu.barrier
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ // CHECK: load
+ %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ return %0 : f32
+}
+
+
+// CHECK-LABEL: @workgroup_barrier_workgroup_memory
+func.func @workgroup_barrier_workgroup_memory(
+ %workgroup: memref<?xf32, #gpu.address_space<workgroup>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+ // The barrier fences workgroup memory and there's a write/read conflict on
+ // workgroup memory - barrier must be retained.
+ // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ // CHECK: load
+ %0 = memref.load %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+ return %0 : f32
+}
+
+// Two barriers with non-overlapping address space sets: the inner workgroup
+// barrier should not stop at the outer global barrier.
+// CHECK-LABEL: @non_overlapping_barriers
+func.func @non_overlapping_barriers(
+ %global: memref<?xf32, #gpu.address_space<global>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // This global barrier guards the write/read on global memory.
+ // CHECK: gpu.barrier memfence [#gpu.address_space<global>]
+ gpu.barrier memfence [#gpu.address_space<global>]
+ // This workgroup barrier can be removed, but shouldn't cause the barrier above
+ // to be removed.
+ // CHECK-NOT: gpu.barrier memfence [#gpu.address_space<workgroup>]
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ // CHECK: load
+ %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ return %0 : f32
+}
+
+// CHECK-LABEL: @unknown_address_space
+func.func @unknown_address_space(
+ %unknown: memref<?xf32>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %unknown[%idx] : memref<?xf32>
+ // This barrier cannot be removed because the unknown-memory-space memref could
+ // point to workgroup memory.
+ // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ // CHECK: load
+ %0 = memref.load %unknown[%idx] : memref<?xf32>
+ return %0 : f32
+}
+
+// CHECK-LABEL: @mixed_address_spaces
+func.func @mixed_address_spaces(
+ %global: memref<?xf32, #gpu.address_space<global>>,
+ %workgroup: memref<?xf32, #gpu.address_space<workgroup>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // CHECK: store
+ memref.store %val, %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+ // Barrier fences both global and workgroup. There are conflicts on at least one of them,
+ // so the barrier must be retained.
+ // CHECK: gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
+ gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
+ // CHECK: load
+ %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // CHECK: load
+ %1 = memref.load %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+ %2 = arith.addf %0, %1 : f32
+ return %2 : f32
+}
+
+// CHECK-LABEL: @full_barrier_with_global_conflict
+func.func @full_barrier_with_global_conflict(
+ %global: memref<?xf32, #gpu.address_space<global>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // CHECK: gpu.barrier{{$}}
+ gpu.barrier
+ // CHECK: load
+ %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ return %0 : f32
+}
+
+// CHECK-LABEL: @barrier_fencing_nothing_removed
+func.func @barrier_fencing_nothing_removed()
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK-NOT: gpu.barrier
+ gpu.barrier
+ return
+}
+
+// CHECK-LABEL: @empty_barrrier_retained
+func.func @empty_barrrier_retained()
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: gpu.barrier memfence []
+ gpu.barrier memfence []
+ return
+}
+
+// CHECK-LABEL: @read_write_loop_no_workgroup
+func.func @read_write_loop_no_workgroup(%arg0: memref<?xf32, #gpu.address_space<global>>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
+ %c0 = arith.constant 0 : index
+ %c42 = arith.constant 42 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: scf.for
+ scf.for %i = %c0 to %c42 step %c1 {
+ // Barrier can be eliminated because it only fences workgroup memory, which
+ // this loop does not use.
+ // CHECK-NOT: gpu.barrier
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ // CHECK: load
+ %0 = memref.load %arg0[%i] : memref<?xf32, #gpu.address_space<global>>
+ %1 = arith.addf %0, %0 : f32
+ // Fences workgroup memory and so has no efect here.
+ // CHECK-NOT: gpu.barrier
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ // CHECK: store
+ memref.store %0, %arg0[%i] : memref<?xf32, #gpu.address_space<global>>
+ }
+ return
+}
>From 747ee4b2b72feeefe5729f394319b4e0076dbd9b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 27 Jan 2026 09:11:36 -0800
Subject: [PATCH 2/3] TypeSwitch style fix
Co-authored-by: Jakub Kuderski <kubakuderski at gmail.com>
---
mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
index 90fe9ebf9ae1e..bac5d09e9ef2a 100644
--- a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
@@ -86,11 +86,11 @@ static Value getBase(Value v) {
break;
bool shouldContinue = TypeSwitch<Operation *, bool>(v.getDefiningOp())
- .Case<ViewLikeOpInterface>([&](auto op) {
+ .Case([&](ViewLikeOpInterface op) {
v = op.getViewSource();
return true;
})
- .Case<memref::TransposeOp>([&](auto op) {
+ .Case([&](memref::TransposeOp op) {
v = op.getIn();
return true;
})
>From 71b3c6ccf3c49ab455ffda2af37471dc1d62b19b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 27 Jan 2026 17:44:44 +0000
Subject: [PATCH 3/3] Review comments - expanded on logic, removed excess
double negatives. Also added some extra tests.
---
.../GPU/Transforms/EliminateBarriers.cpp | 38 ++++++++++++-------
.../test/Dialect/GPU/barrier-elimination.mlir | 33 ++++++++++++++++
2 files changed, 58 insertions(+), 13 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
index bac5d09e9ef2a..8b19f4fb9f1f7 100644
--- a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
@@ -120,19 +120,22 @@ static bool isAddressSpacePotentiallyFenced(
return llvm::is_contained(*fencedAddressSpaces, gpuMemSpace);
}
-/// Returns `true` if the effect operates on a memref whose memory space
-/// cannot be one of the given fenced address spaces. This will both look at the
-/// address space of the effect's operand and of the base memref for that
-/// operand so as to handle casts that add and remove address space information.
-static bool effectCannotAffectAddressSpaces(
+/// Succeeds if the effect operates on a memref whose memory space
+/// could be one of the given fenced address spaces. This will both look at the
+/// address space of the effect's operand and of the view-like operations that
+/// define that memref, so as to inspect any memory-space casts or similar
+/// operations (like amdgpu buffer casts) that may provide more information.
+/// This assumes that directly-conflicting casts (that is, for example, casting
+/// a memref in global memory to make it one in workspace memory) can't happen.
+static LogicalResult effectMightAffectAddressSpaces(
const MemoryEffects::EffectInstance &effect,
std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
if (!fencedAddressSpaces)
- return false;
+ return success();
Value value = effect.getValue();
if (!value)
- return false;
+ return success();
auto mightMatch = [&](Value v) {
auto memrefType = dyn_cast<BaseMemRefType>(v.getType());
@@ -143,10 +146,18 @@ static bool effectCannotAffectAddressSpaces(
};
if (!mightMatch(value))
- return true;
+ return failure();
+
+ Value base = value;
+ while (auto viewLike = base.getDefiningOp<ViewLikeOpInterface>()) {
+ base = viewLike.getViewSource();
+ // We assume that we won't see directly incompatible casts, like global =>
+ // flat/null => workspace.
+ if (!mightMatch(base))
+ return failure();
+ }
- Value base = getBase(value);
- return !mightMatch(base);
+ return success();
}
/// Returns `true` if `op` is a `BarrierOp` that fences any address spaces that
@@ -207,7 +218,8 @@ static bool collectEffects(
iface.getEffects(localEffects);
// Filter out effects that cannot affect the fenced address spaces.
for (const MemoryEffects::EffectInstance &effect : localEffects) {
- if (!effectCannotAffectAddressSpaces(effect, fencedAddressSpaces))
+ if (succeeded(
+ effectMightAffectAddressSpaces(effect, fencedAddressSpaces)))
effects.push_back(effect);
}
return true;
@@ -678,8 +690,8 @@ class BarrierElimination final : public OpRewritePattern<BarrierOp> {
SmallVector<gpu::AddressSpaceAttr> fencedSpacesStorage;
std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedSpaces;
if (fencedMemSpaces) {
- for (Attribute attr : *fencedMemSpaces)
- fencedSpacesStorage.push_back(cast<gpu::AddressSpaceAttr>(attr));
+ fencedSpacesStorage = llvm::map_to_vector(
+ *fencedMemSpaces, llvm::CastTo<gpu::AddressSpaceAttr>);
fencedSpaces = fencedSpacesStorage;
}
diff --git a/mlir/test/Dialect/GPU/barrier-elimination.mlir b/mlir/test/Dialect/GPU/barrier-elimination.mlir
index 1af973e041373..b9ceb8a8d424b 100644
--- a/mlir/test/Dialect/GPU/barrier-elimination.mlir
+++ b/mlir/test/Dialect/GPU/barrier-elimination.mlir
@@ -373,3 +373,36 @@ func.func @read_write_loop_no_workgroup(%arg0: memref<?xf32, #gpu.address_space<
}
return
}
+
+// CHECK-LABEL: @global_barrier_buffer_cast
+func.func @global_barrier_buffer_cast(
+ %global: memref<?xf32, #gpu.address_space<global>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // The buffer cast below shouldn't make this barrier go away
+ // CHECK: gpu.barrier memfence [#gpu.address_space<global>]
+ gpu.barrier memfence [#gpu.address_space<global>]
+ %cast = amdgpu.fat_raw_buffer_cast %global : memref<?xf32, #gpu.address_space<global>> to memref<?xf32, #amdgpu.address_space<fat_raw_buffer>>
+ // CHECK: load
+ %0 = memref.load %cast[%idx] : memref<?xf32, #amdgpu.address_space<fat_raw_buffer>>
+ return %0 : f32
+}
+
+// CHECK-LABEL: @workgroup_barrier_buffer_cast
+func.func @workgroup_barrier_buffer_cast(
+ %global: memref<?xf32, #gpu.address_space<global>>,
+ %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+ // CHECK: store
+ memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+ // The amdgpu buffer resource is formed from global memory, and so can't alias
+ // workgroup memory, so this barrier can be eliminated.
+ // CHECK-NOT: gpu.barrierw
+ gpu.barrier memfence [#gpu.address_space<workgroup>]
+ %cast = amdgpu.fat_raw_buffer_cast %global : memref<?xf32, #gpu.address_space<global>> to memref<?xf32, #amdgpu.address_space<fat_raw_buffer>>
+ // CHECK: load
+ %0 = memref.load %cast[%idx] : memref<?xf32, #amdgpu.address_space<fat_raw_buffer>>
+ return %0 : f32
+}
More information about the Mlir-commits
mailing list