[Mlir-commits] [mlir] [milr][gpu] Make barrier elimination address-space aware (PR #178101)

Mon Jan 26 17:57:54 PST 2026

https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/178101

Upgrade the barrier eliminiation pass to account for the address spaces of accessed memory when deciding which barriers to eliminiate. In particular, a loop that only reads and writes global memory that has a workgoup-memory-fencing barrier inside of it will now have that barrier marked for elimiination, as the global memory traffic is not being synchronized by the barrier.

The pass is also adjusted to ignore barriers whose memory fencing list is [], as those do not synchronize memory and therefore the logic in this pass would potentially incorrectly remove them after proving that fact.

>From 1886eb02c7e1ec5b93136f7f2d3c735976a261da Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 27 Jan 2026 01:49:01 +0000
Subject: [PATCH] [milr][gpu] Make barrier elimination address-space aware

Upgrade the barrier eliminiation pass to account for the address spaces
of accessed memory when deciding which barriers to eliminiate. In particular,
a loop that only reads and writes global memory that has a workgoup-memory-fencing
barrier inside of it will now have that barrier marked for elimiination, as the
global memory traffic is not being synchronized by the barrier.

The pass is also adjusted to ignore barriers whose memory fencing list is [],
as those do not synchronize memory and therefore the logic in this pass would
potentially incorrectly remove them after proving that fact.
---
 .../GPU/Transforms/EliminateBarriers.cpp      | 240 +++++++++++++-----
 .../test/Dialect/GPU/barrier-elimination.mlir | 174 +++++++++++++
 2 files changed, 345 insertions(+), 69 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
index 025d1acf8d6ba..90fe9ebf9ae1e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/EliminateBarriers.cpp
@@ -78,14 +78,121 @@ static void addAllValuelessEffects(
   effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Free>());
 }
 
+/// Looks through known "view-like" ops to find the base memref.
+static Value getBase(Value v) {
+  while (true) {
+    Operation *definingOp = v.getDefiningOp();
+    if (!definingOp)
+      break;
+
+    bool shouldContinue = TypeSwitch<Operation *, bool>(v.getDefiningOp())
+                              .Case<ViewLikeOpInterface>([&](auto op) {
+                                v = op.getViewSource();
+                                return true;
+                              })
+                              .Case<memref::TransposeOp>([&](auto op) {
+                                v = op.getIn();
+                                return true;
+                              })
+                              .Default(false);
+    if (!shouldContinue)
+      break;
+  }
+  return v;
+}
+
+/// Returns `true` if accesses to the given memory space could potentially be
+/// fenced by a barrier synchoronizing on the given `fencedAddressSpaces`. If
+/// the set of address spaces is not given, it is equal to all possible address
+/// spaces. Memory spaces that are not `#gpu.address_space` are deemed to
+/// overlap with all GPU address spaces.
+static bool isAddressSpacePotentiallyFenced(
+    Attribute memorySpace,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
+  if (!fencedAddressSpaces)
+    return true;
+
+  auto gpuMemSpace = dyn_cast_if_present<gpu::AddressSpaceAttr>(memorySpace);
+  if (!gpuMemSpace)
+    return true;
+
+  // Check if this GPU address space is in the fenced set.
+  return llvm::is_contained(*fencedAddressSpaces, gpuMemSpace);
+}
+
+/// Returns `true` if the effect operates on a memref whose memory  space
+/// cannot be one of the given fenced address spaces. This will both look at the
+/// address space of the effect's operand and of the base memref for that
+/// operand so as to handle casts that add and remove address space information.
+static bool effectCannotAffectAddressSpaces(
+    const MemoryEffects::EffectInstance &effect,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
+  if (!fencedAddressSpaces)
+    return false;
+
+  Value value = effect.getValue();
+  if (!value)
+    return false;
+
+  auto mightMatch = [&](Value v) {
+    auto memrefType = dyn_cast<BaseMemRefType>(v.getType());
+    if (!memrefType)
+      return true;
+    return isAddressSpacePotentiallyFenced(memrefType.getMemorySpace(),
+                                           fencedAddressSpaces);
+  };
+
+  if (!mightMatch(value))
+    return true;
+
+  Value base = getBase(value);
+  return !mightMatch(base);
+}
+
+/// Returns `true` if `op` is a `BarrierOp` that fences any address spaces that
+/// could overlap with the given fenced address spaces.
+static bool isBarrierWithCommonFencedMemory(
+    Operation *op,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces) {
+  auto barrier = dyn_cast<BarrierOp>(op);
+  if (!barrier)
+    return false;
+
+  std::optional<ArrayAttr> otherFencedSpaces = barrier.getAddressSpaces();
+  // Barriers with unspecified fencing fence everything.
+  if (!otherFencedSpaces)
+    return true;
+  // while barriers that fence nothing can't close off our search.
+  if (otherFencedSpaces->empty())
+    return false;
+
+  // If we fence all memory, we've got fencing in common with anything but the
+  // non-merory barrier.
+  if (!fencedAddressSpaces)
+    return true;
+
+  // Check if there's any intersection between the two sets.
+  for (Attribute ourSpace : *fencedAddressSpaces) {
+    if (llvm::is_contained(*otherFencedSpaces, ourSpace))
+      return true;
+  }
+
+  return false;
+}
+
 /// Collect the memory effects of the given op in 'effects'. Returns 'true' if
 /// it could extract the effect information from the op, otherwise returns
 /// 'false' and conservatively populates the list with all possible effects
-/// associated with no particular value or symbol.
-static bool
-collectEffects(Operation *op,
-               SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
-               bool ignoreBarriers = true) {
+/// associated with no particular value or symbol. `fencedAddressSpaces` is,
+/// if given, the set of GPU memory spaces that are being synchronized by the
+/// barrier being syrchronized - memory operations where the value being
+/// impacted is known and either it or its base value have an address space that
+/// is known to be distinct from the ones being synchronized on will not be
+/// included in the effect set.
+static bool collectEffects(
+    Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+    bool ignoreBarriers = true) {
   // Skip over barriers to avoid infinite recursion (those barriers would ask
   // this barrier again).
   if (ignoreBarriers && isa<BarrierOp>(op))
@@ -98,14 +205,19 @@ collectEffects(Operation *op,
   if (auto iface = dyn_cast<MemoryEffectOpInterface>(op)) {
     SmallVector<MemoryEffects::EffectInstance> localEffects;
     iface.getEffects(localEffects);
-    llvm::append_range(effects, localEffects);
+    // Filter out effects that cannot affect the fenced address spaces.
+    for (const MemoryEffects::EffectInstance &effect : localEffects) {
+      if (!effectCannotAffectAddressSpaces(effect, fencedAddressSpaces))
+        effects.push_back(effect);
+    }
     return true;
   }
   if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) {
     for (auto &region : op->getRegions()) {
       for (auto &block : region) {
         for (auto &innerOp : block)
-          if (!collectEffects(&innerOp, effects, ignoreBarriers))
+          if (!collectEffects(&innerOp, effects, fencedAddressSpaces,
+                              ignoreBarriers))
             return false;
       }
     }
@@ -120,22 +232,22 @@ collectEffects(Operation *op,
 
 /// Get all effects before the given operation caused by other operations in the
 /// same block. That is, this will not consider operations beyond the block.
-static bool
-getEffectsBeforeInBlock(Operation *op,
-                        SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
-                        bool stopAtBarrier) {
+static bool getEffectsBeforeInBlock(
+    Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+    bool stopAtBarrier) {
   if (op == &op->getBlock()->front())
     return true;
 
   for (Operation *it = op->getPrevNode(); it != nullptr;
        it = it->getPrevNode()) {
-    if (isa<BarrierOp>(it)) {
+    if (isBarrierWithCommonFencedMemory(it, fencedAddressSpaces)) {
       if (stopAtBarrier)
         return true;
       continue;
     }
 
-    if (!collectEffects(it, effects))
+    if (!collectEffects(it, effects, fencedAddressSpaces))
       return false;
   }
   return true;
@@ -147,10 +259,10 @@ getEffectsBeforeInBlock(Operation *op,
 /// set. Returns `true` if the memory effects added to `effects` are exact,
 /// `false` if they are a conservative over-approximation. The latter means that
 /// `effects` contain instances not associated with a specific value.
-static bool
-getEffectsBefore(Operation *op,
-                 SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
-                 bool stopAtBarrier) {
+static bool getEffectsBefore(
+    Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+    bool stopAtBarrier) {
   if (!op->getBlock())
     return true;
 
@@ -162,7 +274,7 @@ getEffectsBefore(Operation *op,
   }
 
   // Collect all effects before the op.
-  getEffectsBeforeInBlock(op, effects, stopAtBarrier);
+  getEffectsBeforeInBlock(op, effects, fencedAddressSpaces, stopAtBarrier);
 
   // Stop if reached the parallel region boundary.
   if (isParallelRegionBoundary(op->getParentOp()))
@@ -171,7 +283,7 @@ getEffectsBefore(Operation *op,
   Operation *parent = op->getParentOp();
   // Otherwise, keep collecting above the parent operation.
   if (!parent->hasTrait<OpTrait::IsIsolatedFromAbove>() &&
-      !getEffectsBefore(parent, effects, stopAtBarrier))
+      !getEffectsBefore(parent, effects, fencedAddressSpaces, stopAtBarrier))
     return false;
 
   // If the op is loop-like, collect effects from the trailing operations until
@@ -191,7 +303,7 @@ getEffectsBefore(Operation *op,
   if (isSequentialLoopLike(parent)) {
     // Assuming loop terminators have no side effects.
     return getEffectsBeforeInBlock(op->getBlock()->getTerminator(), effects,
-                                   /*stopAtBarrier=*/true);
+                                   fencedAddressSpaces, /*stopAtBarrier=*/true);
   }
 
   // If the parent operation is not guaranteed to execute its (single-block)
@@ -201,7 +313,7 @@ getEffectsBefore(Operation *op,
     op->getParentOp()->walk([&](Operation *in) {
       if (conservative)
         return WalkResult::interrupt();
-      if (!collectEffects(in, effects)) {
+      if (!collectEffects(in, effects, fencedAddressSpaces)) {
         conservative = true;
         return WalkResult::interrupt();
       }
@@ -213,21 +325,21 @@ getEffectsBefore(Operation *op,
 
 /// Get all effects after the given operation caused by other operations in the
 /// same block. That is, this will not consider operations beyond the block.
-static bool
-getEffectsAfterInBlock(Operation *op,
-                       SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
-                       bool stopAtBarrier) {
+static bool getEffectsAfterInBlock(
+    Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+    bool stopAtBarrier) {
   if (op == &op->getBlock()->back())
     return true;
 
   for (Operation *it = op->getNextNode(); it != nullptr;
        it = it->getNextNode()) {
-    if (isa<BarrierOp>(it)) {
+    if (isBarrierWithCommonFencedMemory(it, fencedAddressSpaces)) {
       if (stopAtBarrier)
         return true;
       continue;
     }
-    if (!collectEffects(it, effects))
+    if (!collectEffects(it, effects, fencedAddressSpaces))
       return false;
   }
   return true;
@@ -239,10 +351,10 @@ getEffectsAfterInBlock(Operation *op,
 /// set. Returns `true` if the memory effects added to `effects` are exact,
 /// `false` if they are a conservative over-approximation. The latter means that
 /// `effects` contain instances not associated with a specific value.
-static bool
-getEffectsAfter(Operation *op,
-                SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
-                bool stopAtBarrier) {
+static bool getEffectsAfter(
+    Operation *op, SmallVectorImpl<MemoryEffects::EffectInstance> &effects,
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedAddressSpaces,
+    bool stopAtBarrier) {
   if (!op->getBlock())
     return true;
 
@@ -254,7 +366,7 @@ getEffectsAfter(Operation *op,
   }
 
   // Collect all effects after the op.
-  getEffectsAfterInBlock(op, effects, stopAtBarrier);
+  getEffectsAfterInBlock(op, effects, fencedAddressSpaces, stopAtBarrier);
 
   Operation *parent = op->getParentOp();
   // Stop if reached the parallel region boundary.
@@ -264,7 +376,7 @@ getEffectsAfter(Operation *op,
   // Otherwise, keep collecting below the parent operation.
   // Don't look into, for example, neighboring functions
   if (!parent->hasTrait<OpTrait::IsIsolatedFromAbove>() &&
-      !getEffectsAfter(parent, effects, stopAtBarrier))
+      !getEffectsAfter(parent, effects, fencedAddressSpaces, stopAtBarrier))
     return false;
 
   // If the op is loop-like, collect effects from the leading operations until
@@ -282,11 +394,14 @@ getEffectsAfter(Operation *op,
   // operation `op2` at iteration `i-1` and the side effects must be ordered
   // appropriately.
   if (isSequentialLoopLike(parent)) {
-    if (isa<BarrierOp>(op->getBlock()->front()))
+    if (isBarrierWithCommonFencedMemory(&op->getBlock()->front(),
+                                        fencedAddressSpaces))
       return true;
 
-    bool exact = collectEffects(&op->getBlock()->front(), effects);
+    bool exact =
+        collectEffects(&op->getBlock()->front(), effects, fencedAddressSpaces);
     return getEffectsAfterInBlock(&op->getBlock()->front(), effects,
+                                  fencedAddressSpaces,
                                   /*stopAtBarrier=*/true) &&
            exact;
   }
@@ -298,7 +413,7 @@ getEffectsAfter(Operation *op,
     op->getParentOp()->walk([&](Operation *in) {
       if (conservative)
         return WalkResult::interrupt();
-      if (!collectEffects(in, effects)) {
+      if (!collectEffects(in, effects, fencedAddressSpaces)) {
         conservative = true;
         return WalkResult::interrupt();
       }
@@ -308,35 +423,6 @@ getEffectsAfter(Operation *op,
   return !conservative;
 }
 
-/// Looks through known "view-like" ops to find the base memref.
-static Value getBase(Value v) {
-  while (true) {
-    Operation *definingOp = v.getDefiningOp();
-    if (!definingOp)
-      break;
-
-    bool shouldContinue =
-        TypeSwitch<Operation *, bool>(v.getDefiningOp())
-            .Case<memref::CastOp, memref::SubViewOp, memref::ViewOp>(
-                [&](auto op) {
-                  v = op.getSource();
-                  return true;
-                })
-            .Case<memref::TransposeOp>([&](auto op) {
-              v = op.getIn();
-              return true;
-            })
-            .Case<memref::CollapseShapeOp, memref::ExpandShapeOp>([&](auto op) {
-              v = op.getSrc();
-              return true;
-            })
-            .Default(false);
-    if (!shouldContinue)
-      break;
-  }
-  return v;
-}
-
 /// Returns `true` if the value is defined as a function argument.
 static bool isFunctionArgument(Value v) {
   auto arg = dyn_cast<BlockArgument>(v);
@@ -352,8 +438,6 @@ static Value propagatesCapture(Operation *op) {
           [](ViewLikeOpInterface viewLike) { return viewLike.getViewSource(); })
       .Case([](CastOpInterface castLike) { return castLike->getOperand(0); })
       .Case([](memref::TransposeOp transpose) { return transpose.getIn(); })
-      .Case<memref::ExpandShapeOp, memref::CollapseShapeOp>(
-          [](auto op) { return op.getSrc(); })
       .Default(nullptr);
 }
 
@@ -583,11 +667,29 @@ class BarrierElimination final : public OpRewritePattern<BarrierOp> {
     LDBG() << "checking the necessity of: " << barrier << " "
            << barrier.getLoc();
 
+    std::optional<ArrayAttr> fencedMemSpaces = barrier.getAddressSpaces();
+    if (fencedMemSpaces && fencedMemSpaces->empty()) {
+      LDBG()
+          << "barrier is not used to synchronize memory accesses, retain it\n";
+      return failure();
+    }
+
+    // Convert the fenced address spaces to the proper type for passing through.
+    SmallVector<gpu::AddressSpaceAttr> fencedSpacesStorage;
+    std::optional<ArrayRef<gpu::AddressSpaceAttr>> fencedSpaces;
+    if (fencedMemSpaces) {
+      for (Attribute attr : *fencedMemSpaces)
+        fencedSpacesStorage.push_back(cast<gpu::AddressSpaceAttr>(attr));
+      fencedSpaces = fencedSpacesStorage;
+    }
+
     SmallVector<MemoryEffects::EffectInstance> beforeEffects;
-    getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier=*/true);
+    getEffectsBefore(barrier, beforeEffects, fencedSpaces,
+                     /*stopAtBarrier=*/true);
 
     SmallVector<MemoryEffects::EffectInstance> afterEffects;
-    getEffectsAfter(barrier, afterEffects, /*stopAtBarrier=*/true);
+    getEffectsAfter(barrier, afterEffects, fencedSpaces,
+                    /*stopAtBarrier=*/true);
 
     if (!haveConflictingEffects(beforeEffects, afterEffects)) {
       LDBG() << "the surrounding barriers are sufficient, removing " << barrier;
diff --git a/mlir/test/Dialect/GPU/barrier-elimination.mlir b/mlir/test/Dialect/GPU/barrier-elimination.mlir
index 7f6619adcd78f..1af973e041373 100644
--- a/mlir/test/Dialect/GPU/barrier-elimination.mlir
+++ b/mlir/test/Dialect/GPU/barrier-elimination.mlir
@@ -106,6 +106,34 @@ func.func @read_read_write_loop_trailing_sync(%arg0: memref<?xf32>, %arg1: f32)
   return
 }
 
+// CHECK-LABEL: @read_read_write_loop_trailing_sync_non_memory_barrier
+func.func @read_read_write_loop_trailing_sync_non_memory_barrier(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
+  %c0 = arith.constant 0 : index
+  %c42 = arith.constant 42 : index
+  %c1 = arith.constant 1 : index
+  scf.for %i = %c0 to %c42 step %c1 {
+    // CHECK: load
+    %0 = memref.load %arg0[%i] : memref<?xf32>
+    // This can't be removed because it's a barrier that isn't fencing memory. We
+    // don't know why it's here, so we leave it alone.
+    // CHECK: gpu.barrier memfence []
+    gpu.barrier memfence []
+    // However, this can be removed as with the previoius example.
+    // CHECK-NOT: gpu.barrier
+    gpu.barrier
+    // CHECK: load
+    %1 = memref.load %arg0[%i] : memref<?xf32>
+    %2 = arith.addf %0, %1 : f32
+    // CHECK: gpu.barrier
+    gpu.barrier
+    // CHECK: store
+    memref.store %2, %arg0[%i] : memref<?xf32>
+    // CHECK: gpu.barrier
+    gpu.barrier
+  }
+  return
+}
+
 // CHECK-LABEL: @write_write_noalias
 func.func @write_write_noalias(%arg0: index, %arg1: f32) -> (memref<42xf32>, memref<10xf32>)
 attributes {__parallel_region_boundary_for_test} {
@@ -199,3 +227,149 @@ func.func @nested_loop_barrier_only() attributes {__parallel_region_boundary_for
   }
   return
 }
+
+
+// CHECK-LABEL: @workgroup_barrier_global_memory
+func.func @workgroup_barrier_global_memory(
+    %global: memref<?xf32, #gpu.address_space<global>>,
+    %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: store
+  memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  // The barrier only fences workgroup memory, so the global memory write/read
+  // conflict doesn't matter - barrier can be removed.
+  // CHECK-NOT: gpu.barrier
+  gpu.barrier memfence [#gpu.address_space<workgroup>]
+  // CHECK: load
+  %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  return %0 : f32
+}
+
+
+// CHECK-LABEL: @workgroup_barrier_workgroup_memory
+func.func @workgroup_barrier_workgroup_memory(
+    %workgroup: memref<?xf32, #gpu.address_space<workgroup>>,
+    %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: store
+  memref.store %val, %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+  // The barrier fences workgroup memory and there's a write/read conflict on
+  // workgroup memory - barrier must be retained.
+  // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
+  gpu.barrier memfence [#gpu.address_space<workgroup>]
+  // CHECK: load
+  %0 = memref.load %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+  return %0 : f32
+}
+
+// Two barriers with non-overlapping address space sets: the inner workgroup
+// barrier should not stop at the outer global barrier.
+// CHECK-LABEL: @non_overlapping_barriers
+func.func @non_overlapping_barriers(
+    %global: memref<?xf32, #gpu.address_space<global>>,
+    %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: store
+  memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  // This global barrier guards the write/read on global memory.
+  // CHECK: gpu.barrier memfence [#gpu.address_space<global>]
+  gpu.barrier memfence [#gpu.address_space<global>]
+  // This workgroup barrier can be removed, but shouldn't cause the barrier above
+  // to be removed.
+  // CHECK-NOT: gpu.barrier memfence [#gpu.address_space<workgroup>]
+  gpu.barrier memfence [#gpu.address_space<workgroup>]
+  // CHECK: load
+  %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  return %0 : f32
+}
+
+// CHECK-LABEL: @unknown_address_space
+func.func @unknown_address_space(
+    %unknown: memref<?xf32>,
+    %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: store
+  memref.store %val, %unknown[%idx] : memref<?xf32>
+  // This barrier cannot be removed because the unknown-memory-space memref could
+  // point to workgroup memory.
+  // CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
+  gpu.barrier memfence [#gpu.address_space<workgroup>]
+  // CHECK: load
+  %0 = memref.load %unknown[%idx] : memref<?xf32>
+  return %0 : f32
+}
+
+// CHECK-LABEL: @mixed_address_spaces
+func.func @mixed_address_spaces(
+    %global: memref<?xf32, #gpu.address_space<global>>,
+    %workgroup: memref<?xf32, #gpu.address_space<workgroup>>,
+    %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: store
+  memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  // CHECK: store
+  memref.store %val, %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+  // Barrier fences both global and workgroup. There are conflicts on at least one of them,
+  // so the barrier must be retained.
+  // CHECK: gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
+  gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
+  // CHECK: load
+  %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  // CHECK: load
+  %1 = memref.load %workgroup[%idx] : memref<?xf32, #gpu.address_space<workgroup>>
+  %2 = arith.addf %0, %1 : f32
+  return %2 : f32
+}
+
+// CHECK-LABEL: @full_barrier_with_global_conflict
+func.func @full_barrier_with_global_conflict(
+    %global: memref<?xf32, #gpu.address_space<global>>,
+    %idx: index, %val: f32) -> f32
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: store
+  memref.store %val, %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  // CHECK: gpu.barrier{{$}}
+  gpu.barrier
+  // CHECK: load
+  %0 = memref.load %global[%idx] : memref<?xf32, #gpu.address_space<global>>
+  return %0 : f32
+}
+
+// CHECK-LABEL: @barrier_fencing_nothing_removed
+func.func @barrier_fencing_nothing_removed()
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK-NOT: gpu.barrier
+  gpu.barrier
+  return
+}
+
+// CHECK-LABEL: @empty_barrrier_retained
+func.func @empty_barrrier_retained()
+attributes {__parallel_region_boundary_for_test} {
+  // CHECK: gpu.barrier memfence []
+  gpu.barrier memfence []
+  return
+}
+
+// CHECK-LABEL: @read_write_loop_no_workgroup
+func.func @read_write_loop_no_workgroup(%arg0: memref<?xf32, #gpu.address_space<global>>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
+  %c0 = arith.constant 0 : index
+  %c42 = arith.constant 42 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: scf.for
+  scf.for %i = %c0 to %c42 step %c1 {
+    // Barrier can be eliminated because it only fences workgroup memory, which
+    // this loop does not use.
+    // CHECK-NOT: gpu.barrier
+    gpu.barrier memfence [#gpu.address_space<workgroup>]
+    // CHECK: load
+    %0 = memref.load %arg0[%i] : memref<?xf32, #gpu.address_space<global>>
+    %1 = arith.addf %0, %0 : f32
+    // Fences workgroup memory and so has no efect here.
+    // CHECK-NOT: gpu.barrier
+    gpu.barrier memfence [#gpu.address_space<workgroup>]
+    // CHECK: store
+    memref.store %0, %arg0[%i] : memref<?xf32, #gpu.address_space<global>>
+  }
+  return
+}