[llvm] [OpenMPOpt] Fix incorrect end-of-kernel barrier removal (PR #65670)
Daniel Woodworth via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 11 12:57:56 PDT 2023
https://github.com/dwoodwor-intel updated https://github.com/llvm/llvm-project/pull/65670:
>From b74d06bde6e7b8466a1497bd5e6f4e52eca019c4 Mon Sep 17 00:00:00 2001
From: Daniel Woodworth <daniel.woodworth at intel.com>
Date: Thu, 7 Sep 2023 15:54:02 -0400
Subject: [PATCH 1/3] [OpenMPOpt] Fix incorrect end-of-kernel barrier removal
Barrier removal in OpenMPOpt normally removes barriers by proving that
they are redundant with barriers preceding them. However, it can't do
this with the "pseudo-barrier" at the end of kernels because that can't
be removed. Instead, it removes the barriers preceding the end of the
kernel which that end-of-kernel barrier is redundant with. However,
these barriers aren't always redundant with the end-of-kernel barrier
when loops are involved, and removing them can lead to incorrect results
in compiled code.
This change fixes this by requiring that these pre-end-of-kernel
barriers are also only reaching aligned barriers before removing them.
---
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 16 +++++---
.../test/Transforms/OpenMP/barrier_removal.ll | 37 ++++++++++++++++++-
2 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 63493eb78c451a6..05697341eab64fa 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2676,17 +2676,19 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
if (!ED.EncounteredAssumes.empty() && !A.isModulePass())
return;
- // We can remove this barrier, if it is one, or all aligned barriers
- // reaching the kernel end. In the latter case we can transitively work
- // our way back until we find a barrier that guards a side-effect if we
- // are dealing with the kernel end here.
+ // We can remove this barrier, if it is one, or aligned barriers reaching
+ // the kernel end (if CB is nullptr). Aligned barriers reaching the kernel
+ // end may have other successors besides the kernel end (especially if
+ // they're in loops) with non-local side-effects, so those barriers can
+ // only be removed if they also only reach the kernel end. If those
+ // barriers have other barriers reaching them, those can be transitively
+ // removed as well.
if (CB) {
DeletedBarriers.insert(CB);
A.deleteAfterManifest(*CB);
++NumBarriersEliminated;
Changed = ChangeStatus::CHANGED;
} else if (!ED.AlignedBarriers.empty()) {
- NumBarriersEliminated += ED.AlignedBarriers.size();
Changed = ChangeStatus::CHANGED;
SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(),
ED.AlignedBarriers.end());
@@ -2697,7 +2699,11 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
continue;
if (LastCB->getFunction() != getAnchorScope())
continue;
+ const ExecutionDomainTy &PostLastED = CEDMap[{LastCB, POST}];
+ if (!PostLastED.IsReachingAlignedBarrierOnly)
+ continue;
if (!DeletedBarriers.count(LastCB)) {
+ ++NumBarriersEliminated;
A.deleteAfterManifest(*LastCB);
continue;
}
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index e45f746cbf4395d..4a7491e9404c0fa 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -736,13 +736,16 @@ define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p)
; MODULE-NEXT: call void @barrier_then_write0(ptr [[P]])
; MODULE-NEXT: br label [[T0B3:%.*]]
; MODULE: t0b3:
+; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: br label [[M3:%.*]]
; MODULE: f03:
; MODULE-NEXT: call void @barrier_then_write0(ptr [[P]])
; MODULE-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
; MODULE: t13:
+; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: br label [[M3]]
; MODULE: f13:
+; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: br label [[M3]]
; MODULE: m3:
; MODULE-NEXT: call void @write_then_barrier0(ptr [[P]])
@@ -1065,8 +1068,38 @@ define void @caller_barrier2() "kernel" {
ret void
}
+define void @loop_barrier() "kernel" {
+; CHECK-LABEL: define {{[^@]+}}@loop_barrier
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @aligned_barrier()
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I_NEXT]], 128
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ call void @unknown()
+ call void @aligned_barrier()
+ %i.next = add nuw nsw i32 %i, 1
+ %cond = icmp ne i32 %i.next, 128
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
!llvm.module.flags = !{!16,!15}
-!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25}
+!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26}
!0 = !{ptr @pos_empty_1, !"kernel", i32 1}
!1 = !{ptr @pos_empty_2, !"kernel", i32 1}
@@ -1079,6 +1112,7 @@ define void @caller_barrier2() "kernel" {
!23 = !{ptr @pos_empty_8, !"kernel", i32 1}
!24 = !{ptr @caller_barrier1, !"kernel", i32 1}
!25 = !{ptr @caller_barrier2, !"kernel", i32 1}
+!26 = !{ptr @loop_barrier, !"kernel", i32 1}
!6 = !{ptr @neg_empty_8, !"kernel", i32 1}
!19 = !{ptr @neg_empty_9, !"kernel", i32 1}
!20 = !{ptr @pos_empty_10, !"kernel", i32 1}
@@ -1128,4 +1162,5 @@ define void @caller_barrier2() "kernel" {
; CHECK: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1}
; CHECK: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1}
; CHECK: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1}
+; CHECK: [[META25:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1}
;.
>From 131c2ff99573e20738797b5c5b5405654c08e619 Mon Sep 17 00:00:00 2001
From: Daniel Woodworth <daniel.woodworth at intel.com>
Date: Mon, 11 Sep 2023 14:51:59 -0400
Subject: [PATCH 2/3] amend! [OpenMPOpt] Fix incorrect end-of-kernel barrier
removal
[OpenMPOpt] Fix incorrect end-of-kernel barrier removal
Barrier removal in OpenMPOpt normally removes barriers by proving that
they are redundant with barriers preceding them. However, it can't do
this with the "pseudo-barrier" at the end of kernels because that can't
be removed. Instead, it removes the barriers preceding the end of the
kernel which that end-of-kernel barrier is redundant with. However,
these barriers aren't always redundant with the end-of-kernel barrier
when loops are involved, and removing them can lead to incorrect results
in compiled code.
This change fixes this by requiring that these pre-end-of-kernel
barriers also have the kernel end as a unique successor before removing
them.
---
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 24 ++-
.../test/Transforms/OpenMP/barrier_removal.ll | 151 +++++++++++++++++-
2 files changed, 163 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 05697341eab64fa..8a8d27d9be38b8d 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2621,6 +2621,17 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
}
};
+/// Determines if \p BB exits the function unconditionally itself or reaches a
+/// block that does through only unique successors.
+static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
+ if (succ_empty(BB))
+ return true;
+ const BasicBlock *const Successor = BB->getUniqueSuccessor();
+ if (!Successor)
+ return false;
+ return hasFunctionEndAsUniqueSuccessor(Successor);
+}
+
struct AAExecutionDomainFunction : public AAExecutionDomain {
AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
: AAExecutionDomain(IRP, A) {}
@@ -2678,11 +2689,11 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
// We can remove this barrier, if it is one, or aligned barriers reaching
// the kernel end (if CB is nullptr). Aligned barriers reaching the kernel
- // end may have other successors besides the kernel end (especially if
- // they're in loops) with non-local side-effects, so those barriers can
- // only be removed if they also only reach the kernel end. If those
- // barriers have other barriers reaching them, those can be transitively
- // removed as well.
+ // end should only be removed if the kernel end is their unique successor;
+ // otherwise, they may have side-effects that aren't accounted for in the
+ // kernel end in their other successors. If those barriers have other
+ // barriers reaching them, those can be transitively removed as well as
+ // long as the kernel end is also their unique successor.
if (CB) {
DeletedBarriers.insert(CB);
A.deleteAfterManifest(*CB);
@@ -2699,8 +2710,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
continue;
if (LastCB->getFunction() != getAnchorScope())
continue;
- const ExecutionDomainTy &PostLastED = CEDMap[{LastCB, POST}];
- if (!PostLastED.IsReachingAlignedBarrierOnly)
+ if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))
continue;
if (!DeletedBarriers.count(LastCB)) {
++NumBarriersEliminated;
diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index 4a7491e9404c0fa..8aca820afdfa883 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -736,16 +736,13 @@ define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p)
; MODULE-NEXT: call void @barrier_then_write0(ptr [[P]])
; MODULE-NEXT: br label [[T0B3:%.*]]
; MODULE: t0b3:
-; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: br label [[M3:%.*]]
; MODULE: f03:
; MODULE-NEXT: call void @barrier_then_write0(ptr [[P]])
; MODULE-NEXT: br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
; MODULE: t13:
-; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: br label [[M3]]
; MODULE: f13:
-; MODULE-NEXT: call void @aligned_barrier()
; MODULE-NEXT: br label [[M3]]
; MODULE: m3:
; MODULE-NEXT: call void @write_then_barrier0(ptr [[P]])
@@ -1098,8 +1095,144 @@ exit:
ret void
}
+define void @loop_barrier_end_barriers() "kernel" {
+; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @aligned_barrier()
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I_NEXT]], 128
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ call void @unknown()
+ call void @aligned_barrier()
+ %i.next = add nuw nsw i32 %i, 1
+ %cond = icmp ne i32 %i.next, 128
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ ret void
+}
+
+define void @loop_barrier_end_barriers_unknown() "kernel" {
+; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_unknown
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @aligned_barrier()
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I_NEXT]], 128
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ call void @unknown()
+ call void @aligned_barrier()
+ %i.next = add nuw nsw i32 %i, 1
+ %cond = icmp ne i32 %i.next, 128
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ call void @unknown()
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ ret void
+}
+
+define void @loop_barrier_store() "kernel" {
+; CHECK-LABEL: define {{[^@]+}}@loop_barrier_store
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: store i32 [[I]], ptr @G1, align 4
+; CHECK-NEXT: call void @aligned_barrier()
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I_NEXT]], 128
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ store i32 %i, ptr @G1
+ call void @aligned_barrier()
+ %i.next = add nuw nsw i32 %i, 1
+ %cond = icmp ne i32 %i.next, 128
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+define void @loop_barrier_end_barriers_store() "kernel" {
+; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_store
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: store i32 [[I]], ptr @G1, align 4
+; CHECK-NEXT: call void @aligned_barrier()
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I_NEXT]], 128
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: store i32 [[I_NEXT]], ptr @G1, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ store i32 %i, ptr @G1
+ call void @aligned_barrier()
+ %i.next = add nuw nsw i32 %i, 1
+ %cond = icmp ne i32 %i.next, 128
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ store i32 %i.next, ptr @G1
+ call void @aligned_barrier()
+ call void @aligned_barrier()
+ ret void
+}
+
!llvm.module.flags = !{!16,!15}
-!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26}
+!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26,!27,!28,!29,!30}
!0 = !{ptr @pos_empty_1, !"kernel", i32 1}
!1 = !{ptr @pos_empty_2, !"kernel", i32 1}
@@ -1113,6 +1246,10 @@ exit:
!24 = !{ptr @caller_barrier1, !"kernel", i32 1}
!25 = !{ptr @caller_barrier2, !"kernel", i32 1}
!26 = !{ptr @loop_barrier, !"kernel", i32 1}
+!27 = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
+!28 = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1}
+!29 = !{ptr @loop_barrier_store, !"kernel", i32 1}
+!30 = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1}
!6 = !{ptr @neg_empty_8, !"kernel", i32 1}
!19 = !{ptr @neg_empty_9, !"kernel", i32 1}
!20 = !{ptr @pos_empty_10, !"kernel", i32 1}
@@ -1162,5 +1299,9 @@ exit:
; CHECK: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1}
; CHECK: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1}
; CHECK: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1}
-; CHECK: [[META25:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1}
+; CHECK: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1}
+; CHECK: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1}
+; CHECK: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1}
+; CHECK: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1}
+; CHECK: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1}
;.
>From dc73ca1620c516649634a814ecc91fe83601f60f Mon Sep 17 00:00:00 2001
From: Daniel Woodworth <daniel.woodworth at intel.com>
Date: Mon, 11 Sep 2023 15:56:26 -0400
Subject: [PATCH 3/3] amend! amend! [OpenMPOpt] Fix incorrect end-of-kernel
barrier removal
[OpenMPOpt] Fix incorrect end-of-kernel barrier removal
Barrier removal in OpenMPOpt normally removes barriers by proving that
they are redundant with barriers preceding them. However, it can't do
this with the "pseudo-barrier" at the end of kernels because that can't
be removed. Instead, it removes the barriers preceding the end of the
kernel which that end-of-kernel barrier is redundant with. However,
these barriers aren't always redundant with the end-of-kernel barrier
when loops are involved, and removing them can lead to incorrect results
in compiled code.
This change fixes this by requiring that these pre-end-of-kernel
barriers also have the kernel end as a unique successor before removing
them. It also changes the initialization of `ExitED` for kernels since
the kernel end is not an aligned barrier.
---
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 8a8d27d9be38b8d..d49c23336146b97 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2994,7 +2994,7 @@ bool AAExecutionDomainFunction::handleCallees(Attributor &A,
EntryBBED.IsExecutedByInitialThreadOnly = false;
EntryBBED.IsReachedFromAlignedBarrierOnly = true;
EntryBBED.EncounteredNonLocalSideEffect = false;
- ExitED.IsReachingAlignedBarrierOnly = true;
+ ExitED.IsReachingAlignedBarrierOnly = false;
} else {
EntryBBED.IsExecutedByInitialThreadOnly = false;
EntryBBED.IsReachedFromAlignedBarrierOnly = false;
More information about the llvm-commits
mailing list