[llvm] 3d31ada - [DSE] Improve partial overlap detection
Evgeniy Brevnov via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 30 08:23:35 PDT 2020
Author: Evgeniy Brevnov
Date: 2020-10-30T22:23:20+07:00
New Revision: 3d31adaec443daee75c62823082fa2912bbd267e
URL: https://github.com/llvm/llvm-project/commit/3d31adaec443daee75c62823082fa2912bbd267e
DIFF: https://github.com/llvm/llvm-project/commit/3d31adaec443daee75c62823082fa2912bbd267e.diff
LOG: [DSE] Improve partial overlap detection
Currently, isOverwrite returns OW_MaybePartial even for accesses known not to overlap. This is not a big problem for the legacy implementation, since isPartialOverwrite follows isOverwrite and clarifies the result. In contrast, the MemorySSA-based version does a lot of work only to find out later that the accesses don't overlap. Besides the negative impact on compile time, we quickly reach MemorySSAPartialStoreLimit and miss optimization opportunities.
Note: In fact, I think it would be a cleaner implementation if isOverwrite returned a fully clarified result in the first place, without the need to call isPartialOverwrite. This can be done as a follow-up. What do you think?
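To make the new classification concrete, here is a minimal standalone sketch of the interval logic this patch introduces in isOverwrite. The enum and helper below are illustrative only; the names mirror DSE's, but this is not the actual LLVM code:

  #include <cassert>
  #include <cstdint>

  enum OverwriteResult { OW_Complete, OW_MaybePartial, OW_Unknown };

  // Classify how the later access [LaterOff, LaterOff + LaterSize) relates
  // to the earlier access [EarlierOff, EarlierOff + EarlierSize).
  static OverwriteResult classifyOverlap(int64_t EarlierOff, uint64_t EarlierSize,
                                         int64_t LaterOff, uint64_t LaterSize) {
    if (EarlierOff >= LaterOff) {
      // Earlier access starts at or after the start of the later one.
      if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
        return OW_Complete;     // ...and also ends within it.
      if (uint64_t(EarlierOff - LaterOff) < LaterSize)
        return OW_MaybePartial; // Starts inside but runs past the end.
    } else if (uint64_t(LaterOff - EarlierOff) < EarlierSize) {
      return OW_MaybePartial;   // Later access starts inside the earlier one.
    }
    return OW_Unknown;          // Disjoint; no dedicated "no overlap" result.
  }

  int main() {
    assert(classifyOverlap(8, 4, 0, 16) == OW_Complete);      // earlier inside later
    assert(classifyOverlap(8, 16, 0, 12) == OW_MaybePartial); // runs past the end
    assert(classifyOverlap(0, 16, 8, 4) == OW_MaybePartial);  // later starts inside
    assert(classifyOverlap(16, 4, 0, 8) == OW_Unknown);       // disjoint
  }

Before this patch, the last (disjoint) case would have been reported as OW_MaybePartial, which is what forced the MemorySSA-based DSE to burn through MemorySSAPartialStoreLimit on accesses that could never be merged.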
Reviewed By: fhahn, asbirlea
Differential Revision: https://reviews.llvm.org/D90371
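A side note on the "careful here" comment in the patch: *Off is signed while *.Size is unsigned, so the casts are only safe under the guards. A tiny hypothetical illustration (not LLVM code):

  #include <cstdint>
  #include <cstdio>

  int main() {
    int64_t EarlierOff = -4, LaterOff = 8; // earlier starts before the later
    // Without the EarlierOff >= LaterOff guard, casting the difference would
    // wrap to a huge value instead of meaning "12 bytes before":
    uint64_t Wrapped = uint64_t(EarlierOff - LaterOff);
    printf("%llu\n", (unsigned long long)Wrapped); // prints 18446744073709551604
    // Each branch in the patch forms uint64_t(A - B) only after establishing
    // A >= B, so the cast always yields the true non-negative byte distance.
  }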
Added:
Modified:
llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index acdb1c4fa8c3..e578d15dfc50 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -501,28 +501,40 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
if (BP1 != BP2)
return OW_Unknown;
- // The later store completely overlaps the earlier store if:
- //
- // 1. Both start at the same offset and the later one's size is greater than
- // or equal to the earlier one's, or
- //
- // |--earlier--|
- // |-- later --|
- //
- // 2. The earlier store has an offset greater than the later offset, but which
- // still lies completely within the later store.
- //
- // |--earlier--|
- // |----- later ------|
+ // The later access completely overlaps the earlier store if and only if
+ // both the start and the end of the earlier one are "inside" the later one:
+ // |<->|--earlier--|<->|
+ // |-------later-------|
+ // Accesses may overlap if and only if the start of one of them is "inside"
+ // the other:
+ // |<->|--earlier--|<----->|
+ // |-------later-------|
+ // OR
+ // |----- earlier -----|
+ // |<->|---later---|<----->|
//
// We have to be careful here as *Off is signed while *.Size is unsigned.
- if (EarlierOff >= LaterOff &&
- LaterSize >= EarlierSize &&
- uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
- return OW_Complete;
- // Later may overwrite earlier completely with other partial writes.
- return OW_MaybePartial;
+ // Check if the earlier access starts "not before" the later one.
+ if (EarlierOff >= LaterOff) {
+ // If the earlier access ends "not after" the later access, then the earlier
+ // one is completely overwritten by the later one.
+ if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
+ return OW_Complete;
+ // If the start of the earlier access is "before" the end of the later
+ // access, then the accesses overlap.
+ else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize)
+ return OW_MaybePartial;
+ }
+ // If the start of the later access is "before" the end of the earlier
+ // access, then the accesses overlap.
+ else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) {
+ return OW_MaybePartial;
+ }
+
+ // We can reach here only if the accesses are known not to overlap. There is
+ // no dedicated return code to indicate no overlap, so signal "unknown".
+ return OW_Unknown;
}
/// Return 'OW_Complete' if a store to the 'Later' location completely
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
index ec1b9a5ee514..ab957e0c3cf0 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
+; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux"
@@ -213,41 +212,21 @@ declare i32 @fa(i8*, i8**, i32, i8, i8*)
; We miss this case, because of an aggressive limit of partial overlap analysis.
; With a larger partial store limit, we remove the memset.
define void @test4() {
-; DEFAULT-LIMIT-LABEL: @test4(
-; DEFAULT-LIMIT-NEXT: entry:
-; DEFAULT-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; DEFAULT-LIMIT-NEXT: [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
-; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
-; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
-; DEFAULT-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; DEFAULT-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; DEFAULT-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; DEFAULT-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; DEFAULT-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; DEFAULT-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; DEFAULT-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
-; DEFAULT-LIMIT-NEXT: ret void
-;
-; LARGER-LIMIT-LABEL: @test4(
-; LARGER-LIMIT-NEXT: entry:
-; LARGER-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; LARGER-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; LARGER-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; LARGER-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; LARGER-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; LARGER-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; LARGER-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; LARGER-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
-; LARGER-LIMIT-NEXT: ret void
-;
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; CHECK-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; CHECK-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; CHECK-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; CHECK-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; CHECK-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; CHECK-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
+; CHECK-NEXT: ret void
entry:
%bang = alloca %struct.foostruct, align 8
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
index 8a71c7397917..2ed717343a8a 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -dse %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
-; RUN: opt -dse -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
+; RUN: opt -dse %s -S | FileCheck --check-prefixes=CHECK %s
%struct.ham = type { [3 x double], [3 x double]}
@@ -11,52 +10,27 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
; We miss this case, because of an aggressive limit of partial overlap analysis.
; With a larger partial store limit, we remove the memset.
define void @overlap1(%struct.ham* %arg, i1 %cond) {
-; DEFAULT-LIMIT-LABEL: @overlap1(
-; DEFAULT-LIMIT-NEXT: bb:
-; DEFAULT-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; DEFAULT-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; DEFAULT-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; DEFAULT-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; DEFAULT-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; DEFAULT-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; DEFAULT-LIMIT-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
-; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
-; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
-; DEFAULT-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; DEFAULT-LIMIT: bb7:
-; DEFAULT-LIMIT-NEXT: br label [[BB9:%.*]]
-; DEFAULT-LIMIT: bb8:
-; DEFAULT-LIMIT-NEXT: br label [[BB9]]
-; DEFAULT-LIMIT: bb9:
-; DEFAULT-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
-; DEFAULT-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
-; DEFAULT-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
-; DEFAULT-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
-; DEFAULT-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
-; DEFAULT-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
-; DEFAULT-LIMIT-NEXT: ret void
-;
-; LARGER-LIMIT-LABEL: @overlap1(
-; LARGER-LIMIT-NEXT: bb:
-; LARGER-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; LARGER-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; LARGER-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; LARGER-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; LARGER-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; LARGER-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; LARGER-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; LARGER-LIMIT: bb7:
-; LARGER-LIMIT-NEXT: br label [[BB9:%.*]]
-; LARGER-LIMIT: bb8:
-; LARGER-LIMIT-NEXT: br label [[BB9]]
-; LARGER-LIMIT: bb9:
-; LARGER-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
-; LARGER-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
-; LARGER-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
-; LARGER-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
-; LARGER-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
-; LARGER-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
-; LARGER-LIMIT-NEXT: ret void
+; CHECK-LABEL: @overlap1(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; CHECK: bb7:
+; CHECK-NEXT: br label [[BB9:%.*]]
+; CHECK: bb8:
+; CHECK-NEXT: br label [[BB9]]
+; CHECK: bb9:
+; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
+; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
+; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
+; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
+; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
+; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
+; CHECK-NEXT: ret void
;
bb:
%tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2