[llvm] 3d31ada - [DSE] Improve partial overlap detection
Evgeniy Brevnov via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 30 08:23:35 PDT 2020
Author: Evgeniy Brevnov
Date: 2020-10-30T22:23:20+07:00
New Revision: 3d31adaec443daee75c62823082fa2912bbd267e
URL: https://github.com/llvm/llvm-project/commit/3d31adaec443daee75c62823082fa2912bbd267e
DIFF: https://github.com/llvm/llvm-project/commit/3d31adaec443daee75c62823082fa2912bbd267e.diff
LOG: [DSE] Improve partial overlap detection
Currently, isOverwrite returns OW_MaybePartial even for accesses known not to overlap. This is not a big problem for the legacy implementation, since isPartialOverwrite follows isOverwrite and clarifies the result. In contrast, the MemorySSA-based version does a lot of work only to find out later that the accesses don't overlap. Besides the negative impact on compile time, we quickly reach MemorySSAPartialStoreLimit and miss optimization opportunities.
Note: In fact, I think it would be a cleaner implementation if isOverwrite returned a fully clarified result in the first place, without the need to call isPartialOverwrite. This can be done as a follow-up. What do you think?
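To make the new classification concrete, here is a minimal standalone sketch of the interval logic this patch introduces in isOverwrite. The enum and helper below are illustrative only; the names mirror DSE's, but this is not the actual LLVM code:

  #include <cassert>
  #include <cstdint>

  enum OverwriteResult { OW_Complete, OW_MaybePartial, OW_Unknown };

  // Classify how the later access [LaterOff, LaterOff + LaterSize) relates
  // to the earlier access [EarlierOff, EarlierOff + EarlierSize).
  static OverwriteResult classifyOverlap(int64_t EarlierOff, uint64_t EarlierSize,
                                         int64_t LaterOff, uint64_t LaterSize) {
    if (EarlierOff >= LaterOff) {
      // Earlier access starts at or after the start of the later one.
      if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
        return OW_Complete;     // ...and also ends within it.
      if (uint64_t(EarlierOff - LaterOff) < LaterSize)
        return OW_MaybePartial; // Starts inside but runs past the end.
    } else if (uint64_t(LaterOff - EarlierOff) < EarlierSize) {
      return OW_MaybePartial;   // Later access starts inside the earlier one.
    }
    return OW_Unknown;          // Disjoint; no dedicated "no overlap" result.
  }

  int main() {
    assert(classifyOverlap(8, 4, 0, 16) == OW_Complete);      // earlier inside later
    assert(classifyOverlap(8, 16, 0, 12) == OW_MaybePartial); // runs past the end
    assert(classifyOverlap(0, 16, 8, 4) == OW_MaybePartial);  // later starts inside
    assert(classifyOverlap(16, 4, 0, 8) == OW_Unknown);       // disjoint
  }

Before this patch, the last (disjoint) case would have been reported as OW_MaybePartial, which is what forced the MemorySSA-based DSE to burn through MemorySSAPartialStoreLimit on accesses that could never be merged.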
Reviewed By: fhahn, asbirlea
Differential Revision: https://reviews.llvm.org/D90371
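A side note on the "careful here" comment in the patch: *Off is signed while *.Size is unsigned, so the casts are only safe under the guards. A tiny hypothetical illustration (not LLVM code):

  #include <cstdint>
  #include <cstdio>

  int main() {
    int64_t EarlierOff = -4, LaterOff = 8; // earlier starts before the later
    // Without the EarlierOff >= LaterOff guard, casting the difference would
    // wrap to a huge value instead of meaning "12 bytes before":
    uint64_t Wrapped = uint64_t(EarlierOff - LaterOff);
    printf("%llu\n", (unsigned long long)Wrapped); // prints 18446744073709551604
    // Each branch in the patch forms uint64_t(A - B) only after establishing
    // A >= B, so the cast always yields the true non-negative byte distance.
  }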
Added:
Modified:
llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index acdb1c4fa8c3..e578d15dfc50 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -501,28 +501,40 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
if (BP1 != BP2)
return OW_Unknown;
- // The later store completely overlaps the earlier store if:
- //
- // 1. Both start at the same offset and the later one's size is greater than
- // or equal to the earlier one's, or
- //
- // |--earlier--|
- // |-- later --|
- //
- // 2. The earlier store has an offset greater than the later offset, but which
- // still lies completely within the later store.
- //
- // |--earlier--|
- // |----- later ------|
+ // The later access completely overlaps the earlier store if and only if
+ // both the start and the end of the earlier one are "inside" the later one:
+ // |<->|--earlier--|<->|
+ // |-------later-------|
+ // Accesses may overlap if and only if the start of one of them is "inside"
+ // the other:
+ // |<->|--earlier--|<----->|
+ // |-------later-------|
+ // OR
+ // |----- earlier -----|
+ // |<->|---later---|<----->|
//
// We have to be careful here as *Off is signed while *.Size is unsigned.
- if (EarlierOff >= LaterOff &&
- LaterSize >= EarlierSize &&
- uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
- return OW_Complete;
- // Later may overwrite earlier completely with other partial writes.
- return OW_MaybePartial;
+ // Check if the earlier access starts "not before" the later one.
+ if (EarlierOff >= LaterOff) {
+ // If the earlier access ends "not after" the later access, then the earlier
+ // one is completely overwritten by the later one.
+ if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
+ return OW_Complete;
+ // If the start of the earlier access is "before" the end of the later
+ // access, then the accesses overlap.
+ else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize)
+ return OW_MaybePartial;
+ }
+ // If the start of the later access is "before" the end of the earlier
+ // access, then the accesses overlap.
+ else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) {
+ return OW_MaybePartial;
+ }
+
+ // We can reach here only if the accesses are known not to overlap. There is
+ // no dedicated return code to indicate no overlap, so signal "unknown".
+ return OW_Unknown;
}
/// Return 'OW_Complete' if a store to the 'Later' location completely
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
index ec1b9a5ee514..ab957e0c3cf0 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
-; RUN: opt -S -dse -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
+; RUN: opt -S -dse -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux"
@@ -213,41 +212,21 @@ declare i32 @fa(i8*, i8**, i32, i8, i8*)
; We miss this case, because of an aggressive limit of partial overlap analysis.
; With a larger partial store limit, we remove the memset.
define void @test4() {
-; DEFAULT-LIMIT-LABEL: @test4(
-; DEFAULT-LIMIT-NEXT: entry:
-; DEFAULT-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; DEFAULT-LIMIT-NEXT: [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
-; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
-; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
-; DEFAULT-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; DEFAULT-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; DEFAULT-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; DEFAULT-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; DEFAULT-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; DEFAULT-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; DEFAULT-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; DEFAULT-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
-; DEFAULT-LIMIT-NEXT: ret void
-;
-; LARGER-LIMIT-LABEL: @test4(
-; LARGER-LIMIT-NEXT: entry:
-; LARGER-LIMIT-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; LARGER-LIMIT-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; LARGER-LIMIT-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; LARGER-LIMIT-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; LARGER-LIMIT-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; LARGER-LIMIT-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; LARGER-LIMIT-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; LARGER-LIMIT-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; LARGER-LIMIT-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
-; LARGER-LIMIT-NEXT: ret void
-;
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; CHECK-NEXT: [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; CHECK-NEXT: [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; CHECK-NEXT: [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; CHECK-NEXT: [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; CHECK-NEXT: store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; CHECK-NEXT: store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; CHECK-NEXT: call void @goFunc(%struct.foostruct* [[BANG]])
+; CHECK-NEXT: ret void
entry:
%bang = alloca %struct.foostruct, align 8
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
index 8a71c7397917..2ed717343a8a 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -dse %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
-; RUN: opt -dse -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
+; RUN: opt -dse %s -S | FileCheck --check-prefixes=CHECK %s
%struct.ham = type { [3 x double], [3 x double]}
@@ -11,52 +10,27 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
; We miss this case, because of an aggressive limit of partial overlap analysis.
; With a larger partial store limit, we remove the memset.
define void @overlap1(%struct.ham* %arg, i1 %cond) {
-; DEFAULT-LIMIT-LABEL: @overlap1(
-; DEFAULT-LIMIT-NEXT: bb:
-; DEFAULT-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; DEFAULT-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; DEFAULT-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; DEFAULT-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; DEFAULT-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; DEFAULT-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; DEFAULT-LIMIT-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
-; DEFAULT-LIMIT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
-; DEFAULT-LIMIT-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
-; DEFAULT-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; DEFAULT-LIMIT: bb7:
-; DEFAULT-LIMIT-NEXT: br label [[BB9:%.*]]
-; DEFAULT-LIMIT: bb8:
-; DEFAULT-LIMIT-NEXT: br label [[BB9]]
-; DEFAULT-LIMIT: bb9:
-; DEFAULT-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
-; DEFAULT-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
-; DEFAULT-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
-; DEFAULT-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
-; DEFAULT-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
-; DEFAULT-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
-; DEFAULT-LIMIT-NEXT: ret void
-;
-; LARGER-LIMIT-LABEL: @overlap1(
-; LARGER-LIMIT-NEXT: bb:
-; LARGER-LIMIT-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; LARGER-LIMIT-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; LARGER-LIMIT-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; LARGER-LIMIT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; LARGER-LIMIT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; LARGER-LIMIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; LARGER-LIMIT-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; LARGER-LIMIT: bb7:
-; LARGER-LIMIT-NEXT: br label [[BB9:%.*]]
-; LARGER-LIMIT: bb8:
-; LARGER-LIMIT-NEXT: br label [[BB9]]
-; LARGER-LIMIT: bb9:
-; LARGER-LIMIT-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
-; LARGER-LIMIT-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
-; LARGER-LIMIT-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
-; LARGER-LIMIT-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
-; LARGER-LIMIT-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
-; LARGER-LIMIT-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
-; LARGER-LIMIT-NEXT: ret void
+; CHECK-LABEL: @overlap1(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; CHECK: bb7:
+; CHECK-NEXT: br label [[BB9:%.*]]
+; CHECK: bb8:
+; CHECK-NEXT: br label [[BB9]]
+; CHECK: bb9:
+; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8
+; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8
+; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8
+; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8
+; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8
+; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8
+; CHECK-NEXT: ret void
;
bb:
%tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2