[llvm] 989051d - [DSE] Extending isOverwrite to support offsetted fully overlapping stores

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 10 12:09:42 PST 2021


Author: Matteo Favaro
Date: 2021-03-10T21:09:33+01:00
New Revision: 989051d5f899c22fd2b3b4bb039b2913be0a2a4a

URL: https://github.com/llvm/llvm-project/commit/989051d5f899c22fd2b3b4bb039b2913be0a2a4a
DIFF: https://github.com/llvm/llvm-project/commit/989051d5f899c22fd2b3b4bb039b2913be0a2a4a.diff

LOG: [DSE] Extending isOverwrite to support offsetted fully overlapping stores

The isOverwrite function is responsible for identifying whether two
stores fully overlap; ideally we would like to catch every instance
of OW_Complete, as each one yields a possibly killable store. The
current implementation cannot spot instances where the earlier store
is offset relative to the later store yet still fully covered by it.
The limitation lies in the computation of the base pointers with the
GetPointerBaseWithConstantOffset API, which often yields different
base pointers even when the stores are guaranteed to partially
overlap (e.g. when the alias analysis returns
AliasResult::PartialAlias).
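
In C++ terms, the previously missed pattern looks roughly like the
following (a hypothetical illustration, not code from this commit):

    // The later 8-byte store fully covers the earlier 4-byte store,
    // which starts 4 bytes past it, so the earlier store is dead.
    void sketch(char *Buf, long I) {
      *reinterpret_cast<int *>(Buf + I + 4) = 1;    // earlier, 4 bytes
      *reinterpret_cast<long long *>(Buf + I) = 0;  // later, 8 bytes
    }

Here the +4 typically ends up folded into the variable index (as in
the test below), so GetPointerBaseWithConstantOffset computes two
different base pointers even though the accesses provably overlap.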

The patch relies on the offsets computed and cached by BatchAAResults
(available after D93529) to determine whether the offset overlap is
in fact OW_Complete.
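
For intuition, the added containment test can be modeled in isolation
(a simplified standalone sketch, not the actual LLVM code path):

    #include <cstdint>

    // Off is the earlier store's start relative to the later store's
    // start, as obtained from BatchAAResults::getClobberOffset. The
    // earlier store covers [Off, Off + EarlierSize) and the later one
    // covers [0, LaterSize); containment means a complete overwrite.
    static bool completelyOverwritten(int64_t Off, uint64_t EarlierSize,
                                      uint64_t LaterSize) {
      return Off > 0 &&
             static_cast<uint64_t>(Off) + EarlierSize <= LaterSize;
    }

    // E.g. an 8-byte store at p and a 4-byte store at p + 4:
    // Off = 4, 4 + 4 <= 8, so the 4-byte store is completely dead.

The check is deliberately strict (Off > 0): a zero or negative offset
simply falls through to the existing conservative paths.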

Differential Revision: https://reviews.llvm.org/D97676

Added: 
    llvm/test/Transforms/DeadStoreElimination/offsetted-overlapping-stores.ll

Modified: 
    llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 9cf9760c074e..4b5b705442d1 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -390,20 +390,29 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
   const uint64_t LaterSize = Later.Size.getValue();
   const uint64_t EarlierSize = Earlier.Size.getValue();
 
-  const Value *P1 = Earlier.Ptr->stripPointerCasts();
-  const Value *P2 = Later.Ptr->stripPointerCasts();
+  // Query the alias information
+  AliasResult AAR = AA.alias(Later, Earlier);
 
   // If the start pointers are the same, we just have to compare sizes to see if
   // the later store was larger than the earlier store.
-  if (P1 == P2 || AA.isMustAlias(P1, P2)) {
+  if (AAR == AliasResult::MustAlias) {
     // Make sure that the Later size is >= the Earlier size.
     if (LaterSize >= EarlierSize)
       return OW_Complete;
   }
 
+  // If we hit a partial alias we may have a full overwrite
+  if (AAR == AliasResult::PartialAlias) {
+    int64_t Off = AA.getClobberOffset(Later, Earlier).getValueOr(0);
+    if (Off > 0 && (uint64_t)Off + EarlierSize <= LaterSize)
+      return OW_Complete;
+  }
+
   // Check to see if the later store is to the entire object (either a global,
   // an alloca, or a byval/inalloca argument).  If so, then it clearly
   // overwrites any other store to the same object.
+  const Value *P1 = Earlier.Ptr->stripPointerCasts();
+  const Value *P2 = Later.Ptr->stripPointerCasts();
   const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2);
 
   // If we can't resolve the same pointers to the same object, then we can't
@@ -987,8 +996,8 @@ struct DSEState {
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
-      : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
-        DL(F.getParent()->getDataLayout()) {}
+      : F(F), AA(AA), BatchAA(AA, /*CacheOffsets =*/true), MSSA(MSSA), DT(DT),
+        PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()) {}
 
   static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                       DominatorTree &DT, PostDominatorTree &PDT,

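For context, the constructor change above enables the offset cache
that D93529 added to BatchAAResults. A client-side sketch of the
query pattern (an illustration assuming that API, not code from this
commit):

    BatchAAResults BatchAA(AA, /*CacheOffsets=*/true);
    AliasResult AAR = BatchAA.alias(LaterLoc, EarlierLoc);
    if (AAR == AliasResult::PartialAlias) {
      // Offset of EarlierLoc's start relative to LaterLoc's start,
      // when the underlying BasicAA query could compute one.
      int64_t Off = BatchAA.getClobberOffset(LaterLoc, EarlierLoc)
                        .getValueOr(0);
      // ... feed Off into the containment check shown earlier.
    }
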
diff  --git a/llvm/test/Transforms/DeadStoreElimination/offsetted-overlapping-stores.ll b/llvm/test/Transforms/DeadStoreElimination/offsetted-overlapping-stores.ll
new file mode 100644
index 000000000000..bb53f87e340e
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/offsetted-overlapping-stores.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -dse -S | FileCheck %s
+
+ at BUFFER = external local_unnamed_addr global [0 x i8], align 1
+
+define void @ArrayTestFullyOverlapping(i64 %0) {
+;
+; The DSE pass will try to kill the store of size i32 using the store of
+; size i64 because they fully overlap, in fact:
+;
+; - they use the same base pointer (in SCEV style '@BUFFER + %0')
+; - the offset between the two stores is 32 bits
+; - the size of the earlier store is 32 bits
+; - the size of the later store is 64 bits
+;
+; CHECK-LABEL: @ArrayTestFullyOverlapping(
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP0:%.*]], -8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
+; CHECK-NEXT:    store i64 0, i64* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %2 = add i64 %0, -8
+  %3 = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 %2
+  %4 = bitcast i8* %3 to i64*
+  %5 = add i64 %0, -4
+  %6 = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 %5
+  %7 = bitcast i8* %6 to i32*
+  store i32 1, i32* %7
+  store i64 0, i64* %4
+  ret void
+}
+
+define void @VectorTestFullyOverlapping(float* %arg, i32 %i) {
+; CHECK-LABEL: @VectorTestFullyOverlapping(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I2:%.*]] = zext i32 [[I:%.*]] to i64
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds float, float* [[ARG:%.*]], i64 [[I2]]
+; CHECK-NEXT:    [[I4:%.*]] = bitcast float* [[I3]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> zeroinitializer, <2 x float>* [[I4]], align 16
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i7 = add nuw nsw i32 %i, 1
+  %i8 = zext i32 %i7 to i64
+  %i9 = getelementptr inbounds float, float* %arg, i64 %i8
+  store float 0.0, float* %i9, align 4
+  %i2 = zext i32 %i to i64
+  %i3 = getelementptr inbounds float, float* %arg, i64 %i2
+  %i4 = bitcast float* %i3 to <2 x float>*
+  store <2 x float> <float 0.0, float 0.0>, <2 x float>* %i4, align 16
+  ret void
+}
+
+define void @ArrayTestPartiallyOverlapping(i64 %0) {
+;
+; The DSE pass will not kill the store because the overlap is partial
+; and won't fully clobber the i32 store.
+;
+; CHECK-LABEL: @ArrayTestPartiallyOverlapping(
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP0:%.*]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP0]], 15
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
+; CHECK-NEXT:    store i32 1, i32* [[TMP7]], align 4
+; CHECK-NEXT:    store i64 0, i64* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %2 = add i64 %0, 10
+  %3 = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 %2
+  %4 = bitcast i8* %3 to i64*
+  %5 = add i64 %0, 15
+  %6 = getelementptr inbounds [0 x i8], [0 x i8]* @BUFFER, i64 0, i64 %5
+  %7 = bitcast i8* %6 to i32*
+  store i32 1, i32* %7
+  store i64 0, i64* %4
+  ret void
+}
+
+define void @VectorTestPartiallyOverlapping(float* %arg, i32 %i) {
+;
+; The DSE pass will not kill the store because the overlap is partial
+; and won't fully clobber the original store.
+;
+; CHECK-LABEL: @VectorTestPartiallyOverlapping(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I2:%.*]] = zext i32 [[I:%.*]] to i64
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds float, float* [[ARG:%.*]], i64 [[I2]]
+; CHECK-NEXT:    [[I4:%.*]] = bitcast float* [[I3]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* [[I4]], align 16
+; CHECK-NEXT:    [[I5:%.*]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I6:%.*]] = zext i32 [[I5]] to i64
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr inbounds float, float* [[ARG]], i64 [[I6]]
+; CHECK-NEXT:    [[I8:%.*]] = bitcast float* [[I7]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> zeroinitializer, <2 x float>* [[I8]], align 16
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i2 = zext i32 %i to i64
+  %i3 = getelementptr inbounds float, float* %arg, i64 %i2
+  %i4 = bitcast float* %i3 to <2 x float>*
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* %i4, align 16
+  %i5 = add nuw nsw i32 %i, 1
+  %i6 = zext i32 %i5 to i64
+  %i7 = getelementptr inbounds float, float* %arg, i64 %i6
+  %i8 = bitcast float* %i7 to <2 x float>*
+  store <2 x float> <float 0.0, float 0.0>, <2 x float>* %i8, align 16
+  ret void
+}
+