[llvm] b5e208f - [DSE] Support looking through memory phis at end of function.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 30 05:28:23 PDT 2022


Author: Florian Hahn
Date: 2022-08-30T13:27:51+01:00
New Revision: b5e208fcbaa6c76a33a96e230e9df2121bed2a10

URL: https://github.com/llvm/llvm-project/commit/b5e208fcbaa6c76a33a96e230e9df2121bed2a10
DIFF: https://github.com/llvm/llvm-project/commit/b5e208fcbaa6c76a33a96e230e9df2121bed2a10.diff

LOG: [DSE] Support looking through memory phis at end of function.

Update isWriteAtEndOfFunction to look through MemoryPhis. The reason
MemoryPhis were skipped so far was the known AliasAnalysis issue with it
missing loop-carried dependences.

This problem is already addressed in other parts of the code by skipping
MemoryDefs that may be in difference loops. I think the same logic can
be applied here.

This can have a substantial impact on the number of stores removed in
some cases. For MultiSource/SPEC2006/SPEC2017 with -O3:

```
Metric: dse.NumFastStores

Program                                       dse.NumFastStores
                                              base              patch   diff
External/S...CINT2017rate/557.xz_r/557.xz_r     14.00             45.00 221.4%
External/S...te/538.imagick_r/538.imagick_r    439.00           1267.00 188.6%
MultiSourc...e/Applications/SIBsim4/SIBsim4      6.00             15.00 150.0%
MultiSourc...Prolangs-C/simulator/simulator      3.00              7.00 133.3%
MultiSource/Applications/siod/siod               3.00              7.00 133.3%
MultiSourc...arks/FreeBench/distray/distray      6.00              9.00  50.0%
MultiSourc...e/Applications/obsequi/Obsequi     22.00             30.00  36.4%
MultiSource/Benchmarks/Ptrdist/bc/bc            23.00             28.00  21.7%
External/S...NT2017rate/502.gcc_r/502.gcc_r   1258.00           1512.00  20.2%
External/S...te/520.omnetpp_r/520.omnetpp_r    954.00           1143.00  19.8%
External/S...rate/510.parest_r/510.parest_r   5961.00           7122.00  19.5%
External/S...C/CINT2006/445.gobmk/445.gobmk     47.00             56.00  19.1%
External/S...00.perlbench_r/500.perlbench_r    241.00            286.00  18.7%
External/S...NT2006/471.omnetpp/471.omnetpp     36.00             42.00  16.7%
External/S...06/400.perlbench/400.perlbench    183.00            210.00  14.8%
MultiSource/Applications/SPASS/SPASS            72.00             81.00  12.5%
External/S...17rate/541.leela_r/541.leela_r     72.00             80.00  11.1%
External/SPEC/CINT2006/403.gcc/403.gcc         585.00            642.00   9.7%
MultiSourc...e/Applications/sqlite3/sqlite3    120.00            131.00   9.2%
MultiSourc...Applications/hexxagon/hexxagon     11.00             12.00   9.1%
External/S.../CFP2006/453.povray/453.povray    566.00            615.00   8.7%
External/S...rate/511.povray_r/511.povray_r    578.00            627.00   8.5%
External/S...FP2006/482.sphinx3/482.sphinx3     12.00             13.00   8.3%
MultiSource/Applications/oggenc/oggenc         130.00            140.00   7.7%
MultiSourc...e/Applications/ClamAV/clamscan    250.00            268.00   7.2%
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg     19.00             20.00   5.3%
MultiSourc...ch/consumer-jpeg/consumer-jpeg     19.00             20.00   5.3%
External/S...te/526.blender_r/526.blender_r   3747.00           3928.00   4.8%
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE    104.00            108.00   3.8%
MultiSourc...ch/consumer-lame/consumer-lame     54.00             56.00   3.7%
MultiSource/Benchmarks/Bullet/bullet          1222.00           1264.00   3.4%
MultiSourc...nchmarks/tramp3d-v4/tramp3d-v4    973.00           1005.00   3.3%
External/S.../CFP2006/447.dealII/447.dealII   2699.00           2780.00   3.0%
External/S...06/483.xalancbmk/483.xalancbmk    788.00            810.00   2.8%
External/S.../CFP2006/450.soplex/450.soplex    180.00            185.00   2.8%
MultiSourc.../DOE-ProxyApps-C++/CLAMR/CLAMR    338.00            345.00   2.1%
MultiSourc...Benchmarks/7zip/7zip-benchmark    685.00            699.00   2.0%
External/S...FP2017rate/544.nab_r/544.nab_r    158.00            160.00   1.3%
MultiSourc...sumer-typeset/consumer-typeset    772.00            781.00   1.2%
External/S...2017rate/525.x264_r/525.x264_r    410.00            414.00   1.0%
External/S...23.xalancbmk_r/523.xalancbmk_r    998.00           1002.00   0.4%
```

Compile-time is almost neutral:

https://llvm-compile-time-tracker.com/compare.php?from=b3125ad3d60531a97eea20009cc9629a87755862&to=84007eee59004f43464eda7f5ba8263ed5158df8&stat=instructions

NewPM-O3: +0.03%
NewPM-ReleaseThinLTO: -0.01%
NewPM-ReleaseLTO-g: +0.03%

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D132365

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
    llvm/test/Transforms/DeadStoreElimination/multiblock-memintrinsics.ll
    llvm/test/Transforms/DeadStoreElimination/phi-translation.ll
    llvm/test/Transforms/MemCpyOpt/memcpy.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index c1e3e6031ec98..45c404a140af3 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1075,13 +1075,16 @@ struct DSEState {
       }
 
       MemoryAccess *UseAccess = WorkList[I];
-      // Simply adding the users of MemoryPhi to the worklist is not enough,
-      // because we might miss read clobbers in 
diff erent iterations of a loop,
-      // for example.
-      // TODO: Add support for phi translation to handle the loop case.
-      if (isa<MemoryPhi>(UseAccess))
-        return false;
+      if (isa<MemoryPhi>(UseAccess)) {
+        // AliasAnalysis does not account for loops. Limit elimination to
+        // candidates for which we can guarantee they always store to the same
+        // memory location.
+        if (!isGuaranteedLoopInvariant(MaybeLoc->Ptr))
+          return false;
 
+        PushMemUses(cast<MemoryPhi>(UseAccess));
+        continue;
+      }
       // TODO: Checking for aliasing is expensive. Consider reducing the amount
       // of times this is called and/or caching it.
       Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();

diff  --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-memintrinsics.ll
index 8615cf9e46d98..d0d13228ac5a2 100644
--- a/llvm/test/Transforms/DeadStoreElimination/multiblock-memintrinsics.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-memintrinsics.ll
@@ -138,8 +138,6 @@ define void @alloca_1(i1 %c) {
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void
@@ -177,12 +175,8 @@ define void @alloca_2(i1 %c) {
 ; CHECK-NEXT:    call void @readonly_use(i32* [[P]])
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
-; CHECK-NEXT:    store i32 1, i32* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/DeadStoreElimination/phi-translation.ll b/llvm/test/Transforms/DeadStoreElimination/phi-translation.ll
index 9f764bdb0d0b1..158e7658aa4b4 100644
--- a/llvm/test/Transforms/DeadStoreElimination/phi-translation.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/phi-translation.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -dse -S %s | FileCheck %s
 
-; TODO: Both the stores in %then and %else can be eliminated by translating %p
+; Both the stores in %then and %else can be eliminated by translating %p
 ; through the phi.
 define void @memoryphi_translate_1(i1 %c) {
 ; CHECK-LABEL: @memoryphi_translate_1(
@@ -10,10 +10,8 @@ define void @memoryphi_translate_1(i1 %c) {
 ; CHECK-NEXT:    [[A_2:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    store i8 0, i8* [[A_1]], align 1
 ; CHECK-NEXT:    br label [[END:%.*]]
 ; CHECK:       else:
-; CHECK-NEXT:    store i8 9, i8* [[A_2]], align 1
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[P:%.*]] = phi i8* [ [[A_1]], [[THEN]] ], [ [[A_2]], [[ELSE]] ]
@@ -39,7 +37,7 @@ end:
   ret void
 }
 
-; TODO: The store in %else can be eliminated by translating %p through the phi.
+; The store in %else can be eliminated by translating %p through the phi.
 ; The store in %then cannot be eliminated, because %a.1 is read before the final
 ; store.
 define i8 @memoryphi_translate_2(i1 %c) {
@@ -52,7 +50,6 @@ define i8 @memoryphi_translate_2(i1 %c) {
 ; CHECK-NEXT:    store i8 0, i8* [[A_1]], align 1
 ; CHECK-NEXT:    br label [[END:%.*]]
 ; CHECK:       else:
-; CHECK-NEXT:    store i8 9, i8* [[A_2]], align 1
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[P:%.*]] = phi i8* [ [[A_1]], [[THEN]] ], [ [[A_2]], [[ELSE]] ]
@@ -80,7 +77,7 @@ end:
   ret i8 %l
 }
 
-; TODO: The store in %then can be eliminated by translating %p through the phi.
+; The store in %then can be eliminated by translating %p through the phi.
 ; The store in %else cannot be eliminated, because %a.2 is read before the final
 ; store.
 define i8 @memoryphi_translate_3(i1 %c) {
@@ -90,7 +87,6 @@ define i8 @memoryphi_translate_3(i1 %c) {
 ; CHECK-NEXT:    [[A_2:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    store i8 0, i8* [[A_1]], align 1
 ; CHECK-NEXT:    br label [[END:%.*]]
 ; CHECK:       else:
 ; CHECK-NEXT:    store i8 9, i8* [[A_2]], align 1
@@ -166,11 +162,9 @@ define void @memoryphi_translate_5(i1 %cond) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    [[B:%.*]] = alloca i8, align 1
-; CHECK-NEXT:    [[C:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    store i8 0, i8* [[A]], align 1
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
-; CHECK-NEXT:    store i8 0, i8* [[C]], align 1
 ; CHECK-NEXT:    br label [[COND_END]]
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    [[P:%.*]] = phi i8* [ [[B]], [[COND_TRUE]] ], [ [[A]], [[ENTRY:%.*]] ]

diff  --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 9b97e36078e1f..8ba7772b31782 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -225,12 +225,9 @@ define i8 @test4_read_between(i8 *%P) {
 
 define void @test4_non_local(i8 *%P, i1 %c) {
 ; CHECK-LABEL: @test4_non_local(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT:    [[A2:%.*]] = bitcast %1* [[A1]] to i8*
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A2]], i8* align 4 [[P:%.*]], i64 8, i1 false)
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[CALL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       call:
-; CHECK-NEXT:    call void @test4a(i8* byval(i8) align 1 [[P]])
+; CHECK-NEXT:    call void @test4a(i8* byval(i8) align 1 [[P:%.*]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void


        


More information about the llvm-commits mailing list