[llvm] cecc0d2 - [NewPM] Add an SROA pass after loop unroll

Fri Nov 1 15:00:54 PDT 2019

Author: Guozhi Wei
Date: 2019-11-01T14:59:08-07:00
New Revision: cecc0d27ad58c0aed8ef9ed99bbf691e137a0f26

URL: https://github.com/llvm/llvm-project/commit/cecc0d27ad58c0aed8ef9ed99bbf691e137a0f26
DIFF: https://github.com/llvm/llvm-project/commit/cecc0d27ad58c0aed8ef9ed99bbf691e137a0f26.diff

LOG: [NewPM] Add an SROA pass after loop unroll

If a small local array is accessed in a loop with a variable offset, SROA
cannot handle those memory accesses. After the loop is fully unrolled, all
memory accesses to the array have fixed offsets, so they can be processed by
SROA. However, there are no more SROA passes after loop unrolling. This patch
adds an SROA pass after loop unrolling to handle this pattern.

Differential Revision: https://reviews.llvm.org/D68593

Added: 
    llvm/test/Other/unroll-sroa.ll

Modified: 
    llvm/lib/Passes/PassBuilder.cpp
    llvm/test/Other/new-pm-defaults.ll
    llvm/test/Other/new-pm-thinlto-defaults.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 5c4874de855c..15f7f3df136c 100644

--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -490,6 +490,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(createFunctionToLoopPassAdaptor(
       std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging));
 
+  // Delete small array after loop unroll.
+  FPM.addPass(SROA());
+
   // Eliminate redundancies.
   if (Level != O1) {
     // These passes add substantial compile time so skip them at O1.

diff  --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 638c783725bf..009f19e544c8 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -179,6 +179,7 @@
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
 ; CHECK-O-NEXT: Finished Loop pass manager run.
+; CHECK-O-NEXT: Running pass: SROA on foo
 ; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-Os-NEXT: Running pass: GVN
 ; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis

diff  --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
index 7b6855b130e6..a0b4df044450 100644
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -156,6 +156,7 @@
 ; CHECK-O-NEXT: Running pass: LoopDeletionPass
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-O-NEXT: Finished Loop pass manager run.
+; CHECK-O-NEXT: Running pass: SROA on foo
 ; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-Os-NEXT: Running pass: GVN
 ; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis

diff  --git a/llvm/test/Other/unroll-sroa.ll b/llvm/test/Other/unroll-sroa.ll
new file mode 100644
index 000000000000..e65756284ae6
--- /dev/null
+++ b/llvm/test/Other/unroll-sroa.ll
@@ -0,0 +1,61 @@
+; RUN: opt -disable-verify -passes='default<O2>' -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The local array %tmp can only be optimized away by sroa after loop unroll.
+
+; CHECK-LABEL: define void @foo
+; CHECK-NOT:   alloca
+; CHECK-NOT:   call void @llvm.memcpy.p0i8.p0i8.i64
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32* %a, i32* %b) {
+entry:
+  %a.addr = alloca i32*, align 8
+  %b.addr = alloca i32*, align 8
+  %tmp = alloca [4 x float], align 16
+  %i = alloca i32, align 4
+  store i32* %a, i32** %a.addr, align 8
+  store i32* %b, i32** %b.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %iter2 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %iter2, 4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %inptr = load i32*, i32** %a.addr, align 8
+  %idx2 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %idx2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %inptr, i64 %idxprom
+  %val = load i32, i32* %arrayidx, align 4
+  %conv = sitofp i32 %val to float
+  %idx = load i32, i32* %i, align 4
+  %idxprom1 = sext i32 %idx to i64
+  %arrayidx2 = getelementptr inbounds [4 x float], [4 x float]* %tmp, i64 0, i64 %idxprom1
+  store float %conv, float* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %iter = load i32, i32* %i, align 4
+  %inc = add nsw i32 %iter, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  %dstptr = load i32*, i32** %b.addr, align 8
+  %dst = bitcast i32* %dstptr to i8*
+  %arraydecay = getelementptr inbounds [4 x float], [4 x float]* %tmp, i64 0, i64 0
+  %src = bitcast float* %arraydecay to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %dst, i8* align 16 %src, i64 16, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg)