[llvm] 163bb6d - [Passes][VectorCombine] enable early run generally and try load folds

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 21 10:58:07 PST 2022


Author: Sanjay Patel
Date: 2022-11-21T13:57:55-05:00
New Revision: 163bb6d64e5f1220777c3ec2a8b58c0666a74d91

URL: https://github.com/llvm/llvm-project/commit/163bb6d64e5f1220777c3ec2a8b58c0666a74d91
DIFF: https://github.com/llvm/llvm-project/commit/163bb6d64e5f1220777c3ec2a8b58c0666a74d91.diff

LOG: [Passes][VectorCombine] enable early run generally and try load folds

An early run of VectorCombine was added with D102496 specifically to
deal with unnecessary vector ops produced by the C matrix extension.
This patch proposes trying those folds in general and adds a pair of
load folds to the menu.
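
As a rough illustration (not taken from this patch; the function and value
names below are made up), vectorizeLoadInsert looks for a scalar load that
feeds an insertelement into an otherwise-poison vector. When the pointer is
known to be sufficiently dereferenceable and aligned, the fold may rewrite
the scalar load as a full vector load plus a shuffle, cost model permitting:

  ; hypothetical input, for illustration only
  define <4 x float> @load_insert_example(ptr align 16 dereferenceable(16) %p) {
    %s = load float, ptr %p, align 16
    %v = insertelement <4 x float> poison, float %s, i64 0
    ret <4 x float> %v
  }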

The load transform partly solves a longstanding vectorization perf bug
(issue #17113) by letting GVN remove the now-redundant loads; see the
PhaseOrdering diffs.
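
A simplified sketch of that interaction (illustrative only; this is not the
IR from the issue): once the early run widens the scalar reload below into a
<4 x float> load of the same pointer, GVN can delete it as fully redundant
with the existing wide load.

  ; hypothetical input, for illustration only
  define <4 x float> @redundant_load_example(ptr align 16 dereferenceable(16) %p, <4 x float> %x) {
    %wide = load <4 x float>, ptr %p, align 16   ; wide load of %p
    %a = fadd <4 x float> %wide, %x
    %s = load float, ptr %p, align 4             ; scalar reload of %p
    %v = insertelement <4 x float> poison, float %s, i64 0
    %r = fadd <4 x float> %a, %v
    ret <4 x float> %r
  }

Scheduling the run just before MergedLoadStoreMotion and GVN (as in the
PassBuilderPipelines.cpp change below) is what exposes that redundancy in
the regular pipeline.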

The main reason for not enabling the extra pass generally in the initial
patch was compile-time cost. The cost of VectorCombine was significantly
(and surprisingly) reduced with 87debdadaf18:
https://llvm-compile-time-tracker.com/compare.php?from=ffe05b8f57d97bc4340f791cb386c8d00e0739f2&to=87debdadaf18f8a5c7e5d563889e10731dc3554d&stat=instructions:u

...so the extra run now costs very little - the total cost of the two runs
should be less than the cost of the single run before that
micro-optimization:
https://llvm-compile-time-tracker.com/compare.php?from=5e8c2026d10e8e2c93c038c776853bed0e7c8fc1&to=2c4b68eab5ae969811f422714e0eba44c5f7eefb&stat=instructions:u

It may be possible to reduce the cost slightly more with a few more
early exits like that, but any further gain is probably in the noise
based on timing experiments.

Differential Revision: https://reviews.llvm.org/D138353

Added: 
    

Modified: 
    llvm/lib/Passes/PassBuilderPipelines.cpp
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Other/new-pm-defaults.ll
    llvm/test/Other/new-pm-thinlto-defaults.ll
    llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
    llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
    llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
    llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
    llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 21836817405f..285645b8ba47 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -615,10 +615,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   // Delete small array after loop unroll.
   FPM.addPass(SROAPass());
 
-  // The matrix extension can introduce large vector operations early, which can
-  // benefit from running vector-combine early on.
-  if (EnableMatrix)
-    FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
+  // Try vectorization/scalarization transforms that are both improvements
+  // themselves and can allow further folds with GVN and InstCombine.
+  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
 
   // Eliminate redundancies.
   FPM.addPass(MergedLoadStoreMotionPass());

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7bd9ee90b9e8..7769aaf99a0f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1700,8 +1700,6 @@ bool VectorCombine::run() {
     Builder.SetInsertPoint(&I);
     if (!TryEarlyFoldsOnly) {
       if (isa<FixedVectorType>(I.getType())) {
-        MadeChange |= vectorizeLoadInsert(I);
-        MadeChange |= widenSubvectorLoad(I);
         MadeChange |= foldInsExtFNeg(I);
         MadeChange |= foldBitcastShuf(I);
         MadeChange |= foldShuffleOfBinops(I);
@@ -1713,6 +1711,8 @@ bool VectorCombine::run() {
       }
     }
     if (isa<FixedVectorType>(I.getType())) {
+      MadeChange |= vectorizeLoadInsert(I);
+      MadeChange |= widenSubvectorLoad(I);
       MadeChange |= scalarizeBinopOrCmp(I);
       MadeChange |= scalarizeLoadExtract(I);
     }

diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 303e164d0cfa..ddb84d0ce6f2 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -185,7 +185,7 @@
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
 ; CHECK-O-NEXT: Running pass: SROAPass on foo
-; CHECK-MATRIX: Running pass: VectorCombinePass
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
index 42d553c19eb5..997f7072611e 100644
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -158,6 +158,7 @@
 ; CHECK-O-NEXT: Running pass: LoopDeletionPass
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index e67b23d18fbd..51a0b45c7de7 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -119,6 +119,7 @@
 ; CHECK-O-NEXT: Running pass: LoopDeletionPass
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 01204ecc065d..0390d6492b89 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -128,6 +128,7 @@
 ; CHECK-O-NEXT: Running pass: LoopDeletionPass
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index ccd7345a8f50..b47d19a7b6ca 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -157,6 +157,7 @@
 ; CHECK-O-NEXT: Running pass: LoopDeletionPass
 ; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
 ; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index dcaf0ff96781..87d210944f64 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -122,6 +122,7 @@
 ; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
 ; CHECK-O-NEXT: Running pass: LoopDeletionPass
 ; CHECK-O-NEXT: Running pass: SROAPass on foo
+; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass
 ; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
 ; CHECK-O23SZ-NEXT: Running pass: GVNPass
 ; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 1ed72290f2df..d37638a65c05 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -12,20 +12,13 @@ $getAt = comdat any
 define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
 ; SSE-LABEL: @ConvertVectors_ByRef(
 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; SSE-NEXT:    ret <4 x float> [[TMP7]]
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT:    ret <4 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByRef(
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [4 x float], ptr [[TMP0]], i64 0, i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = load float, ptr [[TMP3]], align 8
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i64 2
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP4]], i64 3
-; AVX-NEXT:    ret <4 x float> [[TMP6]]
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = alloca ptr, align 8
   %3 = alloca <4 x float>, align 16


        

