[llvm] 6438ea4 - [VectorCombine] position pass after SLP in the optimization pipeline rather than before

Fri May 22 09:22:59 PDT 2020

Author: Sanjay Patel
Date: 2020-05-22T12:22:44-04:00
New Revision: 6438ea45e053378a3c461a879805174eaa864bdb

URL: https://github.com/llvm/llvm-project/commit/6438ea45e053378a3c461a879805174eaa864bdb
DIFF: https://github.com/llvm/llvm-project/commit/6438ea45e053378a3c461a879805174eaa864bdb.diff

LOG: [VectorCombine] position pass after SLP in the optimization pipeline rather than before

There are 2 known problem patterns shown in the test diffs here:
vector horizontal ops (an x86 specialization) and vector reductions.

SLP has greater ability to match and fold those than vector-combine,
so let SLP have first chance at that.

This is a quick fix while we continue to improve vector-combine and
possibly canonicalize to reduction intrinsics.

In the longer term, we should improve matching of these patterns
because if they were created in the "bad" forms shown here, then we
would miss optimizing them.

I'm not sure what is happening with alias analysis on the addsub test.
The old pass manager now shows an extra line for that, and we see an
improvement that comes from SLP vectorizing a store. I don't know
what's missing with the new pass manager to make that happen.
Strangely, I can't reproduce the behavior if I compile from C++ with
clang and invoke the new PM with "-fexperimental-new-pass-manager".

Differential Revision: https://reviews.llvm.org/D80236

Added: 
    

Modified: 
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
    llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
    llvm/test/Other/new-pm-defaults.ll
    llvm/test/Other/new-pm-thinlto-defaults.ll
    llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
    llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
    llvm/test/Other/opt-O2-pipeline.ll
    llvm/test/Other/opt-O3-pipeline.ll
    llvm/test/Other/opt-Os-pipeline.ll
    llvm/test/Other/opt-pipeline-vector-passes.ll
    llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
    llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 9b4cc1c7323a..d4d437a8d3e7 100644

--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -986,10 +986,6 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   OptimizePM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
-  // Enhance/cleanup vector code.
-  OptimizePM.addPass(VectorCombinePass());
-  OptimizePM.addPass(EarlyCSEPass());
-
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
   OptimizePM.addPass(LoopLoadEliminationPass());
@@ -1016,6 +1012,9 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   if (PTO.SLPVectorization)
     OptimizePM.addPass(SLPVectorizerPass());
 
+  // Enhance/cleanup vector code.
+  OptimizePM.addPass(VectorCombinePass());
+  OptimizePM.addPass(EarlyCSEPass());
   OptimizePM.addPass(InstCombinePass());
 
   // Unroll small loops to hide loop backedge latency and saturate any parallel

diff  --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index a0fa1fa58c75..6a2cc0212163 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -741,8 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
   MPM.add(createLoopDistributePass());
 
   MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-  MPM.add(createVectorCombinePass());
-  MPM.add(createEarlyCSEPass());
 
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
@@ -783,6 +781,10 @@ void PassManagerBuilder::populateModulePassManager(
     }
   }
 
+  // Enhance/cleanup vector code.
+  MPM.add(createVectorCombinePass());
+  MPM.add(createEarlyCSEPass());
+
   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createInstructionCombiningPass());
 

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
index 87f3435c23d4..8b1b404211d7 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -230,8 +230,6 @@
 ; GCN-O1-NEXT:       Optimization Remark Emitter
 ; GCN-O1-NEXT:       Inject TLI Mappings
 ; GCN-O1-NEXT:       Loop Vectorization
-; GCN-O1-NEXT:       Optimize scalar/vector ops
-; GCN-O1-NEXT:       Early CSE
 ; GCN-O1-NEXT:       Canonicalize natural loops
 ; GCN-O1-NEXT:       Scalar Evolution Analysis
 ; GCN-O1-NEXT:       Function Alias Analysis Results
@@ -247,6 +245,8 @@
 ; GCN-O1-NEXT:       Combine redundant instructions
 ; GCN-O1-NEXT:       Simplify the CFG
 ; GCN-O1-NEXT:       Dominator Tree Construction
+; GCN-O1-NEXT:       Optimize scalar/vector ops
+; GCN-O1-NEXT:       Early CSE
 ; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:       Function Alias Analysis Results
 ; GCN-O1-NEXT:       Natural Loop Information
@@ -571,8 +571,6 @@
 ; GCN-O2-NEXT:       Optimization Remark Emitter
 ; GCN-O2-NEXT:       Inject TLI Mappings
 ; GCN-O2-NEXT:       Loop Vectorization
-; GCN-O2-NEXT:       Optimize scalar/vector ops
-; GCN-O2-NEXT:       Early CSE
 ; GCN-O2-NEXT:       Canonicalize natural loops
 ; GCN-O2-NEXT:       Scalar Evolution Analysis
 ; GCN-O2-NEXT:       Function Alias Analysis Results
@@ -598,6 +596,9 @@
 ; GCN-O2-NEXT:       Optimization Remark Emitter
 ; GCN-O2-NEXT:       Inject TLI Mappings
 ; GCN-O2-NEXT:       SLP Vectorizer
+; GCN-O2-NEXT:       Optimize scalar/vector ops
+; GCN-O2-NEXT:       Early CSE
+; GCN-O2-NEXT:       Function Alias Analysis Results
 ; GCN-O2-NEXT:       Optimization Remark Emitter
 ; GCN-O2-NEXT:       Combine redundant instructions
 ; GCN-O2-NEXT:       Canonicalize natural loops
@@ -924,8 +925,6 @@
 ; GCN-O3-NEXT:       Optimization Remark Emitter
 ; GCN-O3-NEXT:       Inject TLI Mappings
 ; GCN-O3-NEXT:       Loop Vectorization
-; GCN-O3-NEXT:       Optimize scalar/vector ops
-; GCN-O3-NEXT:       Early CSE
 ; GCN-O3-NEXT:       Canonicalize natural loops
 ; GCN-O3-NEXT:       Scalar Evolution Analysis
 ; GCN-O3-NEXT:       Function Alias Analysis Results
@@ -951,6 +950,9 @@
 ; GCN-O3-NEXT:       Optimization Remark Emitter
 ; GCN-O3-NEXT:       Inject TLI Mappings
 ; GCN-O3-NEXT:       SLP Vectorizer
+; GCN-O3-NEXT:       Optimize scalar/vector ops
+; GCN-O3-NEXT:       Early CSE
+; GCN-O3-NEXT:       Function Alias Analysis Results
 ; GCN-O3-NEXT:       Optimization Remark Emitter
 ; GCN-O3-NEXT:       Combine redundant instructions
 ; GCN-O3-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ae3e1af86762..1c4ef4067afe 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -253,8 +253,6 @@
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
-; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@@ -262,6 +260,8 @@
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
index 6bbad8941ea1..6d278c9e08e1 100644
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -223,8 +223,6 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
-; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
-; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
@@ -232,6 +230,8 @@
 ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
+; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index c84a74d1ffac..44a4a3ad842f 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -191,8 +191,6 @@
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
-; CHECK-O-NEXT: Running pass: VectorCombinePass
-; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@@ -200,6 +198,8 @@
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index d4895bad34ef..4f3ddcaa1c6a 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -202,8 +202,6 @@
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
-; CHECK-O-NEXT: Running pass: VectorCombinePass
-; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@@ -211,6 +209,8 @@
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
index 6a331bbcd955..3edc2a6047dc 100644
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -227,8 +227,6 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
-; CHECK-NEXT:       Optimize scalar/vector ops
-; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@@ -254,6 +252,9 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
+; CHECK-NEXT:       Optimize scalar/vector ops
+; CHECK-NEXT:       Early CSE
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
index 2710e54c0065..ce44c2b7c262 100644
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -232,8 +232,6 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
-; CHECK-NEXT:       Optimize scalar/vector ops
-; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@@ -259,6 +257,9 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
+; CHECK-NEXT:       Optimize scalar/vector ops
+; CHECK-NEXT:       Early CSE
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
index 1015286f4334..fdc1906cf1b3 100644
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -213,8 +213,6 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
-; CHECK-NEXT:       Optimize scalar/vector ops
-; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@@ -240,6 +238,9 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
+; CHECK-NEXT:       Optimize scalar/vector ops
+; CHECK-NEXT:       Early CSE
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll
index 1a1ad56c88e7..ae4620950e44 100644
--- a/llvm/test/Other/opt-pipeline-vector-passes.ll
+++ b/llvm/test/Other/opt-pipeline-vector-passes.ll
@@ -12,15 +12,15 @@
 
 ; OLDPM_O1-LABEL:  Pass Arguments:
 ; OLDPM_O1:        Loop Vectorization
-; OLDPM_O1:        Optimize scalar/vector ops
 ; OLDPM_O1-NOT:    SLP Vectorizer
+; OLDPM_O1:        Optimize scalar/vector ops
 
 ; Everything runs at -O2.
 
 ; OLDPM_O2-LABEL:  Pass Arguments:
 ; OLDPM_O2:        Loop Vectorization
-; OLDPM_O2:        Optimize scalar/vector ops
 ; OLDPM_O2:        SLP Vectorizer
+; OLDPM_O2:        Optimize scalar/vector ops
 
 ; The loop vectorizer still runs at both -O1/-O2 even with the
 ; debug flag, but it only works on loops explicitly annotated
@@ -28,24 +28,24 @@
 
 ; OLDPM_O1_FORCE_OFF-LABEL:  Pass Arguments:
 ; OLDPM_O1_FORCE_OFF:        Loop Vectorization
-; OLDPM_O1_FORCE_OFF:        Optimize scalar/vector ops
 ; OLDPM_O1_FORCE_OFF-NOT:    SLP Vectorizer
+; OLDPM_O1_FORCE_OFF:        Optimize scalar/vector ops
 
 ; OLDPM_O2_FORCE_OFF-LABEL:  Pass Arguments:
 ; OLDPM_O2_FORCE_OFF:        Loop Vectorization
-; OLDPM_O2_FORCE_OFF:        Optimize scalar/vector ops
 ; OLDPM_O2_FORCE_OFF:        SLP Vectorizer
+; OLDPM_O2_FORCE_OFF:        Optimize scalar/vector ops
 
 ; There should be no difference with the new pass manager.
 ; This is tested more thoroughly in other test files.
 
 ; NEWPM_O1-LABEL:  Running pass: LoopVectorizePass
-; NEWPM_O1:        Running pass: VectorCombinePass
 ; NEWPM_O1-NOT:    Running pass: SLPVectorizerPass
+; NEWPM_O1:        Running pass: VectorCombinePass
 
 ; NEWPM_O2-LABEL:  Running pass: LoopVectorizePass
-; NEWPM_O2:        Running pass: VectorCombinePass
 ; NEWPM_O2:        Running pass: SLPVectorizerPass
+; NEWPM_O2:        Running pass: VectorCombinePass
 
 define void @f() {
   ret void

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
index 57acbeea4c4d..0e50efa68f61 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S                   | FileCheck %s
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -O3 -S                   | FileCheck %s --check-prefixes=CHECK,OLDPM
+; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,NEWPM
 
 target triple = "x86_64--"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -69,22 +69,30 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float>
 }
 
 define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
-; CHECK-LABEL: @add_aggregate_store(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
-; CHECK-NEXT:    store float [[TMP2]], float* [[R0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
-; CHECK-NEXT:    store float [[TMP3]], float* [[R1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT:    [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
-; CHECK-NEXT:    store float [[TMP5]], float* [[R2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
-; CHECK-NEXT:    [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
-; CHECK-NEXT:    store float [[TMP6]], float* [[R3]], align 4
-; CHECK-NEXT:    ret void
+; OLDPM-LABEL: @add_aggregate_store(
+; OLDPM-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
+; OLDPM-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; OLDPM-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; OLDPM-NEXT:    [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>*
+; OLDPM-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; OLDPM-NEXT:    ret void
+;
+; NEWPM-LABEL: @add_aggregate_store(
+; NEWPM-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
+; NEWPM-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NEWPM-NEXT:    [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
+; NEWPM-NEXT:    store float [[TMP2]], float* [[R0]], align 4
+; NEWPM-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NEWPM-NEXT:    [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
+; NEWPM-NEXT:    store float [[TMP3]], float* [[R1]], align 4
+; NEWPM-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; NEWPM-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NEWPM-NEXT:    [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
+; NEWPM-NEXT:    store float [[TMP5]], float* [[R2]], align 4
+; NEWPM-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NEWPM-NEXT:    [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
+; NEWPM-NEXT:    store float [[TMP6]], float* [[R3]], align 4
+; NEWPM-NEXT:    ret void
 ;
   %a00 = extractelement <2 x float> %a0, i32 0
   %b00 = extractelement <2 x float> %b0, i32 0

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
index bd86e64343ab..749ed967b842 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
@@ -9,18 +9,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: @hadd_reverse_v4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
-; CHECK-NEXT:    [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]]
-; CHECK-NEXT:    [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]]
-; CHECK-NEXT:    [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT:    ret <4 x float> [[VECINIT14]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -45,18 +37,11 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 
 define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: @reverse_hadd_v4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> <i32 undef, i32 4, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> <i32 6, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 66ec8157932c..304b38b09bce 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -5,19 +5,15 @@
 target triple = "x86_64--"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; FIXME: This should only need 2 'or' instructions.
-
 define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
 ; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
-; CHECK-NEXT:    ret i32 [[TMP7]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %z = and <4 x i32> %x, %y
   %z0 = extractelement <4 x i32> %z, i32 0