[llvm] 71a3168 - [PassManager] adjust VectorCombine placement

Wed Mar 4 08:14:46 PST 2020

Author: Sanjay Patel
Date: 2020-03-04T11:10:49-05:00
New Revision: 71a316883d503ba9020d78089d276e73a6113cef

URL: https://github.com/llvm/llvm-project/commit/71a316883d503ba9020d78089d276e73a6113cef
DIFF: https://github.com/llvm/llvm-project/commit/71a316883d503ba9020d78089d276e73a6113cef.diff

LOG: [PassManager] adjust VectorCombine placement

The initial placement of vector-combine in the opt pipeline revealed phase ordering bugs:
https://bugs.llvm.org/show_bug.cgi?id=45015
https://bugs.llvm.org/show_bug.cgi?id=42022

This patch contains a few independent changes:

1. Move the pass up in the pipeline, so it happens just after loop-vectorization.
   This is only to keep vectorization passes together in the pipeline at the moment.
   I don't have evidence of interaction between these yet.
2. Add an -early-cse pass after -vector-combine to clean up redundant ops. This was
   partly proposed as far back as rL219644 (which is why it's effectively being moved
   in the old PM code). This is important because the subsequent -instcombine doesn't
   work as well without EarlyCSE. With the CSE, -instcombine is able to squash
   shuffles together in 1 of the tests (because those are simple "select" shuffles).
3. Remove the -vector-combine pass that was running after SLP. We may want to do that
   eventually, but I don't have a test case to support it yet.

Differential Revision: https://reviews.llvm.org/D75145

Added: 
    

Modified: 
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
    llvm/test/Other/new-pm-defaults.ll
    llvm/test/Other/new-pm-thinlto-defaults.ll
    llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
    llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
    llvm/test/Other/opt-O2-pipeline.ll
    llvm/test/Other/opt-O3-pipeline.ll
    llvm/test/Other/opt-Os-pipeline.ll
    llvm/test/Transforms/PhaseOrdering/X86/addsub.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index e0ef37a9562b..a6a2a78f1c74 100644

--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -966,12 +966,15 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   OptimizePM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
+  // Enhance/cleanup vector code.
+  OptimizePM.addPass(VectorCombinePass());
+  OptimizePM.addPass(EarlyCSEPass());
+
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
   OptimizePM.addPass(LoopLoadEliminationPass());
 
   // Cleanup after the loop optimization passes.
-  OptimizePM.addPass(VectorCombinePass());
   OptimizePM.addPass(InstCombinePass());
 
   // Now that we've formed fast to execute loop structures, we do further
@@ -990,10 +993,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
                                      sinkCommonInsts(true)));
 
   // Optimize parallel scalar instruction chains into SIMD instructions.
-  if (PTO.SLPVectorization) {
+  if (PTO.SLPVectorization)
     OptimizePM.addPass(SLPVectorizerPass());
-    OptimizePM.addPass(VectorCombinePass());
-  }
 
   OptimizePM.addPass(InstCombinePass());
 

diff  --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index d321aa055a19..e332f0d83e1d 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -729,6 +729,8 @@ void PassManagerBuilder::populateModulePassManager(
   MPM.add(createLoopDistributePass());
 
   MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+  MPM.add(createVectorCombinePass());
+  MPM.add(createEarlyCSEPass());
 
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
@@ -739,7 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
   // on -O1 and no #pragma is found). Would be good to have these two passes
   // as function calls, so that we can only pass them when the vectorizer
   // changed the code.
-  MPM.add(createVectorCombinePass());
   addInstructionCombiningPass(MPM);
   if (OptLevel > 1 && ExtraVectorizerPasses) {
     // At higher optimization levels, try to clean up any runtime overlap and
@@ -748,7 +749,6 @@ void PassManagerBuilder::populateModulePassManager(
     // common computations, hoist loop-invariant aspects out of any outer loop,
     // and unswitch the runtime checks if possible. Once hoisted, we may have
     // dead (or speculatable) control flows or more combining opportunities.
-    MPM.add(createEarlyCSEPass());
     MPM.add(createCorrelatedValuePropagationPass());
     addInstructionCombiningPass(MPM);
     MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
@@ -766,7 +766,6 @@ void PassManagerBuilder::populateModulePassManager(
 
   if (SLPVectorize) {
     MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-    MPM.add(createVectorCombinePass());
     if (OptLevel > 1 && ExtraVectorizerPasses) {
       MPM.add(createEarlyCSEPass());
     }

diff  --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ed15fadee3f9..ca592ef4d21c 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -250,17 +250,15 @@
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
index a6faeccb30dd..8967af5aca5b 100644
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -220,17 +220,15 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-POSTLINK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-POSTLINK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-POSTLINK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 0c2e9328c11a..1f262b8869dc 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -188,17 +188,15 @@
 ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 29d379ab54ad..cceea7f75521 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -199,17 +199,15 @@
 ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

diff  --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
index 28137f646ace..ef822ae4b440 100644
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -225,6 +225,8 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
+; CHECK-NEXT:       Optimize scalar/vector ops
+; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@@ -232,7 +234,6 @@
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Loop Load Elimination
-; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
@@ -250,8 +251,6 @@
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       SLP Vectorizer
-; CHECK-NEXT:       Optimize scalar/vector ops
-; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
index 61b5b9c973b8..b4f0ae3cd27d 100644
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -230,6 +230,8 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
+; CHECK-NEXT:       Optimize scalar/vector ops
+; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@@ -237,7 +239,6 @@
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Loop Load Elimination
-; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
@@ -255,8 +256,6 @@
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       SLP Vectorizer
-; CHECK-NEXT:       Optimize scalar/vector ops
-; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
index 81f82d080c70..23f6bedaacf6 100644
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -212,6 +212,8 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
+; CHECK-NEXT:       Optimize scalar/vector ops
+; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@@ -219,7 +221,6 @@
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Loop Load Elimination
-; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
@@ -237,8 +238,6 @@
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       SLP Vectorizer
-; CHECK-NEXT:       Optimize scalar/vector ops
-; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
index df0eeef88f73..956e10a06d90 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -4,7 +4,7 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; TODO: Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
+; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
 ; That may require some coordination between VectorCombine, SLP, and other passes.
 ; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.
 
@@ -12,11 +12,7 @@ define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
 ; CHECK-LABEL: @PR45015(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
-; CHECK-NEXT:    [[T8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[ARG]], [[ARG1]]
-; CHECK-NEXT:    [[T12:%.*]] = shufflevector <4 x float> [[T8]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
-; CHECK-NEXT:    [[T16:%.*]] = shufflevector <4 x float> [[T12]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[T16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    ret <4 x float> [[T16]]
 ;
   %t = extractelement <4 x float> %arg, i32 0
@@ -45,13 +41,9 @@ define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
 define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1) {
 ; CHECK-LABEL: @add_aggregate(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A0]], [[B0]]
-; CHECK-NEXT:    [[RETVAL_0_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1]], [[B1]]
-; CHECK-NEXT:    [[RETVAL_1_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[RETVAL_0_1_INSERT]], 0
-; CHECK-NEXT:    [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[RETVAL_1_1_INSERT]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP1]], 0
+; CHECK-NEXT:    [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[TMP2]], 1
 ; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[FCA_1_INSERT]]
 ;
   %a00 = extractelement <2 x float> %a0, i32 0
@@ -81,18 +73,16 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
 ; CHECK-NEXT:    store float [[TMP2]], float* [[R0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[A0]], [[B0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
-; CHECK-NEXT:    store float [[TMP4]], float* [[R1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], float* [[R1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
-; CHECK-NEXT:    store float [[TMP6]], float* [[R2]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[A1]], [[B1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; CHECK-NEXT:    store float [[TMP5]], float* [[R2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
-; CHECK-NEXT:    store float [[TMP8]], float* [[R3]], align 4
+; CHECK-NEXT:    store float [[TMP6]], float* [[R3]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %a00 = extractelement <2 x float> %a0, i32 0