[llvm] r356130 - [ARM] Run ARMParallelDSP in the IRPasses phase

Sam Parker via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 14 03:57:40 PDT 2019


Author: sam_parker
Date: Thu Mar 14 03:57:40 2019
New Revision: 356130

URL: http://llvm.org/viewvc/llvm-project?rev=356130&view=rev
Log:
[ARM] Run ARMParallelDSP in the IRPasses phase

Run EarlyCSE before ARMParallelDSP, and run both during the backend IR
optimisation phase (addIRPasses) rather than in addPreISel.

Differential Revision: https://reviews.llvm.org/D59257
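
[Editorial sketch, not code from this patch: ARMParallelDSP, which appears as
"Transform loops to use DSP intrinsics" in the pass listing below, looks for
chains of sign-extended narrow multiply-accumulates, roughly of this shape
(hypothetical IR, names invented):

  %ld.a = load i16, i16* %addr.a, align 2
  %ld.b = load i16, i16* %addr.b, align 2
  %sext.a = sext i16 %ld.a to i32
  %sext.b = sext i16 %ld.b to i32
  %mul = mul nsw i32 %sext.a, %sext.b
  %acc.next = add nsw i32 %acc, %mul

Two such multiply-adds over adjacent i16 elements can then be combined into a
single call to i32 @llvm.arm.smlad(i32, i32, i32). Running EarlyCSE first
presumably cleans up duplicated loads and sign extensions so the pass can
recognise the pairs.]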

Added:
    llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
Modified:
    llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp
    llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
    llvm/trunk/test/CodeGen/ARM/loop-indexing.ll
    llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll
    llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
    llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
    llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll

Modified: llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp Thu Mar 14 03:57:40 2019
@@ -403,6 +403,12 @@ void ARMPassConfig::addIRPasses() {
 
   TargetPassConfig::addIRPasses();
 
+  // Run the parallel DSP pass and its helpers.
+  if (getOptLevel() == CodeGenOpt::Aggressive) {
+    addPass(createEarlyCSEPass());
+    addPass(createARMParallelDSPPass());
+  }
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());
@@ -415,9 +421,6 @@ void ARMPassConfig::addCodeGenPrepare()
 }
 
 bool ARMPassConfig::addPreISel() {
-  if (getOptLevel() != CodeGenOpt::None)
-    addPass(createARMParallelDSPPass());
-
   if ((TM->getOptLevel() != CodeGenOpt::None &&
        EnableGlobalMerge == cl::BOU_UNSET) ||
       EnableGlobalMerge == cl::BOU_TRUE) {
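
[The new pipeline position can be inspected with -debug-pass=Structure, which
is exactly what the added O3-pipeline.ll test below does; its RUN line is:

  llc -mtriple=arm -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s

Note the CodeGenOpt::Aggressive guard: the EarlyCSE/ParallelDSP pair now only
runs at -O3, whereas the old pre-ISel hook ran ParallelDSP at every level above
-O0. This is presumably why the RUN lines in loop-indexing.ll below gain -O3
and why 2010-04-15-DynAllocBug.ll switches to -O2.]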

Modified: llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll Thu Mar 14 03:57:40 2019
@@ -103,12 +103,6 @@ bb8:
   %35 = add i8 %33, 87
   %iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
   store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
-  ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
-  ; CHECK-NOT: [[REGISTER]],
-  ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
-  ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
-  ; CHECK-NOT: [[REGISTER]],
-  ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
   %36 = udiv i32 %2, 100000000
   %37 = urem i32 %36, 10
   %38 = icmp ult i32 %37, 10

Added: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll?rev=356130&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll Thu Mar 14 03:57:40 2019
@@ -0,0 +1,150 @@
+; RUN: llc -mtriple=arm -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK:  ModulePass Manager
+; CHECK:    Pre-ISel Intrinsic Lowering
+; CHECK:    FunctionPass Manager
+; CHECK:      Expand Atomic instructions
+; CHECK:      Simplify the CFG
+; CHECK:      Dominator Tree Construction
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Module Verifier
+; CHECK:      Natural Loop Information
+; CHECK:      Canonicalize natural loops
+; CHECK:      Scalar Evolution Analysis
+; CHECK:      Loop Pass Manager
+; CHECK:        Induction Variable Users
+; CHECK:        Loop Strength Reduction
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Function Alias Analysis Results
+; CHECK:      Merge contiguous icmps into a memcmp
+; CHECK:      Expand memcmp() to load/stores
+; CHECK:      Lower Garbage Collection Instructions
+; CHECK:      Shadow Stack GC Lowering
+; CHECK:      Remove unreachable blocks from the CFG
+; CHECK:      Dominator Tree Construction
+; CHECK:      Natural Loop Information
+; CHECK:      Branch Probability Analysis
+; CHECK:      Block Frequency Analysis
+; CHECK:      Constant Hoisting
+; CHECK:      Partially inline calls to library functions
+; CHECK:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
+; CHECK:      Scalarize Masked Memory Intrinsics
+; CHECK:      Expand reduction intrinsics
+; CHECK:      Dominator Tree Construction
+; CHECK:      Early CSE
+; CHECK:      Natural Loop Information
+; CHECK:      Scalar Evolution Analysis
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Function Alias Analysis Results
+; CHECK:      Loop Pass Manager
+; CHECK:        Transform loops to use DSP intrinsics
+; CHECK:      Interleaved Access Pass
+; CHECK:      ARM IR optimizations
+; CHECK:      Dominator Tree Construction
+; CHECK:      Natural Loop Information
+; CHECK:      CodeGen Prepare
+; CHECK:    Rewrite Symbols
+; CHECK:    FunctionPass Manager
+; CHECK:      Dominator Tree Construction
+; CHECK:      Exception handling preparation
+; CHECK:      Merge internal globals
+; CHECK:      Safe Stack instrumentation pass
+; CHECK:      Insert stack protectors
+; CHECK:      Module Verifier
+; CHECK:      Dominator Tree Construction
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Function Alias Analysis Results
+; CHECK:      Natural Loop Information
+; CHECK:      Branch Probability Analysis
+; CHECK:      ARM Instruction Selection
+; CHECK:      Expand ISel Pseudo-instructions
+; CHECK:      Early Tail Duplication
+; CHECK:      Optimize machine instruction PHIs
+; CHECK:      Slot index numbering
+; CHECK:      Merge disjoint stack slots
+; CHECK:      Local Stack Slot Allocation
+; CHECK:      Remove dead machine instructions
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Early Machine Loop Invariant Code Motion
+; CHECK:      Machine Common Subexpression Elimination
+; CHECK:      MachinePostDominator Tree Construction
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      Machine code sinking
+; CHECK:      Peephole Optimizations
+; CHECK:      Remove dead machine instructions
+; CHECK:      ARM MLA / MLS expansion pass
+; CHECK:      ARM pre- register allocation load / store optimization pass
+; CHECK:      ARM A15 S->D optimizer
+; CHECK:      Detect Dead Lanes
+; CHECK:      Process Implicit Definitions
+; CHECK:      Remove unreachable machine basic blocks
+; CHECK:      Live Variable Analysis
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Eliminate PHI nodes for register allocation
+; CHECK:      Two-Address instruction pass
+; CHECK:      Slot index numbering
+; CHECK:      Live Interval Analysis
+; CHECK:      Simple Register Coalescing
+; CHECK:      Rename Disconnected Subregister Components
+; CHECK:      Machine Instruction Scheduler
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      Debug Variable Analysis
+; CHECK:      Live Stack Slot Analysis
+; CHECK:      Virtual Register Map
+; CHECK:      Live Register Matrix
+; CHECK:      Bundle Machine CFG Edges
+; CHECK:      Spill Code Placement Analysis
+; CHECK:      Lazy Machine Block Frequency Analysis
+; CHECK:      Machine Optimization Remark Emitter
+; CHECK:      Greedy Register Allocator
+; CHECK:      Virtual Register Rewriter
+; CHECK:      Stack Slot Coloring
+; CHECK:      Machine Copy Propagation Pass
+; CHECK:      Machine Loop Invariant Code Motion
+; CHECK:      PostRA Machine Sink
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      MachinePostDominator Tree Construction
+; CHECK:      Lazy Machine Block Frequency Analysis
+; CHECK:      Machine Optimization Remark Emitter
+; CHECK:      Shrink Wrapping analysis
+; CHECK:      Prologue/Epilogue Insertion & Frame Finalization
+; CHECK:      Control Flow Optimizer
+; CHECK:      Tail Duplication
+; CHECK:      Machine Copy Propagation Pass
+; CHECK:      Post-RA pseudo instruction expansion pass
+; CHECK:      ARM load / store optimization pass
+; CHECK:      ReachingDefAnalysis
+; CHECK:      ARM Execution Domain Fix
+; CHECK:      BreakFalseDeps
+; CHECK:      ARM pseudo instruction expansion pass
+; CHECK:      Thumb2 instruction size reduce pass
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      If Converter
+; CHECK:      Thumb IT blocks insertion pass
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Post RA top-down list latency scheduler
+; CHECK:      Analyze Machine Code For Garbage Collection
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      MachinePostDominator Tree Construction
+; CHECK:      Branch Probability Basic Block Placement
+; CHECK:      Thumb2 instruction size reduce pass
+; CHECK:      Unpack machine instruction bundles
+; CHECK:      optimise barriers pass
+; CHECK:      ARM constant island placement and branch shortening pass
+; CHECK:      Contiguously Lay Out Funclets
+; CHECK:      StackMap Liveness Analysis
+; CHECK:      Live DEBUG_VALUE analysis
+; CHECK:      Insert fentry calls
+; CHECK:      Insert XRay ops
+; CHECK:      Implement the 'patchable-function' attribute
+; CHECK:      Lazy Machine Block Frequency Analysis
+; CHECK:      Machine Optimization Remark Emitter
+; CHECK:      ARM Assembly Printer
+; CHECK:      Free MachineFunction

Modified: llvm/trunk/test/CodeGen/ARM/loop-indexing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/loop-indexing.ll?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/loop-indexing.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/loop-indexing.ll Thu Mar 14 03:57:40 2019
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
-; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
-; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
 
 ; Tests to check that post increment addressing modes are used instead of
 ; updating base pointers with add instructions.
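
[For illustration of what these prefixes distinguish (a sketch with invented
registers, not output from this test): a post-increment access folds the
base-pointer update into the memory operation itself, e.g.

  vld1.8  {d0}, [r0]!        @ load, then advance r0 by the access size
  vld1.8  {d0}, [r0], r1     @ load, then advance r0 by r1

while the DISABLED configurations are expected to fall back to a separate add
to update the base pointer.]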

Modified: llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll Thu Mar 14 03:57:40 2019
@@ -5,67 +5,133 @@ target datalayout = "e-p:32:32:32-i1:8:8
 ; This test used to test vector spilling using vstmia/vldmia instructions, but
 ; the changes for PR:18825 prevent that spilling.
 
+; VST1 and VLD1 are now used for spilling/restoring.
+;
+; TODO:
+; I think more vldms should be generated: the initial ones load some of the
+; elements, but then a sequence of vldrs is used instead:
+; vldr  d15, [r1, #104]
+; vldr  d13, [r2, #96]
+; vldr  d9, [r1, #120]
+; vldr  d11, [r2, #112]
+; vldr  d14, [r1, #96]
+; vldr  d12, [r2, #88]
+; vldr  d8, [r1, #112]
+; vldr  d10, [r2, #104]
+
+; This pattern also repeats several times, where it certainly seems that a
+; vld1.64 should be used to load the data:
+; vldr  d16, [r1, #16]
+; vldr  d17, [r1, #24]
+; vst1.64 {d16, d17}, [lr:128]    @ 16-byte Spill
+
 ; CHECK: test:
-; CHECK: vstmia
-; CHECK: vldmia
-define void @test(i64* %src) #0 {
+; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}}
+; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}}
+define <16 x i64> @test(i64* %src0, i64* %src1) #0 {
 entry:
-  %arrayidx39 = getelementptr inbounds i64, i64* %src, i32 13
-  %vecinit285 = shufflevector <16 x i64> undef, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit285, <16 x i64>* undef, align 128
-  %0 = load i64, i64* undef, align 8
-  %vecinit379 = insertelement <16 x i64> undef, i64 %0, i32 9
-  %1 = load i64, i64* undef, align 8
-  %vecinit419 = insertelement <16 x i64> undef, i64 %1, i32 15
-  store <16 x i64> %vecinit419, <16 x i64>* undef, align 128
-  %vecinit579 = insertelement <16 x i64> undef, i64 0, i32 4
-  %vecinit582 = shufflevector <16 x i64> %vecinit579, <16 x i64> <i64 6, i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit584 = insertelement <16 x i64> %vecinit582, i64 undef, i32 9
-  %vecinit586 = insertelement <16 x i64> %vecinit584, i64 0, i32 10
-  %vecinit589 = shufflevector <16 x i64> %vecinit586, <16 x i64> <i64 12, i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 18, i32 19, i32 undef>
-  %2 = load i64, i64* undef, align 8
-  %vecinit591 = insertelement <16 x i64> %vecinit589, i64 %2, i32 15
-  store <16 x i64> %vecinit591, <16 x i64>* undef, align 128
-  %vecinit694 = shufflevector <16 x i64> undef, <16 x i64> <i64 13, i64 14, i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-  store <16 x i64> %vecinit694, <16 x i64>* undef, align 128
-  %3 = load i64, i64* undef, align 8
-  %vecinit1331 = insertelement <16 x i64> undef, i64 %3, i32 14
-  %4 = load i64, i64* undef, align 8
-  %vecinit1468 = insertelement <16 x i64> undef, i64 %4, i32 11
-  %vecinit1471 = shufflevector <16 x i64> %vecinit1468, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
-  %vecinit1474 = shufflevector <16 x i64> %vecinit1471, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit1474, <16 x i64>* undef, align 128
-  %vecinit1552 = shufflevector <16 x i64> undef, <16 x i64> <i64 10, i64 11, i64 12, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1555 = shufflevector <16 x i64> %vecinit1552, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
-  %vecinit1558 = shufflevector <16 x i64> %vecinit1555, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit1558, <16 x i64>* undef, align 128
-  %vecinit1591 = shufflevector <16 x i64> undef, <16 x i64> <i64 3, i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1594 = shufflevector <16 x i64> %vecinit1591, <16 x i64> <i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1597 = shufflevector <16 x i64> %vecinit1594, <16 x i64> <i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1599 = insertelement <16 x i64> %vecinit1597, i64 undef, i32 8
-  %vecinit1601 = insertelement <16 x i64> %vecinit1599, i64 undef, i32 9
-  %vecinit1603 = insertelement <16 x i64> %vecinit1601, i64 undef, i32 10
-  %5 = load i64, i64* undef, align 8
-  %vecinit1605 = insertelement <16 x i64> %vecinit1603, i64 %5, i32 11
-  %vecinit1608 = shufflevector <16 x i64> %vecinit1605, <16 x i64> <i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 undef>
-  %6 = load i64, i64* undef, align 8
-  %vecinit1610 = insertelement <16 x i64> %vecinit1608, i64 %6, i32 15
-  store <16 x i64> %vecinit1610, <16 x i64>* undef, align 128
-  %vecinit2226 = shufflevector <16 x i64> undef, <16 x i64> <i64 6, i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %7 = load i64, i64* undef, align 8
-  %vecinit2228 = insertelement <16 x i64> %vecinit2226, i64 %7, i32 8
-  %vecinit2230 = insertelement <16 x i64> %vecinit2228, i64 undef, i32 9
-  %vecinit2233 = shufflevector <16 x i64> %vecinit2230, <16 x i64> <i64 11, i64 12, i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef>
-  %vecinit2236 = shufflevector <16 x i64> %vecinit2233, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit2236, <16 x i64>* undef, align 128
-  %vecinit2246 = shufflevector <16 x i64> undef, <16 x i64> <i64 4, i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit2249 = shufflevector <16 x i64> %vecinit2246, <16 x i64> <i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit2252 = shufflevector <16 x i64> %vecinit2249, <16 x i64> <i64 10, i64 11, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit2255 = shufflevector <16 x i64> %vecinit2252, <16 x i64> <i64 12, i64 13, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 undef, i32 undef, i32 undef>
-  %8 = load i64, i64* %arrayidx39, align 8
-  %vecinit2257 = insertelement <16 x i64> %vecinit2255, i64 %8, i32 13
-  %vecinit2260 = shufflevector <16 x i64> %vecinit2257, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit2260, <16 x i64>* null, align 128
-  ret void
+  %addr.0 = getelementptr inbounds i64, i64* %src0, i32 0
+  %el.0 = load i64, i64* %addr.0, align 8
+  %addr.1 = getelementptr inbounds i64, i64* %src0, i32 1
+  %el.1 = load i64, i64* %addr.1, align 8
+  %addr.2 = getelementptr inbounds i64, i64* %src0, i32 2
+  %el.2 = load i64, i64* %addr.2, align 8
+  %addr.3 = getelementptr inbounds i64, i64* %src0, i32 3
+  %el.3 = load i64, i64* %addr.3, align 8
+  %addr.4 = getelementptr inbounds i64, i64* %src0, i32 4
+  %el.4 = load i64, i64* %addr.4, align 8
+  %addr.5 = getelementptr inbounds i64, i64* %src0, i32 5
+  %el.5 = load i64, i64* %addr.5, align 8
+  %addr.6 = getelementptr inbounds i64, i64* %src0, i32 6
+  %el.6 = load i64, i64* %addr.6, align 8
+  %addr.7 = getelementptr inbounds i64, i64* %src0, i32 7
+  %el.7 = load i64, i64* %addr.7, align 8
+  %addr.8 = getelementptr inbounds i64, i64* %src0, i32 8
+  %el.8 = load i64, i64* %addr.8, align 8
+  %addr.9 = getelementptr inbounds i64, i64* %src0, i32 9
+  %el.9 = load i64, i64* %addr.9, align 8
+  %addr.10 = getelementptr inbounds i64, i64* %src0, i32 10
+  %el.10 = load i64, i64* %addr.10, align 8
+  %addr.11 = getelementptr inbounds i64, i64* %src0, i32 11
+  %el.11 = load i64, i64* %addr.11, align 8
+  %addr.12 = getelementptr inbounds i64, i64* %src0, i32 12
+  %el.12 = load i64, i64* %addr.12, align 8
+  %addr.13 = getelementptr inbounds i64, i64* %src0, i32 13
+  %el.13 = load i64, i64* %addr.13, align 8
+  %addr.14 = getelementptr inbounds i64, i64* %src0, i32 14
+  %el.14 = load i64, i64* %addr.14, align 8
+  %addr.15 = getelementptr inbounds i64, i64* %src0, i32 15
+  %el.15 = load i64, i64* %addr.15, align 8
+
+  %addr.0.1 = getelementptr inbounds i64, i64* %src1, i32 0
+  %el.0.1 = load i64, i64* %addr.0.1, align 8
+  %addr.1.1 = getelementptr inbounds i64, i64* %src1, i32 1
+  %el.1.1 = load i64, i64* %addr.1.1, align 8
+  %addr.2.1 = getelementptr inbounds i64, i64* %src1, i32 2
+  %el.2.1 = load i64, i64* %addr.2.1, align 8
+  %addr.3.1 = getelementptr inbounds i64, i64* %src1, i32 3
+  %el.3.1 = load i64, i64* %addr.3.1, align 8
+  %addr.4.1 = getelementptr inbounds i64, i64* %src1, i32 4
+  %el.4.1 = load i64, i64* %addr.4.1, align 8
+  %addr.5.1 = getelementptr inbounds i64, i64* %src1, i32 5
+  %el.5.1 = load i64, i64* %addr.5.1, align 8
+  %addr.6.1 = getelementptr inbounds i64, i64* %src1, i32 6
+  %el.6.1 = load i64, i64* %addr.6.1, align 8
+  %addr.7.1 = getelementptr inbounds i64, i64* %src1, i32 7
+  %el.7.1 = load i64, i64* %addr.7.1, align 8
+  %addr.8.1 = getelementptr inbounds i64, i64* %src1, i32 8
+  %el.8.1 = load i64, i64* %addr.8.1, align 8
+  %addr.9.1 = getelementptr inbounds i64, i64* %src1, i32 9
+  %el.9.1 = load i64, i64* %addr.9.1, align 8
+  %addr.10.1 = getelementptr inbounds i64, i64* %src1, i32 10
+  %el.10.1 = load i64, i64* %addr.10.1, align 8
+  %addr.11.1 = getelementptr inbounds i64, i64* %src1, i32 11
+  %el.11.1 = load i64, i64* %addr.11.1, align 8
+  %addr.12.1 = getelementptr inbounds i64, i64* %src1, i32 12
+  %el.12.1 = load i64, i64* %addr.12.1, align 8
+  %addr.13.1 = getelementptr inbounds i64, i64* %src1, i32 13
+  %el.13.1 = load i64, i64* %addr.13.1, align 8
+  %addr.14.1 = getelementptr inbounds i64, i64* %src1, i32 14
+  %el.14.1 = load i64, i64* %addr.14.1, align 8
+  %addr.15.1 = getelementptr inbounds i64, i64* %src1, i32 15
+  %el.15.1 = load i64, i64* %addr.15.1, align 8
+  %vec.0 = insertelement <16 x i64> undef, i64 %el.0, i32 0
+  %vec.1 = insertelement <16 x i64> %vec.0, i64 %el.1, i32 1
+  %vec.2 = insertelement <16 x i64> %vec.1, i64 %el.2, i32 2
+  %vec.3 = insertelement <16 x i64> %vec.2, i64 %el.3, i32 3
+  %vec.4 = insertelement <16 x i64> %vec.3, i64 %el.4, i32 4
+  %vec.5 = insertelement <16 x i64> %vec.4, i64 %el.5, i32 5
+  %vec.6 = insertelement <16 x i64> %vec.5, i64 %el.6, i32 6
+  %vec.7 = insertelement <16 x i64> %vec.6, i64 %el.7, i32 7
+  %vec.8 = insertelement <16 x i64> %vec.7, i64 %el.8, i32 8
+  %vec.9 = insertelement <16 x i64> %vec.8, i64 %el.9, i32 9
+  %vec.10 = insertelement <16 x i64> %vec.9, i64 %el.10, i32 10
+  %vec.11 = insertelement <16 x i64> %vec.10, i64 %el.11, i32 11
+  %vec.12 = insertelement <16 x i64> %vec.11, i64 %el.12, i32 12
+  %vec.13 = insertelement <16 x i64> %vec.12, i64 %el.13, i32 13
+  %vec.14 = insertelement <16 x i64> %vec.13, i64 %el.14, i32 14
+  %vec.15 = insertelement <16 x i64> %vec.14, i64 %el.15, i32 15
+  call void @capture(i64* %src0, i64* %src1)
+  %vec.0.1 = insertelement <16 x i64> undef, i64 %el.0.1, i32 0
+  %vec.1.1 = insertelement <16 x i64> %vec.0.1, i64 %el.1.1, i32 1
+  %vec.2.1 = insertelement <16 x i64> %vec.1.1, i64 %el.2.1, i32 2
+  %vec.3.1 = insertelement <16 x i64> %vec.2.1, i64 %el.3.1, i32 3
+  %vec.4.1 = insertelement <16 x i64> %vec.3.1, i64 %el.4.1, i32 4
+  %vec.5.1 = insertelement <16 x i64> %vec.4.1, i64 %el.5.1, i32 5
+  %vec.6.1 = insertelement <16 x i64> %vec.5.1, i64 %el.6.1, i32 6
+  %vec.7.1 = insertelement <16 x i64> %vec.6.1, i64 %el.7.1, i32 7
+  %vec.8.1 = insertelement <16 x i64> %vec.7.1, i64 %el.8.1, i32 8
+  %vec.9.1 = insertelement <16 x i64> %vec.8.1, i64 %el.9.1, i32 9
+  %vec.10.1 = insertelement <16 x i64> %vec.9.1, i64 %el.10.1, i32 10
+  %vec.11.1 = insertelement <16 x i64> %vec.10.1, i64 %el.11.1, i32 11
+  %vec.12.1 = insertelement <16 x i64> %vec.11.1, i64 %el.12.1, i32 12
+  %vec.13.1 = insertelement <16 x i64> %vec.12.1, i64 %el.13.1, i32 13
+  %vec.14.1 = insertelement <16 x i64> %vec.13.1, i64 %el.14.1, i32 14
+  %vec.15.1 = insertelement <16 x i64> %vec.14.1, i64 %el.15.1, i32 15
+  %res = add <16 x i64> %vec.15, %vec.15.1
+  ret <16 x i64> %res
 }
+
+declare void @capture(i64*, i64*)
+
 attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

Modified: llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll Thu Mar 14 03:57:40 2019
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O2 | FileCheck %s
 ; rdar://7493908
 
 ; Make sure the result of the first dynamic_alloc isn't copied back to sp more

Modified: llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll Thu Mar 14 03:57:40 2019
@@ -32,13 +32,12 @@
 
 define fastcc i32 @parse_percent_token() nounwind {
 entry:
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
; Do not convert into single stream code. BranchProbability Analysis assumes
; that branches that go to a "ret" instruction have lower probabilities.
   switch i32 undef, label %bb7 [

Modified: llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll?rev=356130&r1=356129&r2=356130&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll (original)
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll Thu Mar 14 03:57:40 2019
@@ -302,7 +302,6 @@ declare <1 x i64> @llvm.arm.neon.vld1.v1
 ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
-; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], {{r[0-9]}}
 ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
