[llvm] dc8a41d - [ARM] Simplify address calculation for NEON load/store
Andrew Savonichev via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 14 05:26:18 PDT 2021
Author: Andrew Savonichev
Date: 2021-10-14T15:23:10+03:00
New Revision: dc8a41de34933bc10c4d5d89c539dd0dc80d59cc
URL: https://github.com/llvm/llvm-project/commit/dc8a41de34933bc10c4d5d89c539dd0dc80d59cc
DIFF: https://github.com/llvm/llvm-project/commit/dc8a41de34933bc10c4d5d89c539dd0dc80d59cc.diff
LOG: [ARM] Simplify address calculation for NEON load/store
The patch attempts to optimize a sequence of SIMD loads from the same
base pointer:
%0 = gep float, float* base, i32 4
%1 = bitcast float* %0 to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1
...
%n1 = gep float, float* base, i32 N
%n2 = bitcast float* %n1 to <4 x float>*
%n3 = load <4 x float>, <4 x float>* %n2
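As a concrete illustration (not taken from the patch), NEON-intrinsics
source along these lines typically lowers to the IR pattern above; the
function name and layout are hypothetical:

#include <arm_neon.h>

// Three consecutive 16-byte loads from the same base pointer.
float32x4_t sum3(const float *base) {
  float32x4_t a = vld1q_f32(base);      // base[0..3]
  float32x4_t b = vld1q_f32(base + 4);  // base[4..7]
  float32x4_t c = vld1q_f32(base + 8);  // base[8..11]
  return vaddq_f32(vaddq_f32(a, b), c);
}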
For AArch64 the compiler generates a sequence of LDR Qt, [Xn, #16]
instructions. However, 32-bit NEON VLD1/VST1 lack the [Rn, #imm]
addressing mode, so the address has to be computed before every
load/store instruction:
add r2, r0, #32
add r0, r0, #16
vld1.32 {d18, d19}, [r2]
vld1.32 {d22, d23}, [r0]
This can be improved by computing the address for the first load, and
then using the post-indexed form of VLD1/VST1 to load the rest:
add r0, r0, #16
vld1.32 {d18, d19}, [r0]!
vld1.32 {d22, d23}, [r0]
In order to do that, the patch adds more patterns to DAGCombine:
- (load (add ptr inc1)) and (add ptr inc2) are now folded if inc1
and inc2 are constants.
- (or ptr inc) is now recognized as a pointer increment if ptr is
sufficiently aligned (see the sketch below).
In addition to that, we now search for all possible base updates and
then pick the best one.
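The (or ptr inc) case relies on a simple bit-level fact: when the base
pointer is aligned well beyond the increment, the OR sets no bit that
is already set, so it behaves exactly like an ADD; the combine uses
DAG.haveNoCommonBitsSet to verify this. A minimal standalone C++ sketch
of the equivalence (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

// OR acts as ADD whenever the two operands share no set bits.
static bool orActsAsAdd(uintptr_t Ptr, uintptr_t Inc) {
  return (Ptr & Inc) == 0;
}

int main() {
  alignas(64) float Buf[32] = {};
  uintptr_t P = reinterpret_cast<uintptr_t>(Buf);
  // A 64-byte-aligned pointer has its low six bits clear, so OR-ing in
  // a 16-byte increment is the same as adding it.
  assert(orActsAsAdd(P, 16) && (P | 16) == (P + 16));
  return 0;
}

This mirrors how the stack-relative accesses in the updated tests form
their addresses with ORR instead of ADD.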
Differential Revision: https://reviews.llvm.org/D108988
Added:
llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
llvm/test/CodeGen/ARM/fp16-vector-argument.ll
llvm/test/CodeGen/ARM/large-vector.ll
llvm/test/CodeGen/ARM/memcpy-inline.ll
llvm/test/CodeGen/ARM/memset-align.ll
llvm/test/CodeGen/ARM/misched-fusion-aes.ll
llvm/test/CodeGen/ARM/vector-load.ll
llvm/test/CodeGen/ARM/vext.ll
llvm/test/CodeGen/ARM/vselect_imax.ll
llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 94605f88c3183..079c9989f9a85 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15244,6 +15244,390 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
DAG.getUNDEF(VT), NewMask);
}
+/// Load/store instruction that can be merged with a base address
+/// update
+struct BaseUpdateTarget {
+ SDNode *N;
+ bool isIntrinsic;
+ bool isStore;
+ unsigned AddrOpIdx;
+};
+
+struct BaseUpdateUser {
+ /// Instruction that updates a pointer
+ SDNode *N;
+ /// Pointer increment operand
+ SDValue Inc;
+ /// Pointer increment value if it is a constant, or 0 otherwise
+ unsigned ConstInc;
+};
+
+static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
+ struct BaseUpdateUser &User,
+ bool SimpleConstIncOnly,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDNode *N = Target.N;
+ MemSDNode *MemN = cast<MemSDNode>(N);
+ SDLoc dl(N);
+
+ // Find the new opcode for the updating load/store.
+ bool isLoadOp = true;
+ bool isLaneOp = false;
+ // Workaround for vst1x and vld1x intrinsics which do not have alignment
+ // as an operand.
+ bool hasAlignment = true;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ if (Target.isIntrinsic) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default:
+ llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::arm_neon_vld1:
+ NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1;
+ break;
+ case Intrinsic::arm_neon_vld2:
+ NewOpc = ARMISD::VLD2_UPD;
+ NumVecs = 2;
+ break;
+ case Intrinsic::arm_neon_vld3:
+ NewOpc = ARMISD::VLD3_UPD;
+ NumVecs = 3;
+ break;
+ case Intrinsic::arm_neon_vld4:
+ NewOpc = ARMISD::VLD4_UPD;
+ NumVecs = 4;
+ break;
+ case Intrinsic::arm_neon_vld1x2:
+ NewOpc = ARMISD::VLD1x2_UPD;
+ NumVecs = 2;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vld1x3:
+ NewOpc = ARMISD::VLD1x3_UPD;
+ NumVecs = 3;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vld1x4:
+ NewOpc = ARMISD::VLD1x4_UPD;
+ NumVecs = 4;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vld2dup:
+ NewOpc = ARMISD::VLD2DUP_UPD;
+ NumVecs = 2;
+ break;
+ case Intrinsic::arm_neon_vld3dup:
+ NewOpc = ARMISD::VLD3DUP_UPD;
+ NumVecs = 3;
+ break;
+ case Intrinsic::arm_neon_vld4dup:
+ NewOpc = ARMISD::VLD4DUP_UPD;
+ NumVecs = 4;
+ break;
+ case Intrinsic::arm_neon_vld2lane:
+ NewOpc = ARMISD::VLD2LN_UPD;
+ NumVecs = 2;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vld3lane:
+ NewOpc = ARMISD::VLD3LN_UPD;
+ NumVecs = 3;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vld4lane:
+ NewOpc = ARMISD::VLD4LN_UPD;
+ NumVecs = 4;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst1:
+ NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst2:
+ NewOpc = ARMISD::VST2_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst3:
+ NewOpc = ARMISD::VST3_UPD;
+ NumVecs = 3;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst4:
+ NewOpc = ARMISD::VST4_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_neon_vst2lane:
+ NewOpc = ARMISD::VST2LN_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst3lane:
+ NewOpc = ARMISD::VST3LN_UPD;
+ NumVecs = 3;
+ isLoadOp = false;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst4lane:
+ NewOpc = ARMISD::VST4LN_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ isLaneOp = true;
+ break;
+ case Intrinsic::arm_neon_vst1x2:
+ NewOpc = ARMISD::VST1x2_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vst1x3:
+ NewOpc = ARMISD::VST1x3_UPD;
+ NumVecs = 3;
+ isLoadOp = false;
+ hasAlignment = false;
+ break;
+ case Intrinsic::arm_neon_vst1x4:
+ NewOpc = ARMISD::VST1x4_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ hasAlignment = false;
+ break;
+ }
+ } else {
+ isLaneOp = true;
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("unexpected opcode for Neon base update");
+ case ARMISD::VLD1DUP:
+ NewOpc = ARMISD::VLD1DUP_UPD;
+ NumVecs = 1;
+ break;
+ case ARMISD::VLD2DUP:
+ NewOpc = ARMISD::VLD2DUP_UPD;
+ NumVecs = 2;
+ break;
+ case ARMISD::VLD3DUP:
+ NewOpc = ARMISD::VLD3DUP_UPD;
+ NumVecs = 3;
+ break;
+ case ARMISD::VLD4DUP:
+ NewOpc = ARMISD::VLD4DUP_UPD;
+ NumVecs = 4;
+ break;
+ case ISD::LOAD:
+ NewOpc = ARMISD::VLD1_UPD;
+ NumVecs = 1;
+ isLaneOp = false;
+ break;
+ case ISD::STORE:
+ NewOpc = ARMISD::VST1_UPD;
+ NumVecs = 1;
+ isLaneOp = false;
+ isLoadOp = false;
+ break;
+ }
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoadOp) {
+ VecTy = N->getValueType(0);
+ } else if (Target.isIntrinsic) {
+ VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
+ } else {
+ assert(Target.isStore &&
+ "Node has to be a load, a store, or an intrinsic!");
+ VecTy = N->getOperand(1).getValueType();
+ }
+
+ bool isVLDDUPOp =
+ NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
+ NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
+
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (isLaneOp || isVLDDUPOp)
+ NumBytes /= VecTy.getVectorNumElements();
+
+ if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
+ // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+ // separate instructions that make it harder to use a non-constant update.
+ return false;
+ }
+
+ if (SimpleConstIncOnly && User.ConstInc != NumBytes)
+ return false;
+
+ // OK, we found an ADD we can fold into the base update.
+ // Now, create a _UPD node, taking care of not breaking alignment.
+
+ EVT AlignedVecTy = VecTy;
+ unsigned Alignment = MemN->getAlignment();
+
+ // If this is a less-than-standard-aligned load/store, change the type to
+ // match the standard alignment.
+ // The alignment is overlooked when selecting _UPD variants; and it's
+ // easier to introduce bitcasts here than fix that.
+ // There are 3 ways to get to this base-update combine:
+ // - intrinsics: they are assumed to be properly aligned (to the standard
+ // alignment of the memory type), so we don't need to do anything.
+ // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+ // intrinsics, so, likewise, there's nothing to do.
+ // - generic load/store instructions: the alignment is specified as an
+ // explicit operand, rather than implicitly as the standard alignment
+ // of the memory type (like the intrisics). We need to change the
+ // memory type to match the explicit alignment. That way, we don't
+ // generate non-standard-aligned ARMISD::VLDx nodes.
+ if (isa<LSBaseSDNode>(N)) {
+ if (Alignment == 0)
+ Alignment = 1;
+ if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+ MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+ assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+ assert(!isLaneOp && "Unexpected generic load/store lane.");
+ unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+ AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+ }
+ // Don't set an explicit alignment on regular load/stores that we want
+ // to transform to VLD/VST 1_UPD nodes.
+ // This matches the behavior of regular load/stores, which only get an
+ // explicit alignment if the MMO alignment is larger than the standard
+ // alignment of the memory type.
+ // Intrinsics, however, always get an explicit alignment, set to the
+ // alignment of the MMO.
+ Alignment = 1;
+ }
+
+ // Create the new updating load/store node.
+ // First, create an SDVTList for the new updating node's results.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = AlignedVecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+ // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(Target.AddrOpIdx));
+ Ops.push_back(User.Inc);
+
+ if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+ // Try to match the intrinsic's signature
+ Ops.push_back(StN->getValue());
+ } else {
+ // Loads (and of course intrinsics) match the intrinsics' signature,
+ // so just add all but the alignment operand.
+ unsigned LastOperand =
+ hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
+ for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
+ Ops.push_back(N->getOperand(i));
+ }
+
+ // For all node types, the alignment operand is always the last one.
+ Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+ // If this is a non-standard-aligned STORE, the penultimate operand is the
+ // stored value. Bitcast it to the aligned type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+ SDValue &StVal = Ops[Ops.size() - 2];
+ StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+ }
+
+ EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+ MemN->getMemOperand());
+
+ // Update the uses.
+ SmallVector<SDValue, 5> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i)
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+ // If this is an non-standard-aligned LOAD, the first result is the loaded
+ // value. Bitcast it to the expected result type.
+ if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+ SDValue &LdVal = NewResults[0];
+ LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+ }
+
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
+
+ return true;
+}
+
+// If (opcode ptr inc) is an ADD-like instruction, return the
+// increment value. Otherwise return 0.
+static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
+ SDValue Inc, const SelectionDAG &DAG) {
+ ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+ if (!CInc)
+ return 0;
+
+ switch (Opcode) {
+ case ARMISD::VLD1_UPD:
+ case ISD::ADD:
+ return CInc->getZExtValue();
+ case ISD::OR: {
+ if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
+ // (OR ptr inc) is the same as (ADD ptr inc)
+ return CInc->getZExtValue();
+ }
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
+static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
+ switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::OR: {
+ if (isa<ConstantSDNode>(N->getOperand(1))) {
+ *Ptr = N->getOperand(0);
+ *CInc = N->getOperand(1);
+ return true;
+ }
+ return false;
+ }
+ case ARMISD::VLD1_UPD: {
+ if (isa<ConstantSDNode>(N->getOperand(2))) {
+ *Ptr = N->getOperand(1);
+ *CInc = N->getOperand(2);
+ return true;
+ }
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
+ // Check that the add is independent of the load/store.
+ // Otherwise, folding it would create a cycle. Search through Addr
+ // as well, since the User may not be a direct user of Addr and
+ // only share a base pointer.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(N);
+ Worklist.push_back(User);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist))
+ return false;
+ return true;
+}
+
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
@@ -15251,237 +15635,89 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
- SelectionDAG &DAG = DCI.DAG;
const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
const bool isStore = N->getOpcode() == ISD::STORE;
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
+ BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
+
SDValue Addr = N->getOperand(AddrOpIdx);
- MemSDNode *MemN = cast<MemSDNode>(N);
- SDLoc dl(N);
+
+ SmallVector<BaseUpdateUser, 8> BaseUpdates;
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
- if (User->getOpcode() != ISD::ADD ||
- UI.getUse().getResNo() != Addr.getResNo())
+ if (UI.getUse().getResNo() != Addr.getResNo() ||
+ User->getNumOperands() != 2)
continue;
- // Check that the add is independent of the load/store. Otherwise, folding
- // it would create a cycle. We can avoid searching through Addr as it's a
- // predecessor to both.
- SmallPtrSet<const SDNode *, 32> Visited;
- SmallVector<const SDNode *, 16> Worklist;
- Visited.insert(Addr.getNode());
- Worklist.push_back(N);
- Worklist.push_back(User);
- if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
- SDNode::hasPredecessorHelper(User, Visited, Worklist))
- continue;
-
- // Find the new opcode for the updating load/store.
- bool isLoadOp = true;
- bool isLaneOp = false;
- // Workaround for vst1x and vld1x intrinsics which do not have alignment
- // as an operand.
- bool hasAlignment = true;
- unsigned NewOpc = 0;
- unsigned NumVecs = 0;
- if (isIntrinsic) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: llvm_unreachable("unexpected intrinsic for Neon base update");
- case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
- NumVecs = 1; break;
- case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
- NumVecs = 2; break;
- case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
- NumVecs = 3; break;
- case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
- NumVecs = 4; break;
- case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
- NumVecs = 2; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
- NumVecs = 3; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
- NumVecs = 4; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
- NumVecs = 2; break;
- case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
- NumVecs = 3; break;
- case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
- NumVecs = 4; break;
- case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
- NumVecs = 2; isLaneOp = true; break;
- case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
- NumVecs = 3; isLaneOp = true; break;
- case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
- NumVecs = 4; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
- NumVecs = 1; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
- NumVecs = 2; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
- NumVecs = 3; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
- NumVecs = 4; isLoadOp = false; break;
- case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
- NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
- NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
- NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
- case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
- NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
- case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
- NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
- case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
- NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
- }
- } else {
- isLaneOp = true;
- switch (N->getOpcode()) {
- default: llvm_unreachable("unexpected opcode for Neon base update");
- case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
- case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
- case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
- case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
- case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
- NumVecs = 1; isLaneOp = false; break;
- case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
- NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
- }
- }
-
- // Find the size of memory referenced by the load/store.
- EVT VecTy;
- if (isLoadOp) {
- VecTy = N->getValueType(0);
- } else if (isIntrinsic) {
- VecTy = N->getOperand(AddrOpIdx+1).getValueType();
- } else {
- assert(isStore && "Node has to be a load, a store, or an intrinsic!");
- VecTy = N->getOperand(1).getValueType();
- }
-
- bool isVLDDUPOp =
- NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
- NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
-
- unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
- if (isLaneOp || isVLDDUPOp)
- NumBytes /= VecTy.getVectorNumElements();
-
- // If the increment is a constant, it must match the memory ref size.
- SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
- ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
- if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
- // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
- // separate instructions that make it harder to use a non-constant update.
- continue;
- }
+ SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
+ unsigned ConstInc =
+ getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
- // OK, we found an ADD we can fold into the base update.
- // Now, create a _UPD node, taking care of not breaking alignment.
-
- EVT AlignedVecTy = VecTy;
- unsigned Alignment = MemN->getAlignment();
-
- // If this is a less-than-standard-aligned load/store, change the type to
- // match the standard alignment.
- // The alignment is overlooked when selecting _UPD variants; and it's
- // easier to introduce bitcasts here than fix that.
- // There are 3 ways to get to this base-update combine:
- // - intrinsics: they are assumed to be properly aligned (to the standard
- // alignment of the memory type), so we don't need to do anything.
- // - ARMISD::VLDx nodes: they are only generated from the aforementioned
- // intrinsics, so, likewise, there's nothing to do.
- // - generic load/store instructions: the alignment is specified as an
- // explicit operand, rather than implicitly as the standard alignment
- // of the memory type (like the intrisics). We need to change the
- // memory type to match the explicit alignment. That way, we don't
- // generate non-standard-aligned ARMISD::VLDx nodes.
- if (isa<LSBaseSDNode>(N)) {
- if (Alignment == 0)
- Alignment = 1;
- if (Alignment < VecTy.getScalarSizeInBits() / 8) {
- MVT EltTy = MVT::getIntegerVT(Alignment * 8);
- assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
- assert(!isLaneOp && "Unexpected generic load/store lane.");
- unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
- AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
- }
- // Don't set an explicit alignment on regular load/stores that we want
- // to transform to VLD/VST 1_UPD nodes.
- // This matches the behavior of regular load/stores, which only get an
- // explicit alignment if the MMO alignment is larger than the standard
- // alignment of the memory type.
- // Intrinsics, however, always get an explicit alignment, set to the
- // alignment of the MMO.
- Alignment = 1;
- }
+ if (ConstInc || User->getOpcode() == ISD::ADD)
+ BaseUpdates.push_back({User, Inc, ConstInc});
+ }
- // Create the new updating load/store node.
- // First, create an SDVTList for the new updating node's results.
- EVT Tys[6];
- unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
- unsigned n;
- for (n = 0; n < NumResultVecs; ++n)
- Tys[n] = AlignedVecTy;
- Tys[n++] = MVT::i32;
- Tys[n] = MVT::Other;
- SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+ // If the address is a constant pointer increment itself, find
+ // another constant increment that has the same base operand
+ SDValue Base;
+ SDValue CInc;
+ if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
+ unsigned Offset =
+ getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
+ for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
+ UI != UE; ++UI) {
- // Then, gather the new node's operands.
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(N->getOperand(0)); // incoming chain
- Ops.push_back(N->getOperand(AddrOpIdx));
- Ops.push_back(Inc);
+ SDNode *User = *UI;
+ if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
+ User->getNumOperands() != 2)
+ continue;
- if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
- // Try to match the intrinsic's signature
- Ops.push_back(StN->getValue());
- } else {
- // Loads (and of course intrinsics) match the intrinsics' signature,
- // so just add all but the alignment operand.
- unsigned LastOperand =
- hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
- for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
- Ops.push_back(N->getOperand(i));
- }
+ SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
+ unsigned UserOffset =
+ getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
- // For all node types, the alignment operand is always the last one.
- Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+ if (!UserOffset || UserOffset <= Offset)
+ continue;
- // If this is a non-standard-aligned STORE, the penultimate operand is the
- // stored value. Bitcast it to the aligned type.
- if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
- SDValue &StVal = Ops[Ops.size()-2];
- StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+ unsigned NewConstInc = UserOffset - Offset;
+ SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
+ BaseUpdates.push_back({User, NewInc, NewConstInc});
}
+ }
- EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
- MemN->getMemOperand());
-
- // Update the uses.
- SmallVector<SDValue, 5> NewResults;
- for (unsigned i = 0; i < NumResultVecs; ++i)
- NewResults.push_back(SDValue(UpdN.getNode(), i));
-
- // If this is an non-standard-aligned LOAD, the first result is the loaded
- // value. Bitcast it to the expected result type.
- if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
- SDValue &LdVal = NewResults[0];
- LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+ // Try to fold the load/store with an update that matches memory
+ // access size. This should work well for sequential loads.
+ //
+ // Filter out invalid updates as well.
+ unsigned NumValidUpd = BaseUpdates.size();
+ for (unsigned I = 0; I < NumValidUpd;) {
+ BaseUpdateUser &User = BaseUpdates[I];
+ if (!isValidBaseUpdate(N, User.N)) {
+ --NumValidUpd;
+ std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
+ continue;
}
- NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
- DCI.CombineTo(N, NewResults);
- DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
-
- break;
+ if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
+ return SDValue();
+ ++I;
+ }
+ BaseUpdates.resize(NumValidUpd);
+
+ // Try to fold with other users. Non-constant updates are considered
+ // first, and constant updates are sorted to not break a sequence of
+ // strided accesses (if there is any).
+ std::sort(BaseUpdates.begin(), BaseUpdates.end(),
+ [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) {
+ return LHS.ConstInc < RHS.ConstInc;
+ });
+ for (BaseUpdateUser &User : BaseUpdates) {
+ if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
+ return SDValue();
}
return SDValue();
}
diff --git a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
index ad19d8d1ce8f7..f0c066911f12d 100644
--- a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -3,80 +3,57 @@
; rdar://12713765
; When realign-stack is set to false, make sure we are not creating stack
; objects that are assumed to be 64-byte aligned.
-@T3_retval = common global <16 x float> zeroinitializer, align 16
define void @test1(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp "no-realign-stack" {
-entry:
; CHECK-LABEL: test1:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R3:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R3:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: mov r[[R2:[0-9]+]], sp
-; CHECK: add r[[R3:[0-9]+]], r[[R2]], #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R4:[0-9]+]], r[[R2]], #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
-; CHECK: mov r[[R5:[0-9]+]], r[[R2]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R5]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R4]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
+; CHECK: mov r[[NOTALIGNED:[0-9]+]], sp
+; CHECK: add r[[NOTALIGNED]], r[[NOTALIGNED]], #32
+; CHECK: add r[[PTR]], r[[PTR]], #32
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[NOTALIGNED]]:128]
+entry:
%retval = alloca <16 x float>, align 64
- %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
- store <16 x float> %0, <16 x float>* %retval
- %1 = load <16 x float>, <16 x float>* %retval
- store <16 x float> %1, <16 x float>* %agg.result, align 16
+ %a1 = bitcast <16 x float>* %retval to float*
+ %a2 = getelementptr inbounds float, float* %a1, i64 8
+ %a3 = bitcast float* %a2 to <4 x float>*
+
+ %b1 = bitcast <16 x float>* %agg.result to float*
+ %b2 = getelementptr inbounds float, float* %b1, i64 8
+ %b3 = bitcast float* %b2 to <4 x float>*
+
+ %0 = load <4 x float>, <4 x float>* %a3, align 16
+ %1 = load <4 x float>, <4 x float>* %b3, align 16
+ store <4 x float> %0, <4 x float>* %b3, align 16
+ store <4 x float> %1, <4 x float>* %a3, align 16
ret void
}
define void @test2(<16 x float>* noalias sret(<16 x float>) %agg.result) nounwind ssp {
-entry:
; CHECK-LABEL: test2:
-; CHECK: ldr r[[R1:[0-9]+]], [pc, r[[R1]]]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]!
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: mov r[[R1:[0-9]+]], sp
-; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #16
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: mov r[[R3:[0-9]+]], #32
-; CHECK: mov r[[R9:[0-9]+]], r[[R1]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128], r[[R3]]
-; CHECK: mov r[[R3:[0-9]+]], r[[R9]]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R9]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R3]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R2]]:128]
-; CHECK: vld1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #48
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: add r[[R1:[0-9]+]], r0, #32
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
-; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
-; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
+; CHECK: mov r[[PTR:[0-9]+]], r{{[0-9]+}}
+; CHECK: mov r[[ALIGNED:[0-9]+]], sp
+; CHECK: orr r[[ALIGNED]], r[[ALIGNED]], #32
+; CHECK: add r[[PTR]], r[[PTR]], #32
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[PTR]]:128]
+; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[ALIGNED]]:128]
+entry:
+ %retval = alloca <16 x float>, align 64
+ %a1 = bitcast <16 x float>* %retval to float*
+ %a2 = getelementptr inbounds float, float* %a1, i64 8
+ %a3 = bitcast float* %a2 to <4 x float>*
+ %b1 = bitcast <16 x float>* %agg.result to float*
+ %b2 = getelementptr inbounds float, float* %b1, i64 8
+ %b3 = bitcast float* %b2 to <4 x float>*
-%retval = alloca <16 x float>, align 64
- %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
- store <16 x float> %0, <16 x float>* %retval
- %1 = load <16 x float>, <16 x float>* %retval
- store <16 x float> %1, <16 x float>* %agg.result, align 16
+ %0 = load <4 x float>, <4 x float>* %a3, align 16
+ %1 = load <4 x float>, <4 x float>* %b3, align 16
+ store <4 x float> %0, <4 x float>* %b3, align 16
+ store <4 x float> %1, <4 x float>* %a3, align 16
ret void
}
diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
new file mode 100644
index 0000000000000..322c55a14909a
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-unknown-linux-gnueabihf"
+
+define <4 x float> @test(float* %A) {
+; CHECK-LABEL: test:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X.ptr = bitcast float* %A to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+define <4 x float> @test_stride(float* %A) {
+; CHECK-LABEL: test_stride:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #24
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X.ptr = bitcast float* %A to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+define <4 x float> @test_stride_mixed(float* %A) {
+; CHECK-LABEL: test_stride_mixed:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #24
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X.ptr = bitcast float* %A to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+; Refrain from using multiple stride registers
+define <4 x float> @test_stride_noop(float* %A) {
+; CHECK-LABEL: test_stride_noop:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r1, #24
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
+; CHECK-NEXT: mov r1, #32
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X.ptr = bitcast float* %A to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+define <4 x float> @test_positive_initial_offset(float* %A) {
+; CHECK-LABEL: test_positive_initial_offset:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+ %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+define <4 x float> @test_negative_initial_offset(float* %A) {
+; CHECK-LABEL: test_negative_initial_offset:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sub r0, r0, #64
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
+ %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+@global_float_array = external global [128 x float], align 4
+define <4 x float> @test_global() {
+; CHECK-LABEL: test_global:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r0, :lower16:global_float_array
+; CHECK-NEXT: movt r0, :upper16:global_float_array
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: bx lr
+ %X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4
+ %Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4
+ %Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+define <4 x float> @test_stack() {
+; Use huge alignment to test that ADD would not be converted to OR
+; CHECK-LABEL: test_stack:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r10, r11, lr}
+; CHECK-NEXT: push {r4, r10, r11, lr}
+; CHECK-NEXT: .setfp r11, sp, #8
+; CHECK-NEXT: add r11, sp, #8
+; CHECK-NEXT: .pad #240
+; CHECK-NEXT: sub sp, sp, #240
+; CHECK-NEXT: bfc sp, #0, #7
+; CHECK-NEXT: mov r4, sp
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: bl external_function
+; CHECK-NEXT: vld1.32 {d16, d17}, [r4:128]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r4:128]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128]
+; CHECK-NEXT: vadd.f32 q0, q8, q9
+; CHECK-NEXT: sub sp, r11, #8
+; CHECK-NEXT: pop {r4, r10, r11, pc}
+ %array = alloca [32 x float], align 128
+ %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
+ call void @external_function(float* %arraydecay)
+ %X.ptr = bitcast [32 x float]* %array to <4 x float>*
+ %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+ %Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8
+ %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+ %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+ %tmp.sum = fadd <4 x float> %X, %Y
+ %sum = fadd <4 x float> %tmp.sum, %Z
+ ret <4 x float> %sum
+}
+
+define <2 x double> @test_double(double* %A) {
+; CHECK-LABEL: test_double:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: add r0, r0, #64
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f64 d20, d17, d19
+; CHECK-NEXT: vadd.f64 d16, d16, d18
+; CHECK-NEXT: vld1.64 {d22, d23}, [r0]
+; CHECK-NEXT: vadd.f64 d1, d20, d23
+; CHECK-NEXT: vadd.f64 d0, d16, d22
+; CHECK-NEXT: bx lr
+ %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
+ %X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
+ %X = load <2 x double>, <2 x double>* %X.ptr, align 8
+ %Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10
+ %Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>*
+ %Y = load <2 x double>, <2 x double>* %Y.ptr, align 8
+ %Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12
+ %Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>*
+ %Z = load <2 x double>, <2 x double>* %Z.ptr, align 8
+ %tmp.sum = fadd <2 x double> %X, %Y
+ %sum = fadd <2 x double> %tmp.sum, %Z
+ ret <2 x double> %sum
+}
+
+define void @test_various_instructions(float* %A) {
+; CHECK-LABEL: test_various_instructions:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]
+; CHECK-NEXT: bx lr
+ %X.ptr = bitcast float* %A to i8*
+ %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
+ %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
+ %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+ %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+ %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+ %Z.ptr = bitcast float* %Z.ptr.elt to i8*
+ %Z = fadd <4 x float> %X, %Y
+ tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4)
+ ret void
+}
+
+define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
+; CHECK-LABEL: test_lsr_geps:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB10_1: @ %for.body.preheader
+; CHECK-NEXT: mov r12, #0
+; CHECK-NEXT: .LBB10_2: @ %for.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add r3, r0, r12
+; CHECK-NEXT: subs r2, r2, #1
+; CHECK-NEXT: vld1.32 {d16, d17}, [r3]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r3]!
+; CHECK-NEXT: vld1.32 {d20, d21}, [r3]!
+; CHECK-NEXT: vld1.32 {d22, d23}, [r3]
+; CHECK-NEXT: add r3, r1, r12
+; CHECK-NEXT: add r12, r12, #64
+; CHECK-NEXT: vst1.32 {d16, d17}, [r3]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r3]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r3]!
+; CHECK-NEXT: vst1.32 {d22, d23}, [r3]
+; CHECK-NEXT: bne .LBB10_2
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: bx lr
+entry:
+ %cmp61 = icmp sgt i32 %n, 0
+ br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
+ %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %0 = bitcast float* %a to i8*
+ %1 = bitcast float* %b to i8*
+ %uglygep19 = getelementptr i8, i8* %0, i32 %lsr.iv1
+ %uglygep1920 = bitcast i8* %uglygep19 to <4 x float>*
+ %2 = load <4 x float>, <4 x float>* %uglygep1920, align 4
+ %uglygep16 = getelementptr i8, i8* %0, i32 %lsr.iv1
+ %uglygep1617 = bitcast i8* %uglygep16 to <4 x float>*
+ %scevgep18 = getelementptr <4 x float>, <4 x float>* %uglygep1617, i32 1
+ %3 = load <4 x float>, <4 x float>* %scevgep18, align 4
+ %uglygep13 = getelementptr i8, i8* %0, i32 %lsr.iv1
+ %uglygep1314 = bitcast i8* %uglygep13 to <4 x float>*
+ %scevgep15 = getelementptr <4 x float>, <4 x float>* %uglygep1314, i32 2
+ %4 = load <4 x float>, <4 x float>* %scevgep15, align 4
+ %uglygep10 = getelementptr i8, i8* %0, i32 %lsr.iv1
+ %uglygep1011 = bitcast i8* %uglygep10 to <4 x float>*
+ %scevgep12 = getelementptr <4 x float>, <4 x float>* %uglygep1011, i32 3
+ %5 = load <4 x float>, <4 x float>* %scevgep12, align 4
+ %uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv1
+ tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %uglygep8, <4 x float> %2, i32 4)
+ %uglygep6 = getelementptr i8, i8* %1, i32 %lsr.iv1
+ %scevgep7 = getelementptr i8, i8* %uglygep6, i32 16
+ tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep7, <4 x float> %3, i32 4)
+ %uglygep4 = getelementptr i8, i8* %1, i32 %lsr.iv1
+ %scevgep5 = getelementptr i8, i8* %uglygep4, i32 32
+ tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep5, <4 x float> %4, i32 4)
+ %uglygep = getelementptr i8, i8* %1, i32 %lsr.iv1
+ %scevgep = getelementptr i8, i8* %uglygep, i32 48
+ tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep, <4 x float> %5, i32 4)
+ %lsr.iv.next = add i32 %lsr.iv, -1
+ %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
+ %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare void @external_function(float*)
+declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly
diff --git a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
index 9ea8d44d8cc2d..6fc56967bc7aa 100644
--- a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
@@ -83,16 +83,16 @@ define void @test(double, float, i16, <4 x half>, <8 x half>) {
; SOFT: @ %bb.0: @ %entry
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
-; SOFT-NEXT: vldr d16, [sp, #40]
-; SOFT-NEXT: mov r12, #16
-; SOFT-NEXT: vabs.f16 d16, d16
-; SOFT-NEXT: mov lr, sp
-; SOFT-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
+; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vabs.f16 q8, q8
-; SOFT-NEXT: str r3, [sp, #8]
-; SOFT-NEXT: vst1.64 {d16, d17}, [lr]
+; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
+; SOFT-NEXT: mov r12, sp
+; SOFT-NEXT: vldr d16, [sp, #40]
+; SOFT-NEXT: vabs.f16 d16, d16
+; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
+; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}
@@ -105,26 +105,26 @@ define void @test(double, float, i16, <4 x half>, <8 x half>) {
;
; SOFTEB-LABEL: test:
; SOFTEB: @ %bb.0: @ %entry
-; SOFTEB-NEXT: .save {r11, lr}
-; SOFTEB-NEXT: push {r11, lr}
+; SOFTEB-NEXT: .save {r4, lr}
+; SOFTEB-NEXT: push {r4, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
; SOFTEB-NEXT: vldr d16, [sp, #40]
-; SOFTEB-NEXT: mov r12, #16
; SOFTEB-NEXT: mov lr, sp
-; SOFTEB-NEXT: str r3, [sp, #8]
+; SOFTEB-NEXT: add r4, sp, #48
+; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 d16, d16
; SOFTEB-NEXT: vabs.f16 d16, d16
-; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12
-; SOFTEB-NEXT: add r12, sp, #48
-; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
+; SOFTEB-NEXT: vst1.16 {d16}, [lr:64]!
+; SOFTEB-NEXT: vld1.64 {d16, d17}, [r4]
; SOFTEB-NEXT: vrev64.16 q8, q8
+; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vabs.f16 q8, q8
; SOFTEB-NEXT: vrev64.16 q8, q8
-; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr]
+; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
-; SOFTEB-NEXT: pop {r11, pc}
+; SOFTEB-NEXT: pop {r4, pc}
;
; HARDEB-LABEL: test:
; HARDEB: @ %bb.0: @ %entry
@@ -148,20 +148,20 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
; SOFT-NEXT: add r12, sp, #80
-; SOFT-NEXT: mov lr, sp
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: add r12, sp, #64
-; SOFT-NEXT: str r3, [sp, #8]
; SOFT-NEXT: vadd.f16 q8, q8, q9
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
-; SOFT-NEXT: mov r12, #16
+; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vmul.f16 q8, q9, q8
-; SOFT-NEXT: vldr d18, [sp, #40]
-; SOFT-NEXT: vst1.16 {d18}, [lr:64], r12
-; SOFT-NEXT: vst1.64 {d16, d17}, [lr]
+; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
+; SOFT-NEXT: mov r12, sp
+; SOFT-NEXT: vldr d16, [sp, #40]
+; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
+; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}
@@ -181,13 +181,8 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFTEB-NEXT: push {r11, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
-; SOFTEB-NEXT: vldr d16, [sp, #40]
-; SOFTEB-NEXT: mov r12, #16
-; SOFTEB-NEXT: mov lr, sp
-; SOFTEB-NEXT: str r3, [sp, #8]
-; SOFTEB-NEXT: vrev64.16 d16, d16
-; SOFTEB-NEXT: vst1.16 {d16}, [lr:64], r12
; SOFTEB-NEXT: add r12, sp, #80
+; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: add r12, sp, #48
; SOFTEB-NEXT: vrev64.16 q8, q8
@@ -197,10 +192,15 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vadd.f16 q8, q8, q9
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
+; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vmul.f16 q8, q9, q8
+; SOFTEB-NEXT: vldr d18, [sp, #40]
+; SOFTEB-NEXT: vrev64.16 d18, d18
+; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]!
+; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vrev64.16 q8, q8
-; SOFTEB-NEXT: vst1.64 {d16, d17}, [lr]
+; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
; SOFTEB-NEXT: pop {r11, pc}
diff --git a/llvm/test/CodeGen/ARM/large-vector.ll b/llvm/test/CodeGen/ARM/large-vector.ll
index 3c351aa3a3a5b..f587a96a48b85 100644
--- a/llvm/test/CodeGen/ARM/large-vector.ll
+++ b/llvm/test/CodeGen/ARM/large-vector.ll
@@ -26,20 +26,18 @@ define <32 x i8> @test_consume_arg([9 x double], <32 x i8> %vec) {
define void @test_produce_arg() {
; CHECK-LABEL: test_produce_arg:
-; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #32
-; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]
; CHECK-V7K: add r[[BASE:[0-9]+]], sp, #16
+; CHECK-V7K: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]!
; CHECK-V7K: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]:128]
-; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #24
-; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
; CHECK-AAPCS: add r[[BASE:[0-9]+]], sp, #8
+; CHECK-AAPCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-AAPCS: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
-; CHECK-APCS: add r[[BASE:[0-9]+]], sp, #60
-; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
+; CHECK-APCS: mov r[[R4:[0-9]+]], sp
; CHECK-APCS: mov r[[BASE:[0-9]+]], sp
-; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #76
+; CHECK-APCS: str {{r[0-9]+}}, [r[[BASE]]], #60
+; CHECK-APCS: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]!
; CHECK-APCS: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r[[BASE]]]
call <32 x i8> @test_consume_arg([9 x double] undef, <32 x i8> zeroinitializer)
diff --git a/llvm/test/CodeGen/ARM/memcpy-inline.ll b/llvm/test/CodeGen/ARM/memcpy-inline.ll
index 8dab9b67d5593..4e6627cd7999a 100644
--- a/llvm/test/CodeGen/ARM/memcpy-inline.ll
+++ b/llvm/test/CodeGen/ARM/memcpy-inline.ll
@@ -44,11 +44,10 @@ entry:
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
-; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
-; CHECK: movs [[INC:r[0-9]+]], #32
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
; CHECK: movw [[REG2:r[0-9]+]], #16716
; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0]
diff --git a/llvm/test/CodeGen/ARM/memset-align.ll b/llvm/test/CodeGen/ARM/memset-align.ll
index b8f42f8de9cfd..cdd01477c0d1f 100644
--- a/llvm/test/CodeGen/ARM/memset-align.ll
+++ b/llvm/test/CodeGen/ARM/memset-align.ll
@@ -10,18 +10,17 @@ define void @test() {
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: vmov.i32 q8, #0x0
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: mov.w r1, #-1
-; CHECK-NEXT: vmov.i32 q8, #0x0
-; CHECK-NEXT: movs r2, #15
-; CHECK-NEXT: mov r3, r0
+; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: strd r1, r1, [sp, #8]
; CHECK-NEXT: strd r1, r1, [sp]
-; CHECK-NEXT: str r1, [sp, #16]
-; CHECK-NEXT: vst1.64 {d16, d17}, [r3], r2
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: str r2, [r3]
+; CHECK-NEXT: vst1.64 {d16, d17}, [r2]!
+; CHECK-NEXT: str r1, [r2]
; CHECK-NEXT: str r1, [sp, #20]
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: str.w r1, [sp, #15]
; CHECK-NEXT: bl callee
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
index 92f59f22e88ea..addd698d786ed 100644
--- a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
+++ b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
@@ -76,13 +76,14 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
@@ -93,8 +94,6 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
-
; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
}
@@ -170,13 +169,14 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
@@ -187,7 +187,6 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
}
diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll
index 7512323bb56ae..396d74d4040a7 100644
--- a/llvm/test/CodeGen/ARM/vector-load.ll
+++ b/llvm/test/CodeGen/ARM/vector-load.ll
@@ -253,9 +253,8 @@ define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
}
; CHECK-LABEL: test_silly_load:
-; CHECK: vldr d{{[0-9]+}}, [r0, #16]
-; CHECK: movs r1, #24
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
+; CHECK: vld1.8 {d{{[0-9]+}}}, [r0:64]!
; CHECK: ldr {{r[0-9]+}}, [r0]
define void @test_silly_load(<28 x i8>* %addr) {
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index 554588fcc8e93..5ff04ef0dd196 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -216,15 +216,14 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
; CHECK-LABEL: test_multisource:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d18, [r0, #32]
-; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: vorr d22, d18, d18
-; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
-; CHECK-NEXT: vldr d19, [r0, #48]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
-; CHECK-NEXT: vzip.16 d22, d19
-; CHECK-NEXT: vtrn.16 q8, q10
-; CHECK-NEXT: vext.16 d18, d18, d22, #2
+; CHECK-NEXT: vld1.16 {d16, d17}, [r0:128]!
+; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vld1.16 {d20, d21}, [r0:128]!
+; CHECK-NEXT: vorr d23, d20, d20
+; CHECK-NEXT: vldr d22, [r0]
+; CHECK-NEXT: vzip.16 d23, d22
+; CHECK-NEXT: vtrn.16 q8, q9
+; CHECK-NEXT: vext.16 d18, d20, d23, #2
; CHECK-NEXT: vext.16 d16, d18, d16, #2
; CHECK-NEXT: vext.16 d16, d16, d16, #2
; CHECK-NEXT: vmov r0, r1, d16
diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
index db303e6e6c222..11e7f86a06071 100644
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -134,106 +134,97 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
%T1_19* %blend, %T0_19* %storeaddr) {
; CHECK-LABEL: func_blend19:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT: add r2, r1, #48
-; CHECK-NEXT: mov r8, #0
-; CHECK-NEXT: vld1.64 {d16, d17}, [r2:128]
-; CHECK-NEXT: add r2, r0, #48
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vld1.64 {d28, d29}, [r1:128]!
; CHECK-NEXT: mov lr, #0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEXT: vmov r2, r12, d16
-; CHECK-NEXT: vmov r6, r7, d17
-; CHECK-NEXT: vmov r4, r5, d18
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: sbcs r2, r5, r12
+; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]!
+; CHECK-NEXT: vld1.64 {d24, d25}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]!
+; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT: vmov r0, r12, d16
+; CHECK-NEXT: vmov r1, r2, d18
+; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: vmov r1, r4, d25
+; CHECK-NEXT: sbcs r0, r2, r12
; CHECK-NEXT: mov r12, #0
-; CHECK-NEXT: vmov r2, r4, d19
+; CHECK-NEXT: vmov r2, r0, d21
; CHECK-NEXT: movlt r12, #1
; CHECK-NEXT: cmp r12, #0
-; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mvnne r12, #0
-; CHECK-NEXT: vld1.64 {d24, d25}, [r5:128]!
-; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128]
-; CHECK-NEXT: subs r2, r2, r6
-; CHECK-NEXT: mov r2, r0
-; CHECK-NEXT: add r0, r0, #32
-; CHECK-NEXT: vld1.64 {d26, d27}, [r2:128]!
-; CHECK-NEXT: vld1.64 {d22, d23}, [r2:128]
-; CHECK-NEXT: sbcs r2, r4, r7
-; CHECK-NEXT: vmov r4, r5, d21
-; CHECK-NEXT: movlt r8, #1
-; CHECK-NEXT: vmov r6, r7, d23
-; CHECK-NEXT: cmp r8, #0
-; CHECK-NEXT: mvnne r8, #0
-; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128]
-; CHECK-NEXT: add r0, r1, #32
-; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128]
-; CHECK-NEXT: vmov r0, r1, d20
-; CHECK-NEXT: vdup.32 d7, r8
-; CHECK-NEXT: vdup.32 d6, r12
-; CHECK-NEXT: subs r4, r6, r4
-; CHECK-NEXT: sbcs r4, r7, r5
-; CHECK-NEXT: vmov r5, r6, d24
-; CHECK-NEXT: vmov r7, r2, d26
-; CHECK-NEXT: mov r4, #0
-; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d5, r4
-; CHECK-NEXT: subs r5, r7, r5
-; CHECK-NEXT: sbcs r2, r2, r6
-; CHECK-NEXT: vmov r7, r6, d27
-; CHECK-NEXT: vmov r2, r9, d25
-; CHECK-NEXT: mov r5, #0
-; CHECK-NEXT: movlt r5, #1
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: mvnne r5, #0
-; CHECK-NEXT: subs r2, r7, r2
-; CHECK-NEXT: sbcs r2, r6, r9
-; CHECK-NEXT: vmov r6, r7, d22
+; CHECK-NEXT: subs r1, r1, r2
+; CHECK-NEXT: sbcs r0, r4, r0
+; CHECK-NEXT: vmov r2, r4, d26
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d1, r0
+; CHECK-NEXT: vmov r0, r1, d22
+; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: sbcs r0, r4, r1
+; CHECK-NEXT: vmov r4, r5, d31
+; CHECK-NEXT: vmov r0, r1, d29
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d1, r2
-; CHECK-NEXT: vdup.32 d0, r5
-; CHECK-NEXT: vbit q12, q13, q0
-; CHECK-NEXT: subs r0, r6, r0
-; CHECK-NEXT: vmov r2, r6, d28
-; CHECK-NEXT: sbcs r0, r7, r1
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: vmov r0, r1, d30
-; CHECK-NEXT: movlt r7, #1
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: vmov r2, r5, d29
-; CHECK-NEXT: sbcs r0, r6, r1
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r1
+; CHECK-NEXT: vmov r4, r5, d30
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d3, r0
+; CHECK-NEXT: vmov r0, r1, d28
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r1
+; CHECK-NEXT: vmov r4, r5, d24
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d2, r0
+; CHECK-NEXT: vmov r0, r1, d20
+; CHECK-NEXT: vbit q14, q15, q1
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r1
+; CHECK-NEXT: vmov r1, r4, d17
+; CHECK-NEXT: vmov r5, r6, d19
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d0, r0
+; CHECK-NEXT: vbit q10, q12, q0
+; CHECK-NEXT: subs r1, r5, r1
+; CHECK-NEXT: sbcs r1, r6, r4
+; CHECK-NEXT: vmov r4, r5, d27
+; CHECK-NEXT: vmov r0, r1, d23
; CHECK-NEXT: mov r6, #0
-; CHECK-NEXT: vmov r0, r1, d31
; CHECK-NEXT: movlt r6, #1
-; CHECK-NEXT: subs r0, r2, r0
+; CHECK-NEXT: subs r0, r4, r0
; CHECK-NEXT: sbcs r0, r5, r1
; CHECK-NEXT: movlt lr, #1
; CHECK-NEXT: cmp lr, #0
; CHECK-NEXT: mvnne lr, #0
; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: vdup.32 d31, lr
; CHECK-NEXT: mvnne r6, #0
-; CHECK-NEXT: vdup.32 d3, lr
-; CHECK-NEXT: vdup.32 d2, r6
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: vorr q13, q1, q1
-; CHECK-NEXT: mvnne r7, #0
-; CHECK-NEXT: vdup.32 d4, r7
-; CHECK-NEXT: add r0, r3, #32
-; CHECK-NEXT: vbsl q13, q14, q15
-; CHECK-NEXT: vbit q10, q11, q2
-; CHECK-NEXT: vbit q8, q9, q3
-; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]
-; CHECK-NEXT: add r0, r3, #48
-; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]!
-; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
-; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: vdup.32 d30, r2
+; CHECK-NEXT: vdup.32 d3, r6
+; CHECK-NEXT: vbit q11, q13, q15
+; CHECK-NEXT: vdup.32 d2, r12
+; CHECK-NEXT: vst1.64 {d28, d29}, [r3:128]!
+; CHECK-NEXT: vbit q8, q9, q1
+; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]
+; CHECK-NEXT: pop {r4, r5, r6, lr}
; CHECK-NEXT: mov pc, lr
%v0 = load %T0_19, %T0_19* %loadaddr
%v1 = load %T0_19, %T0_19* %loadaddr2
@@ -251,213 +242,198 @@ define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
%T1_20* %blend, %T0_20* %storeaddr) {
; CHECK-LABEL: func_blend20:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, sp, #4
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, sp, #8
-; CHECK-NEXT: add r9, r1, #64
-; CHECK-NEXT: mov r2, #32
-; CHECK-NEXT: add r8, r0, #64
-; CHECK-NEXT: vld1.64 {d16, d17}, [r9:128], r2
-; CHECK-NEXT: mov r10, r1
-; CHECK-NEXT: mov r11, r0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128], r2
-; CHECK-NEXT: vmov r7, r5, d17
-; CHECK-NEXT: vmov r6, r2, d19
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vld1.64 {d22, d23}, [r10:128]!
-; CHECK-NEXT: subs r7, r6, r7
-; CHECK-NEXT: sbcs r2, r2, r5
-; CHECK-NEXT: vmov r5, r6, d16
-; CHECK-NEXT: vmov r7, r4, d18
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d21, r2
-; CHECK-NEXT: subs r5, r7, r5
-; CHECK-NEXT: sbcs r4, r4, r6
-; CHECK-NEXT: mov r4, #0
-; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d20, r4
-; CHECK-NEXT: vmov r2, r4, d23
-; CHECK-NEXT: vbit q8, q9, q10
-; CHECK-NEXT: vld1.64 {d18, d19}, [r11:128]!
-; CHECK-NEXT: vmov r7, r5, d19
-; CHECK-NEXT: subs r2, r7, r2
-; CHECK-NEXT: sbcs r2, r5, r4
-; CHECK-NEXT: vmov r5, r7, d18
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d21, r2
-; CHECK-NEXT: vmov r2, r4, d22
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs r2, r7, r4
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d20, r2
-; CHECK-NEXT: add r2, r0, #48
-; CHECK-NEXT: vbif q9, q11, q10
-; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128]
-; CHECK-NEXT: add r2, r1, #48
-; CHECK-NEXT: vld1.64 {d2, d3}, [r2:128]
-; CHECK-NEXT: vmov r5, r7, d30
-; CHECK-NEXT: vmov r2, r4, d2
-; CHECK-NEXT: vld1.64 {d26, d27}, [r11:128]
-; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]
-; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
-; CHECK-NEXT: vld1.64 {d22, d23}, [r9:128]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r8:128]!
-; CHECK-NEXT: vmov r11, r10, d21
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs r2, r7, r4
-; CHECK-NEXT: vmov r7, r6, d31
-; CHECK-NEXT: vmov r2, r5, d3
-; CHECK-NEXT: mov r4, #0
-; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: subs r2, r7, r2
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: sbcs r2, r6, r5
-; CHECK-NEXT: vmov r6, r5, d27
-; CHECK-NEXT: vmov r2, r9, d1
-; CHECK-NEXT: movlt r7, #1
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: mvnne r7, #0
-; CHECK-NEXT: vdup.32 d7, r7
-; CHECK-NEXT: vdup.32 d6, r4
-; CHECK-NEXT: subs r2, r6, r2
-; CHECK-NEXT: sbcs r2, r5, r9
-; CHECK-NEXT: vmov r6, r5, d26
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d5, r2
-; CHECK-NEXT: vmov r2, r9, d0
-; CHECK-NEXT: subs r2, r6, r2
-; CHECK-NEXT: sbcs r2, r5, r9
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d4, r2
-; CHECK-NEXT: add r2, r1, #32
-; CHECK-NEXT: vld1.64 {d28, d29}, [r2:128]
-; CHECK-NEXT: add r2, r0, #32
-; CHECK-NEXT: vbif q13, q0, q2
-; CHECK-NEXT: add r1, r1, #80
-; CHECK-NEXT: vld1.64 {d0, d1}, [r2:128]
-; CHECK-NEXT: vmov r4, r5, d28
-; CHECK-NEXT: vbif q15, q1, q3
-; CHECK-NEXT: add r0, r0, #80
-; CHECK-NEXT: vmov r2, r6, d0
-; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128]
-; CHECK-NEXT: vmov r9, r8, d25
-; CHECK-NEXT: vld1.64 {d8, d9}, [r0:128]
-; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128]
-; CHECK-NEXT: vmov r3, r12, d8
-; CHECK-NEXT: subs r2, r2, r4
-; CHECK-NEXT: sbcs r2, r6, r5
-; CHECK-NEXT: vmov r4, r5, d29
-; CHECK-NEXT: vmov r6, r7, d1
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movlt r2, #1
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: subs r4, r6, r4
-; CHECK-NEXT: sbcs r4, r7, r5
-; CHECK-NEXT: vmov r5, r6, d2
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: mov lr, r0
+; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]!
+; CHECK-NEXT: add r9, r0, #64
+; CHECK-NEXT: add r10, r1, #64
+; CHECK-NEXT: mov r12, #0
+; CHECK-NEXT: vld1.64 {d22, d23}, [lr:128]!
+; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128]!
+; CHECK-NEXT: vld1.64 {d20, d21}, [lr:128]!
+; CHECK-NEXT: vmov r6, r4, d19
+; CHECK-NEXT: vmov r5, r7, d21
+; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]!
+; CHECK-NEXT: vld1.64 {d6, d7}, [r10:128]!
+; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]!
+; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]!
+; CHECK-NEXT: subs r6, r5, r6
+; CHECK-NEXT: sbcs r4, r7, r4
+; CHECK-NEXT: vmov r5, r6, d18
+; CHECK-NEXT: vmov r7, r2, d20
; CHECK-NEXT: mov r4, #0
; CHECK-NEXT: movlt r4, #1
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d5, r4
-; CHECK-NEXT: vdup.32 d4, r2
-; CHECK-NEXT: vmov r2, r4, d22
-; CHECK-NEXT: vbit q14, q0, q2
-; CHECK-NEXT: subs r2, r5, r2
-; CHECK-NEXT: sbcs r2, r6, r4
-; CHECK-NEXT: vmov r4, r5, d24
-; CHECK-NEXT: vmov r6, r7, d20
+; CHECK-NEXT: vdup.32 d31, r4
+; CHECK-NEXT: subs r5, r7, r5
+; CHECK-NEXT: sbcs r2, r2, r6
+; CHECK-NEXT: vmov r4, r5, d3
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: subs r1, r6, r4
-; CHECK-NEXT: vmov r0, r6, d9
-; CHECK-NEXT: sbcs r1, r7, r5
-; CHECK-NEXT: vmov r4, r5, d7
-; CHECK-NEXT: mov r1, #0
-; CHECK-NEXT: movlt r1, #1
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: mvnne r1, #0
-; CHECK-NEXT: subs r0, r0, r4
-; CHECK-NEXT: vmov r7, r4, d23
-; CHECK-NEXT: sbcs r0, r6, r5
-; CHECK-NEXT: vmov r5, lr, d6
+; CHECK-NEXT: vdup.32 d30, r2
+; CHECK-NEXT: vmov r0, r2, d1
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d2
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d9, r0
+; CHECK-NEXT: vmov r0, r2, d0
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d5
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d8, r0
+; CHECK-NEXT: vmov r0, r2, d7
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d4
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: vdup.32 d11, r0
-; CHECK-NEXT: vmov r0, r6, d3
-; CHECK-NEXT: subs r0, r0, r7
-; CHECK-NEXT: sbcs r0, r6, r4
+; CHECK-NEXT: vmov r0, r2, d6
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d23
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d10, r0
+; CHECK-NEXT: vmov r0, r2, d17
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r4, r5, d22
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d25, r0
+; CHECK-NEXT: vmov r0, r2, d16
+; CHECK-NEXT: subs r0, r4, r0
+; CHECK-NEXT: sbcs r0, r5, r2
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movlt r0, #1
-; CHECK-NEXT: subs r4, r11, r9
-; CHECK-NEXT: sbcs r4, r10, r8
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vdup.32 d24, r0
+; CHECK-NEXT: vorr q13, q12, q12
+; CHECK-NEXT: vbsl q13, q11, q8
+; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]!
+; CHECK-NEXT: vorr q8, q5, q5
+; CHECK-NEXT: vld1.64 {d28, d29}, [r10:128]!
+; CHECK-NEXT: vbsl q8, q2, q3
+; CHECK-NEXT: vld1.64 {d6, d7}, [r8:128]!
+; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128]
+; CHECK-NEXT: vld1.64 {d4, d5}, [lr:128]!
+; CHECK-NEXT: vbif q10, q9, q15
+; CHECK-NEXT: vorr q9, q4, q4
+; CHECK-NEXT: vmov r0, r2, d22
+; CHECK-NEXT: vbsl q9, q1, q0
+; CHECK-NEXT: vld1.64 {d30, d31}, [lr:128]
+; CHECK-NEXT: mov lr, #0
+; CHECK-NEXT: vmov r7, r5, d30
+; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128]
+; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128]
+; CHECK-NEXT: subs r0, r7, r0
+; CHECK-NEXT: sbcs r0, r5, r2
+; CHECK-NEXT: vmov r5, r4, d24
+; CHECK-NEXT: vmov r0, r7, d28
+; CHECK-NEXT: movlt lr, #1
+; CHECK-NEXT: cmp lr, #0
+; CHECK-NEXT: mvnne lr, #0
+; CHECK-NEXT: subs r0, r5, r0
+; CHECK-NEXT: sbcs r0, r4, r7
+; CHECK-NEXT: vmov r7, r5, d29
+; CHECK-NEXT: vmov r4, r6, d25
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: subs r7, r4, r7
; CHECK-NEXT: mov r4, #0
+; CHECK-NEXT: sbcs r7, r6, r5
+; CHECK-NEXT: vmov r5, r1, d31
+; CHECK-NEXT: vmov r7, r6, d23
; CHECK-NEXT: movlt r4, #1
-; CHECK-NEXT: subs r3, r3, r5
-; CHECK-NEXT: sbcs r3, r12, lr
-; CHECK-NEXT: mov r3, #0
-; CHECK-NEXT: movlt r3, #1
-; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: mvnne r3, #0
; CHECK-NEXT: cmp r4, #0
; CHECK-NEXT: mvnne r4, #0
-; CHECK-NEXT: vdup.32 d10, r3
-; CHECK-NEXT: vdup.32 d1, r4
-; CHECK-NEXT: vorr q2, q5, q5
-; CHECK-NEXT: vdup.32 d0, r1
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vbsl q2, q4, q3
-; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vbif q10, q12, q0
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vdup.32 d7, r0
-; CHECK-NEXT: add r0, r1, #80
-; CHECK-NEXT: vdup.32 d6, r2
-; CHECK-NEXT: vbit q11, q1, q3
-; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]
-; CHECK-NEXT: add r0, r1, #32
-; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128]
-; CHECK-NEXT: add r0, r1, #48
-; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128]
-; CHECK-NEXT: add r0, r1, #64
-; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128]!
-; CHECK-NEXT: vst1.64 {d26, d27}, [r1:128]
-; CHECK-NEXT: mov r1, #32
-; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1
+; CHECK-NEXT: subs r7, r5, r7
+; CHECK-NEXT: mov r5, #0
+; CHECK-NEXT: sbcs r1, r1, r6
+; CHECK-NEXT: vmov r6, r2, d5
+; CHECK-NEXT: vmov r1, r7, d7
+; CHECK-NEXT: movlt r5, #1
+; CHECK-NEXT: cmp r5, #0
+; CHECK-NEXT: mvnne r5, #0
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: sbcs r1, r2, r7
+; CHECK-NEXT: vmov r6, r7, d4
+; CHECK-NEXT: mov r1, #0
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: mvnne r1, #0
+; CHECK-NEXT: vdup.32 d9, r1
+; CHECK-NEXT: vmov r1, r2, d6
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: sbcs r1, r7, r2
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: mov r1, #0
+; CHECK-NEXT: movlt r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: mvnne r1, #0
+; CHECK-NEXT: vdup.32 d8, r1
+; CHECK-NEXT: vmov r1, r2, d2
+; CHECK-NEXT: vbif q2, q3, q4
+; CHECK-NEXT: vdup.32 d7, r5
+; CHECK-NEXT: vdup.32 d9, r4
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vdup.32 d8, r0
+; CHECK-NEXT: mov r0, r3
+; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]!
+; CHECK-NEXT: vbif q12, q14, q4
+; CHECK-NEXT: vdup.32 d6, lr
+; CHECK-NEXT: vbit q11, q15, q3
; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]!
+; CHECK-NEXT: subs r1, r6, r1
+; CHECK-NEXT: mov r6, #0
+; CHECK-NEXT: sbcs r1, r7, r2
+; CHECK-NEXT: vmov r1, r2, d3
+; CHECK-NEXT: movlt r6, #1
+; CHECK-NEXT: subs r1, r4, r1
+; CHECK-NEXT: sbcs r1, r5, r2
+; CHECK-NEXT: movlt r12, #1
+; CHECK-NEXT: cmp r12, #0
+; CHECK-NEXT: mvnne r12, #0
+; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: vdup.32 d27, r12
+; CHECK-NEXT: mvnne r6, #0
+; CHECK-NEXT: vdup.32 d26, r6
+; CHECK-NEXT: vorr q10, q13, q13
+; CHECK-NEXT: vbsl q10, q0, q1
+; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]!
; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128]
-; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: add r0, r3, #64
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: mov pc, lr
%v0 = load %T0_20, %T0_20* %loadaddr
%v1 = load %T0_20, %T0_20* %loadaddr2
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
index dcb27d58efef5..8a3f6ffd1bb0e 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -198,21 +198,13 @@ for.end: ; preds = %for.body
; @testNeon is an important example of the need for ivchains.
;
-; Currently we have two extra add.w's that keep the store address
-; live past the next increment because ISEL is unfortunately undoing
-; the store chain. ISEL also fails to convert all but one of the stores to
-; post-increment addressing. However, the loads should use
-; post-increment addressing, no add's or add.w's beyond the three
-; mentioned. Most importantly, there should be no spills or reloads!
+; Loads and stores should use post-increment addressing, no add's or add.w's.
+; Most importantly, there should be no spills or reloads!
;
; A9: testNeon:
; A9: %.lr.ph
-; A9: add.w r
; A9-NOT: lsl.w
; A9-NOT: {{ldr|str|adds|add r}}
-; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
-; A9: add.w r
-; A9-NOT: {{ldr|str|adds|add r}}
; A9-NOT: add.w r
; A9: bne
define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {
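As a side note on the addressing forms the updated checks rely on: NEON VLD1/VST1 can write back the base register either by the transfer size ("[Rn]!") or by a separate register ("[Rn], Rm"). The new expectations use the fixed-writeback form, so no scratch register has to be materialized to hold the stride. A minimal illustrative sketch of the two forms (hypothetical registers, not taken from any test in this patch):

vld1.8  {d16, d17}, [r0]!       @ load 16 bytes, then r0 += 16 (writeback by transfer size)
vld1.8  {d16, d17}, [r0], r1    @ load 16 bytes, then r0 += r1 (writeback by register)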