[llvm] 532d05b - [ARM] Attempt to distribute reductions
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 30 06:48:43 PDT 2021
Author: David Green
Date: 2021-07-30T14:48:31+01:00
New Revision: 532d05b714b3f64603be53398571d49a6f4b2f92
URL: https://github.com/llvm/llvm-project/commit/532d05b714b3f64603be53398571d49a6f4b2f92
DIFF: https://github.com/llvm/llvm-project/commit/532d05b714b3f64603be53398571d49a6f4b2f92.diff
LOG: [ARM] Attempt to distribute reductions
This adds a combine for adds of reductions, distributing them so that
they occur sequentially to enable better use of accumulating VADDVA
instructions. It combines:
add(X, add(vecreduce(Y), vecreduce(Z))) ->
add(add(X, vecreduce(Y)), vecreduce(Z))
and
add(add(A, reduce(B)), add(C, reduce(D))) ->
add(add(add(A, C), reduce(B)), reduce(D))
Together these distribute the adds so that more of the reductions can be
selected as VADDVA.
Differential Revision: https://reviews.llvm.org/D106532
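
As an illustration (not part of this commit), here is a minimal IR sketch of
the shape the first combine looks for. The function name, the vector width,
and the use of the generic llvm.vector.reduce.add intrinsic are chosen only
for exposition; the in-tree tests below use the same intrinsic on wider types.

    ; add(X, add(vecreduce(Y), vecreduce(Z))): before the combine the two
    ; reductions feed a chain of scalar adds; after it, each reduction can be
    ; selected as an accumulating VADDVA into the running scalar total.
    define i32 @acc_two_reductions(i32 %x, <4 x i32> %y, <4 x i32> %z) {
    entry:
      %ry = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %y)
      %rz = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %z)
      %sum = add i32 %ry, %rz
      %res = add i32 %x, %sum   ; rewritten to add(add(%x, %ry), %rz)
      ret i32 %res
    }
    declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)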
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/Thumb2/mve-vaddv.ll
llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 021e33ff38cb..e81fa343ad21 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13066,11 +13066,67 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
- if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
+ if (!Subtarget->hasMVEIntegerOps())
return SDValue();
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ auto IsVecReduce = [](SDValue Op) {
+ switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_ADD:
+ case ARMISD::VADDVs:
+ case ARMISD::VADDVu:
+ case ARMISD::VMLAVs:
+ case ARMISD::VMLAVu:
+ return true;
+ }
+ return false;
+ };
+
+ auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
+ // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
+ // add(add(X, vecreduce(Y)), vecreduce(Z))
+ // to make better use of vaddva style instructions.
+ if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
+ IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1))) {
+ SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
+ return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
+ }
+ // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
+ // add(add(add(A, C), reduce(B)), reduce(D))
+ if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
+ N1.getOpcode() == ISD::ADD) {
+ unsigned N0RedOp = 0;
+ if (!IsVecReduce(N0.getOperand(N0RedOp))) {
+ N0RedOp = 1;
+ if (!IsVecReduce(N0.getOperand(N0RedOp)))
+ return SDValue();
+ }
+
+ unsigned N1RedOp = 0;
+ if (!IsVecReduce(N1.getOperand(N1RedOp)))
+ N1RedOp = 1;
+ if (!IsVecReduce(N1.getOperand(N1RedOp)))
+ return SDValue();
+
+ SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
+ N1.getOperand(1 - N1RedOp));
+ SDValue Add1 =
+ DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
+ return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
+ }
+ return SDValue();
+ };
+ if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
+ return R;
+ if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
+ return R;
+
+ if (VT != MVT::i64)
+ return SDValue();
// We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
// will look like:
@@ -13090,7 +13146,6 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
return SDValue();
- SDLoc dl(N);
if (VecRed->getOpcode() == OpcodeA) {
// add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
index 8d06f1998361..3e27955ab1f1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -117,9 +117,8 @@ entry:
define arm_aapcs_vfpcc i32 @vaddva_v8i32_i32(<8 x i32> %s1, i32 %x) {
; CHECK-LABEL: vaddva_v8i32_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vaddva.u32 r2, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vaddva.u32 r0, q1
; CHECK-NEXT: bx lr
entry:
%t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
@@ -141,9 +140,8 @@ entry:
define arm_aapcs_vfpcc i16 @vaddva_v16i16_i16(<16 x i16> %s1, i16 %x) {
; CHECK-LABEL: vaddva_v16i16_i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vaddv.u16 r2, q1
-; CHECK-NEXT: vaddva.u16 r2, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddva.u16 r0, q0
+; CHECK-NEXT: vaddva.u16 r0, q1
; CHECK-NEXT: bx lr
entry:
%t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
@@ -165,9 +163,8 @@ entry:
define arm_aapcs_vfpcc i8 @vaddva_v32i8_i8(<32 x i8> %s1, i8 %x) {
; CHECK-LABEL: vaddva_v32i8_i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vaddv.u8 r2, q1
-; CHECK-NEXT: vaddva.u8 r2, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddva.u8 r0, q0
+; CHECK-NEXT: vaddva.u8 r0, q1
; CHECK-NEXT: bx lr
entry:
%t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
index 94fb1326dd11..11188d4b47e2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
@@ -49,15 +49,15 @@ entry:
define i32 @addv16i32i32(i32* %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r0, q1
-; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddv.u32 r2, q1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <16 x i32>*
@@ -69,20 +69,19 @@ entry:
define i32 @addv24i32i32(i32* %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r0, q1
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <8 x i32>*
@@ -99,25 +98,23 @@ entry:
define i32 @addv32i32i32(i32* %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT: vaddva.u32 r2, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT: vaddva.u32 r2, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vldrw.u32 q0, [r1, #96]
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vldrw.u32 q0, [r1, #64]
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vldrw.u32 q0, [r1, #112]
+; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <32 x i32>*
@@ -129,45 +126,39 @@ entry:
define i32 @addv64i32i32(i32* %x) {
; CHECK-LABEL: addv64i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #240]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrw.u32 q1, [r0, #176]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #208]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u32 r12, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #144]
-; CHECK-NEXT: vaddva.u32 r12, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #224]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
-; CHECK-NEXT: add r2, r12
-; CHECK-NEXT: vaddv.u32 r12, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #160]
-; CHECK-NEXT: vaddva.u32 r12, q0
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: add r1, r2
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #192]
; CHECK-NEXT: vaddva.u32 r2, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT: add.w r3, r2, r12
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #128]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r0, q1
-; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: add r0, r3
-; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <64 x i32>*
@@ -455,15 +446,15 @@ entry:
define i32 @addv16i32i16(i16* %x) {
; CHECK-LABEL: addv16i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q1, [r0, #24]
-; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
-; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r0, q1
-; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddv.u32 r2, q1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <16 x i16>*
@@ -476,16 +467,15 @@ entry:
define i32 @addv24i32i16(i16* %x) {
; CHECK-LABEL: addv24i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q1, [r0, #24]
-; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
-; CHECK-NEXT: add r2, r12
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
@@ -506,25 +496,23 @@ entry:
define i32 @addv32i32i16(i16* %x) {
; CHECK-LABEL: addv32i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q1, [r0, #56]
-; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrh.s32 q1, [r0, #40]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrh.s32 q1, [r0, #48]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrh.s32 q1, [r0, #32]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r0, q1
-; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <32 x i16>*
@@ -537,39 +525,34 @@ entry:
define i32 @addv64i32i16(i16* %x) {
; CHECK-LABEL: addv64i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q1, [r0, #56]
-; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT: ldrsh.w r3, [r0, #122]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrh.s32 q1, [r0, #40]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: ldrsh.w r1, [r0, #120]
; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrh.s32 q1, [r0, #48]
+; CHECK-NEXT: ldrsh.w r3, [r0, #122]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
+; CHECK-NEXT: ldrsh.w r12, [r0, #124]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u32 r12, q1
-; CHECK-NEXT: vldrh.s32 q1, [r0, #32]
-; CHECK-NEXT: vaddva.u32 r12, q0
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrh.s32 q1, [r0, #88]
; CHECK-NEXT: vaddva.u32 r2, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #72]
-; CHECK-NEXT: add r2, r12
-; CHECK-NEXT: vaddv.u32 r12, q1
-; CHECK-NEXT: vldrh.s32 q1, [r0, #80]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #64]
-; CHECK-NEXT: add r1, r2
-; CHECK-NEXT: vaddv.u32 r2, q1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #80]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #72]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #88]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
-; CHECK-NEXT: add r2, r12
-; CHECK-NEXT: ldrsh.w r12, [r0, #124]
-; CHECK-NEXT: add r2, r1
-; CHECK-NEXT: ldrsh.w r1, [r0, #120]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #112]
; CHECK-NEXT: ldrsh.w r0, [r0, #126]
@@ -840,25 +823,23 @@ entry:
define i32 @addv32i32i8(i8* %x) {
; CHECK-LABEL: addv32i32i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q1, [r0, #28]
-; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrb.u32 q1, [r0, #20]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrb.u32 q1, [r0, #24]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrb.u32 q1, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r0, q1
-; CHECK-NEXT: vaddva.u32 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i8* %x to <32 x i8>*
@@ -871,29 +852,26 @@ entry:
define i32 @addv64i32i8(i8* %x) {
; CHECK-LABEL: addv64i32i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q1, [r0, #28]
-; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
-; CHECK-NEXT: ldrb.w r3, [r0, #61]
-; CHECK-NEXT: vaddv.u32 r12, q1
; CHECK-NEXT: vldrb.u32 q1, [r0, #20]
-; CHECK-NEXT: vaddva.u32 r12, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT: ldrb.w r1, [r0, #60]
; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vldrb.u32 q1, [r0, #24]
+; CHECK-NEXT: ldrb.w r3, [r0, #61]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
+; CHECK-NEXT: ldrb.w r12, [r0, #62]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u32 r12, q1
-; CHECK-NEXT: vldrb.u32 q1, [r0, #16]
-; CHECK-NEXT: vaddva.u32 r12, q0
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
+; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0]
-; CHECK-NEXT: vaddv.u32 r2, q1
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
-; CHECK-NEXT: add r2, r12
-; CHECK-NEXT: ldrb.w r12, [r0, #62]
-; CHECK-NEXT: add r2, r1
-; CHECK-NEXT: ldrb.w r1, [r0, #60]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
@@ -1104,16 +1082,15 @@ entry:
define signext i16 @addv32i16i16(i16* %x) {
; CHECK-LABEL: addv32i16i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q1, [r0, #48]
-; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
-; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vldrh.u16 q1, [r0, #32]
-; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0]
-; CHECK-NEXT: vaddv.u16 r0, q1
-; CHECK-NEXT: vaddva.u16 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: sxth r0, r0
+; CHECK-NEXT: vaddv.u16 r2, q1
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <32 x i16>*
@@ -1125,26 +1102,23 @@ entry:
define signext i16 @addv64i16i16(i16* %x) {
; CHECK-LABEL: addv64i16i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q1, [r0, #112]
-; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
-; CHECK-NEXT: vaddv.u16 r12, q1
; CHECK-NEXT: vldrh.u16 q1, [r0, #80]
-; CHECK-NEXT: vaddva.u16 r12, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
-; CHECK-NEXT: vldrh.u16 q1, [r0, #96]
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
-; CHECK-NEXT: add.w r1, r2, r12
-; CHECK-NEXT: vaddv.u16 r2, q1
-; CHECK-NEXT: vldrh.u16 q1, [r0, #64]
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0]
-; CHECK-NEXT: vaddv.u16 r0, q1
-; CHECK-NEXT: vaddva.u16 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: sxth r0, r0
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
+; CHECK-NEXT: vaddva.u16 r2, q0
+; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <64 x i16>*
@@ -1370,16 +1344,15 @@ entry:
define zeroext i8 @addv64i8i8(i8* %x) {
; CHECK-LABEL: addv64i8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q1, [r0, #48]
-; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
-; CHECK-NEXT: vaddv.u8 r2, q1
; CHECK-NEXT: vldrb.u8 q1, [r0, #32]
-; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0]
-; CHECK-NEXT: vaddv.u8 r0, q1
-; CHECK-NEXT: vaddva.u8 r0, q0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: uxtb r0, r0
+; CHECK-NEXT: vaddv.u8 r2, q1
+; CHECK-NEXT: vaddva.u8 r2, q0
+; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT: vaddva.u8 r2, q0
+; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
+; CHECK-NEXT: vaddva.u8 r2, q0
+; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i8* %x to <64 x i8>*
@@ -1515,19 +1488,19 @@ entry:
define i32 @mlav16i32i32(i32* %x, i32* %y) {
; CHECK-LABEL: mlav16i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: add.w r0, r2, r12
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT: vmlava.u32 r2, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT: vmlava.u32 r2, q1, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <16 x i32>*
@@ -1542,26 +1515,25 @@ entry:
define i32 @mlav24i32i32(i32* %x, i32* %y) {
; CHECK-LABEL: mlav24i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vmlav.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: add.w r3, r2, r12
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: adds r0, r3, r2
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <8 x i32>*
@@ -1585,33 +1557,31 @@ entry:
define i32 @mlav32i32i32(i32* %x, i32* %y) {
; CHECK-LABEL: mlav32i32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vmlav.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
-; CHECK-NEXT: add.w r3, r2, r12
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: add.w r0, r2, r12
-; CHECK-NEXT: add r0, r3
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i32* %x to <32 x i32>*
@@ -2279,19 +2249,19 @@ entry:
define i32 @mlav16i32i16(i16* %x, i16* %y) {
; CHECK-LABEL: mlav16i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: add.w r0, r2, r12
+; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vmlava.u32 r2, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT: vmlava.u32 r2, q1, q0
+; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <16 x i16>*
@@ -2308,23 +2278,22 @@ entry:
define i32 @mlav24i32i16(i16* %x, i16* %y) {
; CHECK-LABEL: mlav24i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vmlav.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: vldrh.u16 q0, [r0]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vldrh.u16 q1, [r1]
-; CHECK-NEXT: add r2, r12
-; CHECK-NEXT: vmlava.s16 r2, q1, q0
-; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <8 x i16>*
@@ -2352,33 +2321,31 @@ entry:
define i32 @mlav32i32i16(i16* %x, i16* %y) {
; CHECK-LABEL: mlav32i32i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #56]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vmlav.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #48]
; CHECK-NEXT: vldrh.s32 q1, [r1, #48]
-; CHECK-NEXT: add.w r3, r2, r12
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
-; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrh.s32 q0, [r0]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2]
; CHECK-NEXT: vldrh.s32 q1, [r1]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: add.w r0, r2, r12
-; CHECK-NEXT: add r0, r3
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrh.s32 q0, [r2, #56]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #56]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <32 x i16>*
@@ -2861,33 +2828,31 @@ entry:
define i32 @mlav32i32i8(i8* %x, i8* %y) {
; CHECK-LABEL: mlav32i32i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #28]
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #12]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
; CHECK-NEXT: vldrb.u32 q1, [r1, #20]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vmlav.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2, #4]
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2, #24]
; CHECK-NEXT: vldrb.u32 q1, [r1, #24]
-; CHECK-NEXT: add.w r3, r2, r12
-; CHECK-NEXT: vmlav.u32 r12, q1, q0
-; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2, #8]
; CHECK-NEXT: vldrb.u32 q1, [r1, #8]
-; CHECK-NEXT: vmlava.u32 r12, q1, q0
-; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #16]
-; CHECK-NEXT: vmlav.u32 r2, q1, q0
-; CHECK-NEXT: vldrb.u32 q0, [r0]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2, #12]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #12]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2]
; CHECK-NEXT: vldrb.u32 q1, [r1]
-; CHECK-NEXT: vmlava.u32 r2, q1, q0
-; CHECK-NEXT: add.w r0, r2, r12
-; CHECK-NEXT: add r0, r3
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2, #16]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #16]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
+; CHECK-NEXT: vldrb.u32 q0, [r2, #28]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #28]
+; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i8* %x to <32 x i8>*
@@ -3198,20 +3163,19 @@ entry:
define signext i16 @mlav32i16i16(i16* %x, i16* %y) {
; CHECK-LABEL: mlav32i16i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
-; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
-; CHECK-NEXT: vmlav.u16 r12, q1, q0
-; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
-; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
-; CHECK-NEXT: vmlava.u16 r12, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlav.u16 r2, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlava.u16 r2, q1, q0
-; CHECK-NEXT: add.w r0, r2, r12
-; CHECK-NEXT: sxth r0, r0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
+; CHECK-NEXT: vmlava.u16 r2, q1, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
+; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
+; CHECK-NEXT: vmlava.u16 r2, q1, q0
+; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = bitcast i16* %x to <32 x i16>*