[llvm] 0f83d37 - [ARM] MVE vabd
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 26 11:41:43 PDT 2021
Author: David Green
Date: 2021-06-26T19:41:32+01:00
New Revision: 0f83d37a144ad4a9e530703d80e02e18b3b84c02
URL: https://github.com/llvm/llvm-project/commit/0f83d37a144ad4a9e530703d80e02e18b3b84c02
DIFF: https://github.com/llvm/llvm-project/commit/0f83d37a144ad4a9e530703d80e02e18b3b84c02.diff
LOG: [ARM] MVE vabd
LOG: [ARM] MVE vabd
This adds MVE lowering for VABDS/VABDU, using the code ported from
AArch64 in D91937.
Differential Revision: https://reviews.llvm.org/D91938
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMInstrMVE.td
llvm/test/CodeGen/Thumb2/mve-vabdus.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index f7c8066d687e0..6fa89b9068558 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -281,6 +281,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
+ setOperationAction(ISD::ABDS, VT, Legal);
+ setOperationAction(ISD::ABDU, VT, Legal);
// No native support for these.
setOperationAction(ISD::UDIV, VT, Expand);
@@ -14616,6 +14618,8 @@ static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
case ARMISD::VQDMULH:
case ISD::MULHS:
case ISD::MULHU:
+ case ISD::ABDS:
+ case ISD::ABDU:
break;
default:
return SDValue();
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 37e825388430a..372893814092e 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2131,36 +2131,31 @@ class MVE_VABD_int<string suffix, bit U, bits<2> size,
let validForTailPredication = 1;
}
-multiclass MVE_VABD_m<MVEVectorVTInfo VTI,
- Intrinsic unpred_int, Intrinsic pred_int> {
+multiclass MVE_VABD_m<MVEVectorVTInfo VTI, SDNode Op,
+ Intrinsic unpred_int, Intrinsic PredInt> {
def "" : MVE_VABD_int<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEInt] in {
+ defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)),
+ !cast<Instruction>(NAME)>;
+
+    // Unpredicated absolute difference
def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
-
-    // Predicated absolute difference
- def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive))),
- (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
- (VTI.Vec MQPR:$inactive)))>;
}
}
-multiclass MVE_VABD<MVEVectorVTInfo VTI>
- : MVE_VABD_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
+multiclass MVE_VABD<MVEVectorVTInfo VTI, SDNode Op>
+ : MVE_VABD_m<VTI, Op, int_arm_mve_vabd, int_arm_mve_abd_predicated>;
-defm MVE_VABDs8 : MVE_VABD<MVE_v16s8>;
-defm MVE_VABDs16 : MVE_VABD<MVE_v8s16>;
-defm MVE_VABDs32 : MVE_VABD<MVE_v4s32>;
-defm MVE_VABDu8 : MVE_VABD<MVE_v16u8>;
-defm MVE_VABDu16 : MVE_VABD<MVE_v8u16>;
-defm MVE_VABDu32 : MVE_VABD<MVE_v4u32>;
+defm MVE_VABDs8 : MVE_VABD<MVE_v16s8, abds>;
+defm MVE_VABDs16 : MVE_VABD<MVE_v8s16, abds>;
+defm MVE_VABDs32 : MVE_VABD<MVE_v4s32, abds>;
+defm MVE_VABDu8 : MVE_VABD<MVE_v16u8, abdu>;
+defm MVE_VABDu16 : MVE_VABD<MVE_v8u16, abdu>;
+defm MVE_VABDu32 : MVE_VABD<MVE_v4u32, abdu>;
class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
: MVE_int<"vrhadd", suffix, size, pattern> {
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 4e8d880d1a306..0aaceab21ae43 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -4,15 +4,7 @@
define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_s8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.s8 q2, q1
-; CHECK-NEXT: vmovlt.s8 q3, q0
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vsub.i16 q2, q3, q2
-; CHECK-NEXT: vsub.i16 q0, q0, q1
-; CHECK-NEXT: vabs.s16 q2, q2
-; CHECK-NEXT: vabs.s16 q0, q0
-; CHECK-NEXT: vmovnt.i16 q0, q2
+; CHECK-NEXT: vabd.s8 q0, q0, q1
; CHECK-NEXT: bx lr
%sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
%sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
@@ -27,15 +19,7 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_s16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.s16 q2, q1
-; CHECK-NEXT: vmovlt.s16 q3, q0
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vsub.i32 q2, q3, q2
-; CHECK-NEXT: vsub.i32 q0, q0, q1
-; CHECK-NEXT: vabs.s32 q2, q2
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vmovnt.i32 q0, q2
+; CHECK-NEXT: vabd.s16 q0, q0, q1
; CHECK-NEXT: bx lr
%sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
%sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
@@ -50,46 +34,7 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_s32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vmov.f32 s16, s6
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmov r2, s16
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: subs r0, r0, r2
-; CHECK-NEXT: sbc.w r1, r1, r2, asr #31
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r0, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: sbc.w r2, r2, r3, asr #31
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: add.w r1, r1, r2, asr #31
-; CHECK-NEXT: eor.w r1, r1, r2, asr #31
-; CHECK-NEXT: vmov r2, s18
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: subs r0, r0, r2
-; CHECK-NEXT: sbc.w r1, r1, r2, asr #31
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r0, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: sbc.w r2, r2, r3, asr #31
-; CHECK-NEXT: add.w r1, r1, r2, asr #31
-; CHECK-NEXT: eor.w r1, r1, r2, asr #31
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vabd.s32 q0, q0, q1
; CHECK-NEXT: bx lr
%sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
%sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -104,15 +49,7 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_u8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.u8 q2, q1
-; CHECK-NEXT: vmovlt.u8 q3, q0
-; CHECK-NEXT: vmovlb.u8 q1, q1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vsub.i16 q2, q3, q2
-; CHECK-NEXT: vsub.i16 q0, q0, q1
-; CHECK-NEXT: vabs.s16 q2, q2
-; CHECK-NEXT: vabs.s16 q0, q0
-; CHECK-NEXT: vmovnt.i16 q0, q2
+; CHECK-NEXT: vabd.u8 q0, q0, q1
; CHECK-NEXT: bx lr
%zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
%zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
@@ -127,15 +64,7 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_u16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmovlt.u16 q2, q1
-; CHECK-NEXT: vmovlt.u16 q3, q0
-; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vsub.i32 q2, q3, q2
-; CHECK-NEXT: vsub.i32 q0, q0, q1
-; CHECK-NEXT: vabs.s32 q2, q2
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vmovnt.i32 q0, q2
+; CHECK-NEXT: vabd.u16 q0, q0, q1
; CHECK-NEXT: bx lr
%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
%zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
@@ -150,46 +79,7 @@ define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_u32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.i64 q4, #0xffffffff
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov.f32 s14, s3
-; CHECK-NEXT: vand q2, q2, q4
-; CHECK-NEXT: vand q3, q3, q4
-; CHECK-NEXT: vmov r0, r1, d4
-; CHECK-NEXT: vmov r2, r3, d6
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vand q1, q1, q4
-; CHECK-NEXT: vand q4, q0, q4
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: sbc.w r1, r3, r1
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r12, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, r2, d2
-; CHECK-NEXT: vmov r3, r0, d8
-; CHECK-NEXT: subs r1, r3, r1
-; CHECK-NEXT: sbcs r0, r2
-; CHECK-NEXT: vmov r2, r3, d7
-; CHECK-NEXT: add.w r1, r1, r0, asr #31
-; CHECK-NEXT: eor.w r0, r1, r0, asr #31
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: sbc.w r1, r3, r1
-; CHECK-NEXT: add.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r12, r0, r1, asr #31
-; CHECK-NEXT: vmov r1, r2, d3
-; CHECK-NEXT: vmov r3, r0, d9
-; CHECK-NEXT: subs r1, r3, r1
-; CHECK-NEXT: sbcs r0, r2
-; CHECK-NEXT: add.w r1, r1, r0, asr #31
-; CHECK-NEXT: eor.w r0, r1, r0, asr #31
-; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vabd.u32 q0, q0, q1
; CHECK-NEXT: bx lr
%zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
%zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
@@ -209,26 +99,37 @@ define void @vabd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y,
; CHECK-NEXT: mov.w lr, #64
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
-; CHECK-NEXT: vldrb.s32 q1, [r0, #12]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.s32 q1, [r0, #8]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #12]
-; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.s32 q1, [r0, #4]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #8]
-; CHECK-NEXT: vldrb.s32 q0, [r1, #4]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.s32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #4]
-; CHECK-NEXT: vldrb.s32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2], #16
+; CHECK-NEXT: vldrb.u8 q0, [r1], #16
+; CHECK-NEXT: vldrb.u8 q1, [r0], #16
+; CHECK-NEXT: vabd.s8 q0, q1, q0
+; CHECK-NEXT: vmov.u8 r12, q0[14]
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[15]
+; CHECK-NEXT: vmov.u8 r3, q0[13]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[10]
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: vstrb.32 q1, [r2, #12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[11]
+; CHECK-NEXT: vmov.u8 r3, q0[9]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[6]
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: vstrb.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[7]
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[2]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: vstrb.32 q1, [r2, #4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[3]
+; CHECK-NEXT: vmov.u8 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrb.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -269,16 +170,23 @@ define void @vabd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %
; CHECK-NEXT: mov.w lr, #128
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.s32 q0, [r1, #8]
-; CHECK-NEXT: vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrh.s32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2, #8]
-; CHECK-NEXT: vldrh.s32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q0, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r0], #16
+; CHECK-NEXT: vabd.s16 q0, q1, q0
+; CHECK-NEXT: vmov.u16 r12, q0[6]
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vstrh.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[3]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrh.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -419,26 +327,37 @@ define void @vabd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y,
; CHECK-NEXT: mov.w lr, #64
; CHECK-NEXT: .LBB9_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
-; CHECK-NEXT: vldrb.u32 q1, [r0, #12]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.u32 q1, [r0, #8]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #12]
-; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.u32 q1, [r0, #4]
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #8]
-; CHECK-NEXT: vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrb.u32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2, #4]
-; CHECK-NEXT: vldrb.u32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrb.32 q0, [r2], #16
+; CHECK-NEXT: vldrb.u8 q0, [r1], #16
+; CHECK-NEXT: vldrb.u8 q1, [r0], #16
+; CHECK-NEXT: vabd.u8 q0, q1, q0
+; CHECK-NEXT: vmov.u8 r12, q0[14]
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[15]
+; CHECK-NEXT: vmov.u8 r3, q0[13]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[10]
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: vstrb.32 q1, [r2, #12]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[11]
+; CHECK-NEXT: vmov.u8 r3, q0[9]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[6]
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: vstrb.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[7]
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[2]
+; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: vstrb.32 q1, [r2, #4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u8 r12, q0[3]
+; CHECK-NEXT: vmov.u8 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrb.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB9_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -479,16 +398,23 @@ define void @vabd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %
; CHECK-NEXT: mov.w lr, #128
; CHECK-NEXT: .LBB10_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u32 q0, [r1, #8]
-; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vldrh.u32 q1, [r0], #16
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2, #8]
-; CHECK-NEXT: vldrh.u32 q0, [r1], #16
-; CHECK-NEXT: vsub.i32 q0, q1, q0
-; CHECK-NEXT: vabs.s32 q0, q0
-; CHECK-NEXT: vstrh.32 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q0, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r0], #16
+; CHECK-NEXT: vabd.u16 q0, q1, q0
+; CHECK-NEXT: vmov.u16 r12, q0[6]
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[2]
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vstrh.32 q1, [r2, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r3, r12
+; CHECK-NEXT: vmov.u16 r12, q0[3]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
+; CHECK-NEXT: vstrh.32 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
More information about the llvm-commits
mailing list