[llvm] r375085 - [DAGCombine][ARM] Enable extending masked loads
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 17 00:55:55 PDT 2019
Author: sam_parker
Date: Thu Oct 17 00:55:55 2019
New Revision: 375085
URL: http://llvm.org/viewvc/llvm-project?rev=375085&view=rev
Log:
[DAGCombine][ARM] Enable extending masked loads
Add a generic DAG combine for extending masked loads.
This allows us to generate sext/zext masked loads that read v4i8,
v8i8 and v4i16 from memory and produce v4i32, v8i16 and v4i32
results, respectively.
Differential Revision: https://reviews.llvm.org/D68337
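For illustration, a minimal IR sketch of the pattern this combine now folds
(function names are hypothetical and not from the patch; assumes an MVE
target such as thumbv8.1m.main with +mve). A masked load feeding a
sext/zext becomes a single extending masked load, so the first function
can select to a predicated vldrb.s32 and the second to a vldrh.u32,
instead of a plain masked load followed by a separate lane-wise extend:

; Illustrative only; masks would normally come from a compare.
define arm_aapcs_vfpcc <4 x i32> @sext_v4i8_v4i32(<4 x i8>* %src, <4 x i1> %mask) {
entry:
  ; NON_EXTLOAD masked load of v4i8, passthru undef.
  %load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %src, i32 1, <4 x i1> %mask, <4 x i8> undef)
  ; The combine folds this sext into the masked load (SEXTLOAD).
  %ext = sext <4 x i8> %load to <4 x i32>
  ret <4 x i32> %ext
}

define arm_aapcs_vfpcc <4 x i32> @zext_v4i16_v4i32(<4 x i16>* %src, <4 x i1> %mask) {
entry:
  ; v4i16 loads need alignment >= 2 to stay legal for MVE.
  %load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %src, i32 2, <4 x i1> %mask, <4 x i16> undef)
  ; Folded into a ZEXTLOAD masked load.
  %ext = zext <4 x i16> %load to <4 x i32>
  ret <4 x i32> %ext
}

declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)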
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Oct 17 00:55:55 2019
@@ -9345,6 +9345,35 @@ static SDValue tryToFoldExtOfLoad(Select
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
+static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
+ const TargetLowering &TLI, EVT VT,
+ SDNode *N, SDValue N0,
+ ISD::LoadExtType ExtLoadType,
+ ISD::NodeType ExtOpc) {
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
+ if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+
+ if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
+ return SDValue();
+
+ if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
+ SDLoc dl(Ld);
+ SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
+ SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getMask(),
+ PassThru, Ld->getMemoryVT(),
+ Ld->getMemOperand(), ExtLoadType,
+ Ld->isExpandingLoad());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
+ return NewLoad;
+}
+
static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
bool LegalOperations) {
assert((N->getOpcode() == ISD::SIGN_EXTEND ||
@@ -9445,6 +9474,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SD
ISD::SEXTLOAD, ISD::SIGN_EXTEND))
return foldedExt;
+ if (SDValue foldedExt =
+ tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
+ ISD::SIGN_EXTEND))
+ return foldedExt;
+
// fold (sext (load x)) to multiple smaller sextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
@@ -9733,6 +9767,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SD
ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
return foldedExt;
+ if (SDValue foldedExt =
+ tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
+ ISD::ZERO_EXTEND))
+ return foldedExt;
+
// fold (zext (load x)) to multiple smaller zextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Thu Oct 17 00:55:55 2019
@@ -8898,9 +8898,13 @@ static SDValue LowerMLOAD(SDValue Op, Se
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
- if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
+ auto IsZero = [](SDValue PassThru) {
+ return (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
(PassThru->getOpcode() == ARMISD::VMOVIMM &&
- isNullConstant(PassThru->getOperand(0))))
+ isNullConstant(PassThru->getOperand(0))));
+ };
+
+ if (IsZero(PassThru))
return Op;
// MVE Masked loads use zero as the passthru value. Here we convert undef to
@@ -8911,7 +8915,9 @@ static SDValue LowerMLOAD(SDValue Op, Se
VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
- if (!PassThru.isUndef())
+ if (!PassThru.isUndef() &&
+ (PassThru.getOpcode() != ISD::BITCAST ||
+ !IsZero(PassThru->getOperand(0))))
Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
@@ -14698,6 +14704,11 @@ bool ARMTargetLowering::isVectorLoadExtD
if (!isTypeLegal(VT))
return false;
+ if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
+ if (Ld->isExpandingLoad())
+ return false;
+ }
+
// Don't create a loadext if we can fold the extension into a wide/long
// instruction.
// If there's more than one user instruction, the loadext is desirable no
Modified: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMInstrMVE.td?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td (original)
+++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td Thu Oct 17 00:55:55 2019
@@ -5071,16 +5071,52 @@ def aligned16_post_store : PatFrag<(ops
return cast<StoreSDNode>(N)->getAlignment() >= 2;
}]>;
-def alignedmaskedload32 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 4;
-}]>;
-def alignedmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
- return cast<MaskedLoadSDNode>(N)->getAlignment() >= 2;
+
+def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ return Ld->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
+def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (maskedload8 node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
+}]>;
+def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2;
+}]>;
+def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+}]>;
+def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
+}]>;
+def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
+ (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ auto *Ld = cast<MaskedLoadSDNode>(N);
+ EVT ScalarVT = Ld->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
}]>;
-def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru)>;
def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, node:$pred), [{
@@ -5090,6 +5126,7 @@ def alignedmaskedstore16 : PatFrag<(ops
(masked_st node:$val, node:$ptr, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
}]>;
+
def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
(masked_st node:$val, node:$ptr, node:$pred)>;
@@ -5121,16 +5158,6 @@ let Predicates = [HasMVEInt, IsLE] in {
(MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
-
- // Unaligned masked loads
- def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
- (v4f32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
- (v8i16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
- def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
- (v8f16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
}
let Predicates = [HasMVEInt, IsBE] in {
@@ -5195,15 +5222,6 @@ let Predicates = [HasMVEInt, IsBE] in {
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
(MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
- // Unaligned masked loads
- def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
- (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
- def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
- (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
- def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
- (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
- def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
- (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
}
let Predicates = [HasMVEInt] in {
@@ -5214,11 +5232,39 @@ let Predicates = [HasMVEInt] in {
def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
// Aligned masked loads
- def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload, 0>;
+ def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>;
def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+ // Extending masked loads.
+ def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v8i16 NEONimmAllZerosV))),
+ (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
+ def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred,
+ (v4i32 NEONimmAllZerosV))),
+ (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>;
}
// Widening/Narrowing Loads/Stores
Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp Thu Oct 17 00:55:55 2019
@@ -495,16 +495,21 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type
if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
return false;
- if (DataTy->isVectorTy()) {
- // We don't yet support narrowing or widening masked loads/stores. Expand
- // them for the moment.
- unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
- if (VecWidth != 128)
+ if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+ // Don't support v2i1 yet.
+ if (VecTy->getNumElements() == 2)
+ return false;
+
+ // We don't support extending fp types.
+ unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
+ if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
return false;
}
unsigned EltWidth = DataTy->getScalarSizeInBits();
- return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
+ return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+ (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
+ (EltWidth == 8);
}
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Oct 17 00:55:55 2019
@@ -29056,6 +29056,9 @@ bool X86TargetLowering::isZExtFree(SDVal
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
+ return false;
+
EVT SrcVT = ExtVal.getOperand(0).getValueType();
// There is no extending load for vXi1.
Modified: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll Thu Oct 17 00:55:55 2019
@@ -4,75 +4,39 @@
define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: test_acc_scalar_char:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: it eq
+; CHECK-NEXT: itt eq
+; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: subs r2, #1
; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: sub.w lr, r3, #4
-; CHECK-NEXT: adr r2, .LCPI0_0
+; CHECK-NEXT: vdup.32 q1, r2
+; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: add.w lr, r3, lr, lsr #2
-; CHECK-NEXT: vmov.i32 q4, #0x0
-; CHECK-NEXT: vmov.i32 q2, #0xff
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: adr r3, .LCPI0_0
+; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vcmp.u32 cs, q0, q4
-; CHECK-NEXT: @ implicit-def: $q4
-; CHECK-NEXT: vmrs r3, p0
-; CHECK-NEXT: and r2, r3, #1
-; CHECK-NEXT: rsbs r4, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r4, #0, #1
-; CHECK-NEXT: ubfx r4, r3, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #1, #1
-; CHECK-NEXT: ubfx r4, r3, #8, #1
-; CHECK-NEXT: ubfx r3, r3, #12, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #2, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #3, #1
-; CHECK-NEXT: lsls r3, r2, #31
-; CHECK-NEXT: add.w r3, r1, r0
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r4, [r3]
-; CHECK-NEXT: vmovne.32 q4[0], r4
-; CHECK-NEXT: lsls r4, r2, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r4, [r3, #1]
-; CHECK-NEXT: vmovmi.32 q4[1], r4
-; CHECK-NEXT: lsls r4, r2, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r4, [r3, #2]
-; CHECK-NEXT: vmovmi.32 q4[2], r4
-; CHECK-NEXT: lsls r2, r2, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r2, [r3, #3]
-; CHECK-NEXT: vmovmi.32 q4[3], r2
-; CHECK-NEXT: vand q5, q4, q2
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vmla.u32 q4, q5, r12
+; CHECK-NEXT: vadd.i32 q4, q2, r2
+; CHECK-NEXT: adds r3, r1, r2
+; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vpt.u32 cs, q1, q4
+; CHECK-NEXT: vldrbt.u32 q4, [r3]
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vmla.u32 q0, q4, r0
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q4, q3
+; CHECK-NEXT: vpsel q0, q0, q3
; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: pop.w {r4, lr}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
@@ -125,74 +89,39 @@ for.cond.cleanup:
define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: test_acc_scalar_short:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: it eq
+; CHECK-NEXT: itt eq
+; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: subs r2, #1
; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: sub.w lr, r3, #4
-; CHECK-NEXT: adr r2, .LCPI1_0
+; CHECK-NEXT: vdup.32 q1, r2
+; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: add.w lr, r3, lr, lsr #2
-; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: adr r3, .LCPI1_0
+; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: vadd.i32 q3, q1, r0
-; CHECK-NEXT: vcmp.u32 cs, q0, q3
-; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vmrs r3, p0
-; CHECK-NEXT: and r2, r3, #1
-; CHECK-NEXT: rsbs r4, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r4, #0, #1
-; CHECK-NEXT: ubfx r4, r3, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #1, #1
-; CHECK-NEXT: ubfx r4, r3, #8, #1
-; CHECK-NEXT: ubfx r3, r3, #12, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #2, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #3, #1
-; CHECK-NEXT: lsls r3, r2, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r3, [r1]
-; CHECK-NEXT: vmovne.32 q3[0], r3
-; CHECK-NEXT: lsls r3, r2, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r1, #2]
-; CHECK-NEXT: vmovmi.32 q3[1], r3
-; CHECK-NEXT: lsls r3, r2, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r1, #4]
-; CHECK-NEXT: vmovmi.32 q3[2], r3
-; CHECK-NEXT: lsls r2, r2, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r2, [r1, #6]
-; CHECK-NEXT: vmovmi.32 q3[3], r2
-; CHECK-NEXT: vmovlb.s16 q4, q3
-; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vadd.i32 q4, q2, r2
+; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vpt.u32 cs, q1, q4
+; CHECK-NEXT: vldrht.s32 q4, [r1]
; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vmla.u32 q3, q4, r12
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vmla.u32 q0, q4, r0
; CHECK-NEXT: le lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vpsel q0, q0, q3
; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: pop.w {r4, lr}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
@@ -245,75 +174,39 @@ for.cond.cleanup:
define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: test_acc_scalar_uchar:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: it eq
+; CHECK-NEXT: itt eq
+; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: subs r2, #1
; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: sub.w lr, r3, #4
-; CHECK-NEXT: adr r2, .LCPI2_0
+; CHECK-NEXT: vdup.32 q1, r2
+; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: add.w lr, r3, lr, lsr #2
-; CHECK-NEXT: vmov.i32 q4, #0x0
-; CHECK-NEXT: vmov.i32 q2, #0xff
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: adr r3, .LCPI2_0
+; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vcmp.u32 cs, q0, q4
-; CHECK-NEXT: @ implicit-def: $q4
-; CHECK-NEXT: vmrs r3, p0
-; CHECK-NEXT: and r2, r3, #1
-; CHECK-NEXT: rsbs r4, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r4, #0, #1
-; CHECK-NEXT: ubfx r4, r3, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #1, #1
-; CHECK-NEXT: ubfx r4, r3, #8, #1
-; CHECK-NEXT: ubfx r3, r3, #12, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #2, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #3, #1
-; CHECK-NEXT: lsls r3, r2, #31
-; CHECK-NEXT: add.w r3, r1, r0
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r4, [r3]
-; CHECK-NEXT: vmovne.32 q4[0], r4
-; CHECK-NEXT: lsls r4, r2, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r4, [r3, #1]
-; CHECK-NEXT: vmovmi.32 q4[1], r4
-; CHECK-NEXT: lsls r4, r2, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r4, [r3, #2]
-; CHECK-NEXT: vmovmi.32 q4[2], r4
-; CHECK-NEXT: lsls r2, r2, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r2, [r3, #3]
-; CHECK-NEXT: vmovmi.32 q4[3], r2
-; CHECK-NEXT: vand q5, q4, q2
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vmla.u32 q4, q5, r12
+; CHECK-NEXT: vadd.i32 q4, q2, r2
+; CHECK-NEXT: adds r3, r1, r2
+; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vpt.u32 cs, q1, q4
+; CHECK-NEXT: vldrbt.u32 q4, [r3]
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vmla.u32 q0, q4, r0
; CHECK-NEXT: le lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q4, q3
+; CHECK-NEXT: vpsel q0, q0, q3
; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: pop.w {r4, lr}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
@@ -366,74 +259,39 @@ for.cond.cleanup:
define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: test_acc_scalar_ushort:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: it eq
+; CHECK-NEXT: itt eq
+; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: subs r2, #1
; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: sub.w lr, r3, #4
-; CHECK-NEXT: adr r2, .LCPI3_0
+; CHECK-NEXT: vdup.32 q1, r2
+; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: add.w lr, r3, lr, lsr #2
-; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: adr r3, .LCPI3_0
+; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: vadd.i32 q3, q1, r0
-; CHECK-NEXT: vcmp.u32 cs, q0, q3
-; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: adds r0, #4
-; CHECK-NEXT: vmrs r3, p0
-; CHECK-NEXT: and r2, r3, #1
-; CHECK-NEXT: rsbs r4, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r4, #0, #1
-; CHECK-NEXT: ubfx r4, r3, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #1, #1
-; CHECK-NEXT: ubfx r4, r3, #8, #1
-; CHECK-NEXT: ubfx r3, r3, #12, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: bfi r2, r4, #2, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #3, #1
-; CHECK-NEXT: lsls r3, r2, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r3, [r1]
-; CHECK-NEXT: vmovne.32 q3[0], r3
-; CHECK-NEXT: lsls r3, r2, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r1, #2]
-; CHECK-NEXT: vmovmi.32 q3[1], r3
-; CHECK-NEXT: lsls r3, r2, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r1, #4]
-; CHECK-NEXT: vmovmi.32 q3[2], r3
-; CHECK-NEXT: lsls r2, r2, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r2, [r1, #6]
-; CHECK-NEXT: vmovmi.32 q3[3], r2
-; CHECK-NEXT: vmovlb.u16 q4, q3
-; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vadd.i32 q4, q2, r2
+; CHECK-NEXT: adds r2, #4
+; CHECK-NEXT: vpt.u32 cs, q1, q4
+; CHECK-NEXT: vldrht.u32 q4, [r1]
; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vmla.u32 q3, q4, r12
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vmla.u32 q0, q4, r0
; CHECK-NEXT: le lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q3, q2
+; CHECK-NEXT: vpsel q0, q0, q3
; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: add sp, #8
; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: pop.w {r4, lr}
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
@@ -558,134 +416,66 @@ define arm_aapcs_vfpcc void @test_vec_mu
; CHECK-LABEL: test_vec_mul_scalar_add_char:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: ldr.w r12, [sp, #72]
-; CHECK-NEXT: cmp.w r12, #0
+; CHECK-NEXT: ldr r7, [sp, #28]
+; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: beq.w .LBB5_12
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: add.w r5, r3, r12, lsl #2
-; CHECK-NEXT: add.w r6, r1, r12
-; CHECK-NEXT: cmp r5, r1
-; CHECK-NEXT: add.w r4, r0, r12
-; CHECK-NEXT: cset r7, hi
-; CHECK-NEXT: cmp r6, r3
-; CHECK-NEXT: cset r6, hi
-; CHECK-NEXT: cmp r5, r0
+; CHECK-NEXT: add.w r4, r3, r7, lsl #2
+; CHECK-NEXT: adds r5, r1, r7
+; CHECK-NEXT: cmp r4, r1
+; CHECK-NEXT: add.w r6, r0, r7
+; CHECK-NEXT: cset r12, hi
+; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: cset r5, hi
-; CHECK-NEXT: cmp r4, r3
+; CHECK-NEXT: cmp r4, r0
; CHECK-NEXT: cset r4, hi
-; CHECK-NEXT: ands r5, r4
-; CHECK-NEXT: lsls r5, r5, #31
+; CHECK-NEXT: cmp r6, r3
+; CHECK-NEXT: cset r6, hi
+; CHECK-NEXT: ands r6, r4
+; CHECK-NEXT: lsls r6, r6, #31
; CHECK-NEXT: itt eq
-; CHECK-NEXT: andeq r7, r6
-; CHECK-NEXT: lslseq.w r7, r7, #31
+; CHECK-NEXT: andeq.w r6, r5, r12
+; CHECK-NEXT: lslseq.w r6, r6, #31
; CHECK-NEXT: beq .LBB5_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: and lr, r12, #3
-; CHECK-NEXT: cmp r4, #3
-; CHECK-NEXT: bhs.w .LBB5_6
+; CHECK-NEXT: subs r6, r7, #1
+; CHECK-NEXT: and lr, r7, #3
+; CHECK-NEXT: cmp r6, #3
+; CHECK-NEXT: bhs .LBB5_6
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: b .LBB5_9
; CHECK-NEXT: .LBB5_4: @ %vector.ph
-; CHECK-NEXT: add.w r7, r12, #3
-; CHECK-NEXT: adr r5, .LCPI5_0
-; CHECK-NEXT: bic r7, r7, #3
-; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: subs r7, #4
-; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: add.w lr, r6, r7, lsr #2
-; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: adds r6, r7, #3
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: bic r6, r6, #3
+; CHECK-NEXT: subs r7, #1
+; CHECK-NEXT: subs r6, #4
+; CHECK-NEXT: vdup.32 q0, r7
+; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: add.w lr, r5, r6, lsr #2
+; CHECK-NEXT: adr r6, .LCPI5_0
+; CHECK-NEXT: vldrw.u32 q1, [r6]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q1, r4
-; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: vcmp.u32 cs, q0, q4
-; CHECK-NEXT: @ implicit-def: $q4
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: add.w r6, r0, r4
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r7, [r6]
-; CHECK-NEXT: vmovne.32 q4[0], r7
-; CHECK-NEXT: lsls r7, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #1]
-; CHECK-NEXT: vmovmi.32 q4[1], r7
-; CHECK-NEXT: lsls r7, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #2]
-; CHECK-NEXT: vmovmi.32 q4[2], r7
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r5, [r6, #3]
-; CHECK-NEXT: vmovmi.32 q4[3], r5
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: add.w r6, r1, r4
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r7, [r6]
-; CHECK-NEXT: vmovne.32 q5[0], r7
-; CHECK-NEXT: lsls r7, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #1]
-; CHECK-NEXT: vmovmi.32 q5[1], r7
-; CHECK-NEXT: lsls r7, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #2]
-; CHECK-NEXT: vmovmi.32 q5[2], r7
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r5, [r6, #3]
-; CHECK-NEXT: vmovmi.32 q5[3], r5
-; CHECK-NEXT: vand q5, q5, q3
-; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: vmul.i32 q4, q5, q4
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: vadd.i32 q4, q4, r2
+; CHECK-NEXT: vadd.i32 q2, q1, r7
+; CHECK-NEXT: adds r4, r0, r7
+; CHECK-NEXT: vpt.u32 cs, q0, q2
+; CHECK-NEXT: vldrbt.u32 q2, [r4]
+; CHECK-NEXT: adds r4, r1, r7
; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q4, [r3]
+; CHECK-NEXT: vldrbt.u32 q3, [r4]
+; CHECK-NEXT: vmul.i32 q2, q3, q2
+; CHECK-NEXT: vadd.i32 q2, q2, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q2, [r3]
; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: b .LBB5_12
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
-; CHECK-NEXT: sub.w r12, lr, r12
+; CHECK-NEXT: sub.w r12, lr, r7
; CHECK-NEXT: subs r4, r1, #3
; CHECK-NEXT: subs r5, r0, #3
; CHECK-NEXT: sub.w r7, r3, #16
@@ -728,9 +518,6 @@ define arm_aapcs_vfpcc void @test_vec_mu
; CHECK-NEXT: str r7, [r3, #4]!
; CHECK-NEXT: le lr, .LBB5_11
; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.13:
@@ -883,107 +670,41 @@ for.body:
define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
; CHECK-LABEL: test_vec_mul_scalar_add_short:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: ldr.w r12, [sp, #8]
; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: beq.w .LBB6_3
-; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r5, r12, #3
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r4, pc}
+; CHECK-NEXT: add.w lr, r12, #3
; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: bic r5, r5, #3
-; CHECK-NEXT: subs r5, #4
-; CHECK-NEXT: add.w lr, r4, r5, lsr #2
-; CHECK-NEXT: adr r5, .LCPI6_0
+; CHECK-NEXT: bic lr, lr, #3
+; CHECK-NEXT: sub.w lr, lr, #4
+; CHECK-NEXT: add.w lr, r4, lr, lsr #2
; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: adr r4, .LCPI6_0
+; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: .LBB6_2: @ %vector.body
+; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q1, r4
-; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: vcmp.u32 cs, q0, q2
-; CHECK-NEXT: @ implicit-def: $q2
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r6, [r0]
-; CHECK-NEXT: vmovne.32 q2[0], r6
-; CHECK-NEXT: lsls r6, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r0, #2]
-; CHECK-NEXT: vmovmi.32 q2[1], r6
-; CHECK-NEXT: lsls r6, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r0, #4]
-; CHECK-NEXT: vmovmi.32 q2[2], r6
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r5, [r0, #6]
-; CHECK-NEXT: vmovmi.32 q2[3], r5
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: vmovlb.s16 q2, q2
+; CHECK-NEXT: vadd.i32 q2, q1, r12
+; CHECK-NEXT: add.w r12, r12, #4
+; CHECK-NEXT: vptt.u32 cs, q0, q2
+; CHECK-NEXT: vldrht.s32 q2, [r0]
+; CHECK-NEXT: vldrht.s32 q3, [r1]
; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r6, [r1]
-; CHECK-NEXT: vmovne.32 q3[0], r6
-; CHECK-NEXT: lsls r6, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r1, #2]
-; CHECK-NEXT: vmovmi.32 q3[1], r6
-; CHECK-NEXT: lsls r6, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r1, #4]
-; CHECK-NEXT: vmovmi.32 q3[2], r6
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r5, [r1, #6]
-; CHECK-NEXT: vmovmi.32 q3[3], r5
-; CHECK-NEXT: vmovlb.s16 q3, q3
-; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vmul.i32 q2, q3, q2
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: vadd.i32 q2, q2, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q2, [r3]
; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: sub.w r12, r12, #4
-; CHECK-NEXT: le lr, .LBB6_2
-; CHECK-NEXT: .LBB6_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: le lr, .LBB6_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.4:
+; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
@@ -1035,134 +756,66 @@ define arm_aapcs_vfpcc void @test_vec_mu
; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: ldr.w r12, [sp, #72]
-; CHECK-NEXT: cmp.w r12, #0
+; CHECK-NEXT: ldr r7, [sp, #28]
+; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: beq.w .LBB7_12
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: add.w r5, r3, r12, lsl #2
-; CHECK-NEXT: add.w r6, r1, r12
-; CHECK-NEXT: cmp r5, r1
-; CHECK-NEXT: add.w r4, r0, r12
-; CHECK-NEXT: cset r7, hi
-; CHECK-NEXT: cmp r6, r3
-; CHECK-NEXT: cset r6, hi
-; CHECK-NEXT: cmp r5, r0
+; CHECK-NEXT: add.w r4, r3, r7, lsl #2
+; CHECK-NEXT: adds r5, r1, r7
+; CHECK-NEXT: cmp r4, r1
+; CHECK-NEXT: add.w r6, r0, r7
+; CHECK-NEXT: cset r12, hi
+; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: cset r5, hi
-; CHECK-NEXT: cmp r4, r3
+; CHECK-NEXT: cmp r4, r0
; CHECK-NEXT: cset r4, hi
-; CHECK-NEXT: ands r5, r4
-; CHECK-NEXT: lsls r5, r5, #31
+; CHECK-NEXT: cmp r6, r3
+; CHECK-NEXT: cset r6, hi
+; CHECK-NEXT: ands r6, r4
+; CHECK-NEXT: lsls r6, r6, #31
; CHECK-NEXT: itt eq
-; CHECK-NEXT: andeq r7, r6
-; CHECK-NEXT: lslseq.w r7, r7, #31
+; CHECK-NEXT: andeq.w r6, r5, r12
+; CHECK-NEXT: lslseq.w r6, r6, #31
; CHECK-NEXT: beq .LBB7_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: and lr, r12, #3
-; CHECK-NEXT: cmp r4, #3
-; CHECK-NEXT: bhs.w .LBB7_6
+; CHECK-NEXT: subs r6, r7, #1
+; CHECK-NEXT: and lr, r7, #3
+; CHECK-NEXT: cmp r6, #3
+; CHECK-NEXT: bhs .LBB7_6
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: b .LBB7_9
; CHECK-NEXT: .LBB7_4: @ %vector.ph
-; CHECK-NEXT: add.w r7, r12, #3
-; CHECK-NEXT: adr r5, .LCPI7_0
-; CHECK-NEXT: bic r7, r7, #3
-; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: subs r7, #4
-; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: add.w lr, r6, r7, lsr #2
-; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: adds r6, r7, #3
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: bic r6, r6, #3
+; CHECK-NEXT: subs r7, #1
+; CHECK-NEXT: subs r6, #4
+; CHECK-NEXT: vdup.32 q0, r7
+; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: add.w lr, r5, r6, lsr #2
+; CHECK-NEXT: adr r6, .LCPI7_0
+; CHECK-NEXT: vldrw.u32 q1, [r6]
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q1, r4
-; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: vcmp.u32 cs, q0, q4
-; CHECK-NEXT: @ implicit-def: $q4
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: add.w r6, r0, r4
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r7, [r6]
-; CHECK-NEXT: vmovne.32 q4[0], r7
-; CHECK-NEXT: lsls r7, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #1]
-; CHECK-NEXT: vmovmi.32 q4[1], r7
-; CHECK-NEXT: lsls r7, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #2]
-; CHECK-NEXT: vmovmi.32 q4[2], r7
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r5, [r6, #3]
-; CHECK-NEXT: vmovmi.32 q4[3], r5
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: add.w r6, r1, r4
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r7, [r6]
-; CHECK-NEXT: vmovne.32 q5[0], r7
-; CHECK-NEXT: lsls r7, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #1]
-; CHECK-NEXT: vmovmi.32 q5[1], r7
-; CHECK-NEXT: lsls r7, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r7, [r6, #2]
-; CHECK-NEXT: vmovmi.32 q5[2], r7
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r5, [r6, #3]
-; CHECK-NEXT: vmovmi.32 q5[3], r5
-; CHECK-NEXT: vand q5, q5, q3
-; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: vmul.i32 q4, q5, q4
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: vadd.i32 q4, q4, r2
+; CHECK-NEXT: vadd.i32 q2, q1, r7
+; CHECK-NEXT: adds r4, r0, r7
+; CHECK-NEXT: vpt.u32 cs, q0, q2
+; CHECK-NEXT: vldrbt.u32 q2, [r4]
+; CHECK-NEXT: adds r4, r1, r7
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrbt.u32 q3, [r4]
+; CHECK-NEXT: vmul.i32 q2, q3, q2
+; CHECK-NEXT: vadd.i32 q2, q2, r2
; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q4, [r3]
+; CHECK-NEXT: vstrwt.32 q2, [r3]
; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r7, #4
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: b .LBB7_12
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
-; CHECK-NEXT: sub.w r12, lr, r12
+; CHECK-NEXT: sub.w r12, lr, r7
; CHECK-NEXT: subs r4, r1, #3
; CHECK-NEXT: subs r5, r0, #3
; CHECK-NEXT: sub.w r7, r3, #16
@@ -1205,9 +858,6 @@ define arm_aapcs_vfpcc void @test_vec_mu
; CHECK-NEXT: str r7, [r3, #4]!
; CHECK-NEXT: le lr, .LBB7_11
; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.13:
@@ -1360,107 +1010,41 @@ for.body:
define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
; CHECK-LABEL: test_vec_mul_scalar_add_ushort:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: ldr.w r12, [sp, #8]
; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: beq.w .LBB8_3
-; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: add.w r5, r12, #3
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r4, pc}
+; CHECK-NEXT: add.w lr, r12, #3
; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: bic r5, r5, #3
-; CHECK-NEXT: subs r5, #4
-; CHECK-NEXT: add.w lr, r4, r5, lsr #2
-; CHECK-NEXT: adr r5, .LCPI8_0
+; CHECK-NEXT: bic lr, lr, #3
+; CHECK-NEXT: sub.w lr, lr, #4
+; CHECK-NEXT: add.w lr, r4, lr, lsr #2
; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: adr r4, .LCPI8_0
+; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: .LBB8_2: @ %vector.body
+; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q1, r4
-; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: vcmp.u32 cs, q0, q2
-; CHECK-NEXT: @ implicit-def: $q2
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r6, [r0]
-; CHECK-NEXT: vmovne.32 q2[0], r6
-; CHECK-NEXT: lsls r6, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r0, #2]
-; CHECK-NEXT: vmovmi.32 q2[1], r6
-; CHECK-NEXT: lsls r6, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r0, #4]
-; CHECK-NEXT: vmovmi.32 q2[2], r6
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r5, [r0, #6]
-; CHECK-NEXT: vmovmi.32 q2[3], r5
-; CHECK-NEXT: vmrs r6, p0
-; CHECK-NEXT: vmovlb.u16 q2, q2
+; CHECK-NEXT: vadd.i32 q2, q1, r12
+; CHECK-NEXT: add.w r12, r12, #4
+; CHECK-NEXT: vptt.u32 cs, q0, q2
+; CHECK-NEXT: vldrht.u32 q2, [r0]
+; CHECK-NEXT: vldrht.u32 q3, [r1]
; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: and r5, r6, #1
-; CHECK-NEXT: rsbs r7, r5, #0
-; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: bfi r5, r7, #0, #1
-; CHECK-NEXT: ubfx r7, r6, #4, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #1, #1
-; CHECK-NEXT: ubfx r7, r6, #8, #1
-; CHECK-NEXT: ubfx r6, r6, #12, #1
-; CHECK-NEXT: rsbs r7, r7, #0
-; CHECK-NEXT: bfi r5, r7, #2, #1
-; CHECK-NEXT: rsbs r6, r6, #0
-; CHECK-NEXT: bfi r5, r6, #3, #1
-; CHECK-NEXT: lsls r6, r5, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r6, [r1]
-; CHECK-NEXT: vmovne.32 q3[0], r6
-; CHECK-NEXT: lsls r6, r5, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r1, #2]
-; CHECK-NEXT: vmovmi.32 q3[1], r6
-; CHECK-NEXT: lsls r6, r5, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r6, [r1, #4]
-; CHECK-NEXT: vmovmi.32 q3[2], r6
-; CHECK-NEXT: lsls r5, r5, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r5, [r1, #6]
-; CHECK-NEXT: vmovmi.32 q3[3], r5
-; CHECK-NEXT: vmovlb.u16 q3, q3
-; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vmul.i32 q2, q3, q2
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: vadd.i32 q2, q2, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q2, [r3]
; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: sub.w r12, r12, #4
-; CHECK-NEXT: le lr, .LBB8_2
-; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: le lr, .LBB8_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.4:
+; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-ldst.ll Thu Oct 17 00:55:55 2019
@@ -21,49 +21,11 @@ entry:
define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
; CHECK-LABEL: foo_sext_v4i32_v4i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: vmrs lr, p0
-; CHECK-NEXT: and r1, lr, #1
-; CHECK-NEXT: ubfx r3, lr, #4, #1
-; CHECK-NEXT: rsb.w r12, r1, #0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r12, #0, #1
-; CHECK-NEXT: bfi r1, r3, #1, #1
-; CHECK-NEXT: ubfx r3, lr, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #2, #1
-; CHECK-NEXT: ubfx r3, lr, #12, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #3, #1
-; CHECK-NEXT: lsls r3, r1, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r3, [r2]
-; CHECK-NEXT: vmovne.32 q0[0], r3
-; CHECK-NEXT: lsls r3, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #1]
-; CHECK-NEXT: vmovmi.32 q0[1], r3
-; CHECK-NEXT: lsls r3, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #2]
-; CHECK-NEXT: vmovmi.32 q0[2], r3
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r1, [r2, #3]
-; CHECK-NEXT: vmovmi.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vptt.s32 gt, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r2]
; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
%1 = icmp sgt <4 x i32> %0, zeroinitializer
@@ -76,48 +38,11 @@ entry:
define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
; CHECK-LABEL: foo_sext_v4i32_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: vmrs lr, p0
-; CHECK-NEXT: and r1, lr, #1
-; CHECK-NEXT: ubfx r3, lr, #4, #1
-; CHECK-NEXT: rsb.w r12, r1, #0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r12, #0, #1
-; CHECK-NEXT: bfi r1, r3, #1, #1
-; CHECK-NEXT: ubfx r3, lr, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #2, #1
-; CHECK-NEXT: ubfx r3, lr, #12, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #3, #1
-; CHECK-NEXT: lsls r3, r1, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r3, [r2]
-; CHECK-NEXT: vmovne.32 q0[0], r3
-; CHECK-NEXT: lsls r3, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r2, #2]
-; CHECK-NEXT: vmovmi.32 q0[1], r3
-; CHECK-NEXT: lsls r3, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r2, #4]
-; CHECK-NEXT: vmovmi.32 q0[2], r3
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r1, [r2, #6]
-; CHECK-NEXT: vmovmi.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vptt.s32 gt, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r2]
; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
%1 = icmp sgt <4 x i32> %0, zeroinitializer
@@ -130,49 +55,11 @@ entry:
define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
; CHECK-LABEL: foo_zext_v4i32_v4i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: vmrs lr, p0
-; CHECK-NEXT: and r1, lr, #1
-; CHECK-NEXT: ubfx r3, lr, #4, #1
-; CHECK-NEXT: rsb.w r12, r1, #0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r12, #0, #1
-; CHECK-NEXT: bfi r1, r3, #1, #1
-; CHECK-NEXT: ubfx r3, lr, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #2, #1
-; CHECK-NEXT: ubfx r3, lr, #12, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #3, #1
-; CHECK-NEXT: lsls r3, r1, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r3, [r2]
-; CHECK-NEXT: vmovne.32 q0[0], r3
-; CHECK-NEXT: lsls r3, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #1]
-; CHECK-NEXT: vmovmi.32 q0[1], r3
-; CHECK-NEXT: lsls r3, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #2]
-; CHECK-NEXT: vmovmi.32 q0[2], r3
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r1, [r2, #3]
-; CHECK-NEXT: vmovmi.32 q0[3], r1
-; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vptt.s32 gt, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r2]
; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
%1 = icmp sgt <4 x i32> %0, zeroinitializer
@@ -185,48 +72,11 @@ entry:
define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
; CHECK-LABEL: foo_zext_v4i32_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vcmp.s32 gt, q0, zr
-; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: vmrs lr, p0
-; CHECK-NEXT: and r1, lr, #1
-; CHECK-NEXT: ubfx r3, lr, #4, #1
-; CHECK-NEXT: rsb.w r12, r1, #0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r12, #0, #1
-; CHECK-NEXT: bfi r1, r3, #1, #1
-; CHECK-NEXT: ubfx r3, lr, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #2, #1
-; CHECK-NEXT: ubfx r3, lr, #12, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #3, #1
-; CHECK-NEXT: lsls r3, r1, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrhne r3, [r2]
-; CHECK-NEXT: vmovne.32 q0[0], r3
-; CHECK-NEXT: lsls r3, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r2, #2]
-; CHECK-NEXT: vmovmi.32 q0[1], r3
-; CHECK-NEXT: lsls r3, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r3, [r2, #4]
-; CHECK-NEXT: vmovmi.32 q0[2], r3
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrhmi r1, [r2, #6]
-; CHECK-NEXT: vmovmi.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vptt.s32 gt, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r2]
; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
%1 = icmp sgt <4 x i32> %0, zeroinitializer
@@ -236,6 +86,636 @@ entry:
ret void
}
+define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
+; CHECK-LE-LABEL: foo_sext_v2i64_v2i32:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: ldrd lr, r12, [r1]
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: @ implicit-def: $q1
+; CHECK-LE-NEXT: movs r4, #0
+; CHECK-LE-NEXT: rsbs.w r3, lr, #0
+; CHECK-LE-NEXT: vmov.32 q0[0], lr
+; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT: mov.w lr, #0
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt.w lr, #1
+; CHECK-LE-NEXT: rsbs.w r3, r12, #0
+; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r1, #1
+; CHECK-LE-NEXT: cmp r1, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r1, #1
+; CHECK-LE-NEXT: bfi r1, lr, #0, #1
+; CHECK-LE-NEXT: vmov.32 q0[2], r12
+; CHECK-LE-NEXT: and r3, r1, #3
+; CHECK-LE-NEXT: lsls r1, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrne r1, [r2]
+; CHECK-LE-NEXT: vmovne.32 q1[0], r1
+; CHECK-LE-NEXT: lsls r1, r3, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r1, [r2, #4]
+; CHECK-LE-NEXT: vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT: vmov r2, s0
+; CHECK-LE-NEXT: vmov r3, s4
+; CHECK-LE-NEXT: vmov r1, s6
+; CHECK-LE-NEXT: vmov.32 q1[0], r3
+; CHECK-LE-NEXT: rsbs r5, r2, #0
+; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT: vmov r2, s2
+; CHECK-LE-NEXT: asr.w lr, r3, #31
+; CHECK-LE-NEXT: vmov.32 q1[1], lr
+; CHECK-LE-NEXT: asr.w r12, r1, #31
+; CHECK-LE-NEXT: vmov.32 q1[2], r1
+; CHECK-LE-NEXT: mov.w r1, #0
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r1, #1
+; CHECK-LE-NEXT: vmov.32 q1[3], r12
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r4, #1
+; CHECK-LE-NEXT: cmp r4, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r4, #1
+; CHECK-LE-NEXT: bfi r4, r1, #0, #1
+; CHECK-LE-NEXT: and r1, r4, #3
+; CHECK-LE-NEXT: lsls r2, r4, #31
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: vstrne d2, [r0]
+; CHECK-LE-NEXT: lsls r1, r1, #30
+; CHECK-LE-NEXT: it mi
+; CHECK-LE-NEXT: vstrmi d3, [r0, #8]
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: foo_sext_v2i64_v2i32:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: ldrd r12, lr, [r1]
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: mov.w r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: vmov.32 q0[1], r12
+; CHECK-BE-NEXT: @ implicit-def: $q2
+; CHECK-BE-NEXT: vmov.32 q0[3], lr
+; CHECK-BE-NEXT: mov.w lr, #0
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt.w lr, #1
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r3, #1
+; CHECK-BE-NEXT: bfi r3, lr, #0, #1
+; CHECK-BE-NEXT: and r1, r3, #3
+; CHECK-BE-NEXT: lsls r3, r3, #31
+; CHECK-BE-NEXT: beq .LBB5_2
+; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
+; CHECK-BE-NEXT: ldr r3, [r2]
+; CHECK-BE-NEXT: vmov.32 q1[1], r3
+; CHECK-BE-NEXT: vrev64.32 q2, q1
+; CHECK-BE-NEXT: .LBB5_2: @ %else
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: bpl .LBB5_4
+; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1
+; CHECK-BE-NEXT: ldr r1, [r2, #4]
+; CHECK-BE-NEXT: vrev64.32 q0, q2
+; CHECK-BE-NEXT: vmov.32 q0[3], r1
+; CHECK-BE-NEXT: vrev64.32 q2, q0
+; CHECK-BE-NEXT: .LBB5_4: @ %else2
+; CHECK-BE-NEXT: vrev64.32 q0, q2
+; CHECK-BE-NEXT: vrev64.32 q2, q1
+; CHECK-BE-NEXT: vmov r2, s11
+; CHECK-BE-NEXT: movs r4, #0
+; CHECK-BE-NEXT: vmov r3, s1
+; CHECK-BE-NEXT: vmov r1, s3
+; CHECK-BE-NEXT: rsbs r5, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: vmov r2, s9
+; CHECK-BE-NEXT: asr.w lr, r3, #31
+; CHECK-BE-NEXT: vmov.32 q1[0], lr
+; CHECK-BE-NEXT: asr.w r12, r1, #31
+; CHECK-BE-NEXT: vmov.32 q1[1], r3
+; CHECK-BE-NEXT: vmov.32 q1[2], r12
+; CHECK-BE-NEXT: vmov.32 q1[3], r1
+; CHECK-BE-NEXT: mov.w r1, #0
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r1, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r4, #1
+; CHECK-BE-NEXT: cmp r4, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r4, #1
+; CHECK-BE-NEXT: bfi r4, r1, #0, #1
+; CHECK-BE-NEXT: and r1, r4, #3
+; CHECK-BE-NEXT: lsls r2, r4, #31
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: vstrne d0, [r0]
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: it mi
+; CHECK-BE-NEXT: vstrmi d1, [r0, #8]
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %0 = load <2 x i32>, <2 x i32>* %mask, align 4
+ %1 = icmp sgt <2 x i32> %0, zeroinitializer
+ %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef)
+ %3 = sext <2 x i32> %2 to <2 x i64>
+ call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1)
+ ret void
+}
+
+define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
+; CHECK-LE-LABEL: foo_sext_v2i64_v2i32_unaligned:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-LE-NEXT: push {r4, r5, r7, lr}
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: ldrd lr, r12, [r1]
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: @ implicit-def: $q1
+; CHECK-LE-NEXT: movs r4, #0
+; CHECK-LE-NEXT: rsbs.w r3, lr, #0
+; CHECK-LE-NEXT: vmov.32 q0[0], lr
+; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT: mov.w lr, #0
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt.w lr, #1
+; CHECK-LE-NEXT: rsbs.w r3, r12, #0
+; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r1, #1
+; CHECK-LE-NEXT: cmp r1, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r1, #1
+; CHECK-LE-NEXT: bfi r1, lr, #0, #1
+; CHECK-LE-NEXT: vmov.32 q0[2], r12
+; CHECK-LE-NEXT: and r3, r1, #3
+; CHECK-LE-NEXT: lsls r1, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrne r1, [r2]
+; CHECK-LE-NEXT: vmovne.32 q1[0], r1
+; CHECK-LE-NEXT: lsls r1, r3, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r1, [r2, #4]
+; CHECK-LE-NEXT: vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT: vmov r2, s0
+; CHECK-LE-NEXT: vmov r3, s4
+; CHECK-LE-NEXT: vmov r1, s6
+; CHECK-LE-NEXT: vmov.32 q1[0], r3
+; CHECK-LE-NEXT: rsbs r5, r2, #0
+; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT: vmov r2, s2
+; CHECK-LE-NEXT: asr.w lr, r3, #31
+; CHECK-LE-NEXT: vmov.32 q1[1], lr
+; CHECK-LE-NEXT: asr.w r12, r1, #31
+; CHECK-LE-NEXT: vmov.32 q1[2], r1
+; CHECK-LE-NEXT: mov.w r1, #0
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r1, #1
+; CHECK-LE-NEXT: vmov.32 q1[3], r12
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r4, #1
+; CHECK-LE-NEXT: cmp r4, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r4, #1
+; CHECK-LE-NEXT: bfi r4, r1, #0, #1
+; CHECK-LE-NEXT: and r1, r4, #3
+; CHECK-LE-NEXT: lsls r2, r4, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: vmovne r2, r3, d2
+; CHECK-LE-NEXT: strdne r2, r3, [r0]
+; CHECK-LE-NEXT: lsls r1, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi r1, r2, d3
+; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8]
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: pop {r4, r5, r7, pc}
+;
+; CHECK-BE-LABEL: foo_sext_v2i64_v2i32_unaligned:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: ldrd r12, lr, [r1]
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: mov.w r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: vmov.32 q0[1], r12
+; CHECK-BE-NEXT: @ implicit-def: $q2
+; CHECK-BE-NEXT: vmov.32 q0[3], lr
+; CHECK-BE-NEXT: mov.w lr, #0
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt.w lr, #1
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r3, #1
+; CHECK-BE-NEXT: bfi r3, lr, #0, #1
+; CHECK-BE-NEXT: and r1, r3, #3
+; CHECK-BE-NEXT: lsls r3, r3, #31
+; CHECK-BE-NEXT: beq .LBB6_2
+; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
+; CHECK-BE-NEXT: ldr r3, [r2]
+; CHECK-BE-NEXT: vmov.32 q1[1], r3
+; CHECK-BE-NEXT: vrev64.32 q2, q1
+; CHECK-BE-NEXT: .LBB6_2: @ %else
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: bpl .LBB6_4
+; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1
+; CHECK-BE-NEXT: ldr r1, [r2, #4]
+; CHECK-BE-NEXT: vrev64.32 q0, q2
+; CHECK-BE-NEXT: vmov.32 q0[3], r1
+; CHECK-BE-NEXT: vrev64.32 q2, q0
+; CHECK-BE-NEXT: .LBB6_4: @ %else2
+; CHECK-BE-NEXT: vrev64.32 q0, q2
+; CHECK-BE-NEXT: vrev64.32 q2, q1
+; CHECK-BE-NEXT: vmov r2, s11
+; CHECK-BE-NEXT: movs r4, #0
+; CHECK-BE-NEXT: vmov r3, s1
+; CHECK-BE-NEXT: vmov r1, s3
+; CHECK-BE-NEXT: rsbs r5, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: vmov r2, s9
+; CHECK-BE-NEXT: asr.w lr, r3, #31
+; CHECK-BE-NEXT: vmov.32 q1[0], lr
+; CHECK-BE-NEXT: asr.w r12, r1, #31
+; CHECK-BE-NEXT: vmov.32 q1[1], r3
+; CHECK-BE-NEXT: vmov.32 q1[2], r12
+; CHECK-BE-NEXT: vmov.32 q1[3], r1
+; CHECK-BE-NEXT: mov.w r1, #0
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r1, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: sbcs.w r2, r4, r2, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r4, #1
+; CHECK-BE-NEXT: cmp r4, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r4, #1
+; CHECK-BE-NEXT: bfi r4, r1, #0, #1
+; CHECK-BE-NEXT: and r1, r4, #3
+; CHECK-BE-NEXT: lsls r2, r4, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: vmovne r2, r3, d0
+; CHECK-BE-NEXT: strdne r3, r2, [r0]
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi r1, r2, d1
+; CHECK-BE-NEXT: strdmi r2, r1, [r0, #8]
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %0 = load <2 x i32>, <2 x i32>* %mask, align 4
+ %1 = icmp sgt <2 x i32> %0, zeroinitializer
+ %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef)
+ %3 = sext <2 x i32> %2 to <2 x i64>
+ call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1)
+ ret void
+}
+
+define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
+; CHECK-LE-LABEL: foo_zext_v2i64_v2i32:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r7, lr}
+; CHECK-LE-NEXT: push {r7, lr}
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: ldrd lr, r12, [r1]
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: @ implicit-def: $q1
+; CHECK-LE-NEXT: rsbs.w r3, lr, #0
+; CHECK-LE-NEXT: vmov.32 q0[0], lr
+; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT: mov.w lr, #0
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt.w lr, #1
+; CHECK-LE-NEXT: rsbs.w r3, r12, #0
+; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r1, #1
+; CHECK-LE-NEXT: cmp r1, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r1, #1
+; CHECK-LE-NEXT: bfi r1, lr, #0, #1
+; CHECK-LE-NEXT: vmov.32 q0[2], r12
+; CHECK-LE-NEXT: and r3, r1, #3
+; CHECK-LE-NEXT: adr.w r12, .LCPI7_0
+; CHECK-LE-NEXT: lsls r1, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrne r1, [r2]
+; CHECK-LE-NEXT: vmovne.32 q1[0], r1
+; CHECK-LE-NEXT: lsls r1, r3, #30
+; CHECK-LE-NEXT: vmov r3, s0
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r1, [r2, #4]
+; CHECK-LE-NEXT: vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vldrw.u32 q2, [r12]
+; CHECK-LE-NEXT: mov.w r12, #0
+; CHECK-LE-NEXT: vand q1, q1, q2
+; CHECK-LE-NEXT: rsbs r1, r3, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-LE-NEXT: vmov r3, s2
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt.w r12, #1
+; CHECK-LE-NEXT: rsbs r1, r3, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r2, #1
+; CHECK-LE-NEXT: cmp r2, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r2, #1
+; CHECK-LE-NEXT: bfi r2, r12, #0, #1
+; CHECK-LE-NEXT: and r1, r2, #3
+; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: vstrne d2, [r0]
+; CHECK-LE-NEXT: lsls r1, r1, #30
+; CHECK-LE-NEXT: it mi
+; CHECK-LE-NEXT: vstrmi d3, [r0, #8]
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: pop {r7, pc}
+; CHECK-LE-NEXT: .p2align 4
+; CHECK-LE-NEXT: @ %bb.1:
+; CHECK-LE-NEXT: .LCPI7_0:
+; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff
+; CHECK-LE-NEXT: .long 0 @ 0x0
+; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff
+; CHECK-LE-NEXT: .long 0 @ 0x0
+;
+; CHECK-BE-LABEL: foo_zext_v2i64_v2i32:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r7, lr}
+; CHECK-BE-NEXT: push {r7, lr}
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: ldrd r12, lr, [r1]
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: mov.w r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: vmov.32 q0[1], r12
+; CHECK-BE-NEXT: @ implicit-def: $q1
+; CHECK-BE-NEXT: vmov.32 q0[3], lr
+; CHECK-BE-NEXT: mov.w lr, #0
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt.w lr, #1
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r3, #1
+; CHECK-BE-NEXT: bfi r3, lr, #0, #1
+; CHECK-BE-NEXT: and r1, r3, #3
+; CHECK-BE-NEXT: lsls r3, r3, #31
+; CHECK-BE-NEXT: beq .LBB7_2
+; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
+; CHECK-BE-NEXT: ldr r3, [r2]
+; CHECK-BE-NEXT: vmov.32 q2[1], r3
+; CHECK-BE-NEXT: vrev64.32 q1, q2
+; CHECK-BE-NEXT: .LBB7_2: @ %else
+; CHECK-BE-NEXT: vrev64.32 q2, q0
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: bpl .LBB7_4
+; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1
+; CHECK-BE-NEXT: ldr r1, [r2, #4]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: vmov.32 q0[3], r1
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: .LBB7_4: @ %else2
+; CHECK-BE-NEXT: vrev64.32 q3, q2
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: vmov r3, s15
+; CHECK-BE-NEXT: adr.w r12, .LCPI7_0
+; CHECK-BE-NEXT: vldrb.u8 q0, [r12]
+; CHECK-BE-NEXT: mov.w r12, #0
+; CHECK-BE-NEXT: vrev64.8 q2, q0
+; CHECK-BE-NEXT: vand q0, q1, q2
+; CHECK-BE-NEXT: rsbs r1, r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-BE-NEXT: vmov r3, s13
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt.w r12, #1
+; CHECK-BE-NEXT: rsbs r1, r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r2, #1
+; CHECK-BE-NEXT: cmp r2, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r2, #1
+; CHECK-BE-NEXT: bfi r2, r12, #0, #1
+; CHECK-BE-NEXT: and r1, r2, #3
+; CHECK-BE-NEXT: lsls r2, r2, #31
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: vstrne d0, [r0]
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: it mi
+; CHECK-BE-NEXT: vstrmi d1, [r0, #8]
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: pop {r7, pc}
+; CHECK-BE-NEXT: .p2align 4
+; CHECK-BE-NEXT: @ %bb.5:
+; CHECK-BE-NEXT: .LCPI7_0:
+; CHECK-BE-NEXT: .long 0 @ 0x0
+; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff
+; CHECK-BE-NEXT: .long 0 @ 0x0
+; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff
+entry:
+ %0 = load <2 x i32>, <2 x i32>* %mask, align 4
+ %1 = icmp sgt <2 x i32> %0, zeroinitializer
+ %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef)
+ %3 = zext <2 x i32> %2 to <2 x i64>
+ call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1)
+ ret void
+}
+
+define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> *%src) {
+; CHECK-LE-LABEL: foo_zext_v2i64_v2i32_unaligned:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r7, lr}
+; CHECK-LE-NEXT: push {r7, lr}
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: ldrd lr, r12, [r1]
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: @ implicit-def: $q1
+; CHECK-LE-NEXT: rsbs.w r3, lr, #0
+; CHECK-LE-NEXT: vmov.32 q0[0], lr
+; CHECK-LE-NEXT: sbcs.w r3, r1, lr, asr #31
+; CHECK-LE-NEXT: mov.w lr, #0
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt.w lr, #1
+; CHECK-LE-NEXT: rsbs.w r3, r12, #0
+; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r1, #1
+; CHECK-LE-NEXT: cmp r1, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r1, #1
+; CHECK-LE-NEXT: bfi r1, lr, #0, #1
+; CHECK-LE-NEXT: vmov.32 q0[2], r12
+; CHECK-LE-NEXT: and r3, r1, #3
+; CHECK-LE-NEXT: adr.w r12, .LCPI8_0
+; CHECK-LE-NEXT: lsls r1, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrne r1, [r2]
+; CHECK-LE-NEXT: vmovne.32 q1[0], r1
+; CHECK-LE-NEXT: lsls r1, r3, #30
+; CHECK-LE-NEXT: vmov r3, s0
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r1, [r2, #4]
+; CHECK-LE-NEXT: vmovmi.32 q1[2], r1
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vldrw.u32 q2, [r12]
+; CHECK-LE-NEXT: mov.w r12, #0
+; CHECK-LE-NEXT: vand q1, q1, q2
+; CHECK-LE-NEXT: rsbs r1, r3, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-LE-NEXT: vmov r3, s2
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt.w r12, #1
+; CHECK-LE-NEXT: rsbs r1, r3, #0
+; CHECK-LE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-LE-NEXT: it lt
+; CHECK-LE-NEXT: movlt r2, #1
+; CHECK-LE-NEXT: cmp r2, #0
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: mvnne r2, #1
+; CHECK-LE-NEXT: bfi r2, r12, #0, #1
+; CHECK-LE-NEXT: and r1, r2, #3
+; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: vmovne r2, r3, d2
+; CHECK-LE-NEXT: strdne r2, r3, [r0]
+; CHECK-LE-NEXT: lsls r1, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi r1, r2, d3
+; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8]
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: pop {r7, pc}
+; CHECK-LE-NEXT: .p2align 4
+; CHECK-LE-NEXT: @ %bb.1:
+; CHECK-LE-NEXT: .LCPI8_0:
+; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff
+; CHECK-LE-NEXT: .long 0 @ 0x0
+; CHECK-LE-NEXT: .long 4294967295 @ 0xffffffff
+; CHECK-LE-NEXT: .long 0 @ 0x0
+;
+; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r7, lr}
+; CHECK-BE-NEXT: push {r7, lr}
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: ldrd r12, lr, [r1]
+; CHECK-BE-NEXT: rsbs.w r1, lr, #0
+; CHECK-BE-NEXT: mov.w r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, lr, asr #31
+; CHECK-BE-NEXT: vmov.32 q0[1], r12
+; CHECK-BE-NEXT: @ implicit-def: $q1
+; CHECK-BE-NEXT: vmov.32 q0[3], lr
+; CHECK-BE-NEXT: mov.w lr, #0
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt.w lr, #1
+; CHECK-BE-NEXT: rsbs.w r1, r12, #0
+; CHECK-BE-NEXT: sbcs.w r1, r3, r12, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r3, #1
+; CHECK-BE-NEXT: cmp r3, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r3, #1
+; CHECK-BE-NEXT: bfi r3, lr, #0, #1
+; CHECK-BE-NEXT: and r1, r3, #3
+; CHECK-BE-NEXT: lsls r3, r3, #31
+; CHECK-BE-NEXT: beq .LBB8_2
+; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
+; CHECK-BE-NEXT: ldr r3, [r2]
+; CHECK-BE-NEXT: vmov.32 q2[1], r3
+; CHECK-BE-NEXT: vrev64.32 q1, q2
+; CHECK-BE-NEXT: .LBB8_2: @ %else
+; CHECK-BE-NEXT: vrev64.32 q2, q0
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: bpl .LBB8_4
+; CHECK-BE-NEXT: @ %bb.3: @ %cond.load1
+; CHECK-BE-NEXT: ldr r1, [r2, #4]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: vmov.32 q0[3], r1
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: .LBB8_4: @ %else2
+; CHECK-BE-NEXT: vrev64.32 q3, q2
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: vmov r3, s15
+; CHECK-BE-NEXT: adr.w r12, .LCPI8_0
+; CHECK-BE-NEXT: vldrb.u8 q0, [r12]
+; CHECK-BE-NEXT: mov.w r12, #0
+; CHECK-BE-NEXT: vrev64.8 q2, q0
+; CHECK-BE-NEXT: vand q0, q1, q2
+; CHECK-BE-NEXT: rsbs r1, r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-BE-NEXT: vmov r3, s13
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt.w r12, #1
+; CHECK-BE-NEXT: rsbs r1, r3, #0
+; CHECK-BE-NEXT: sbcs.w r1, r2, r3, asr #31
+; CHECK-BE-NEXT: it lt
+; CHECK-BE-NEXT: movlt r2, #1
+; CHECK-BE-NEXT: cmp r2, #0
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: mvnne r2, #1
+; CHECK-BE-NEXT: bfi r2, r12, #0, #1
+; CHECK-BE-NEXT: and r1, r2, #3
+; CHECK-BE-NEXT: lsls r2, r2, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: vmovne r2, r3, d0
+; CHECK-BE-NEXT: strdne r3, r2, [r0]
+; CHECK-BE-NEXT: lsls r1, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi r1, r2, d1
+; CHECK-BE-NEXT: strdmi r2, r1, [r0, #8]
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: pop {r7, pc}
+; CHECK-BE-NEXT: .p2align 4
+; CHECK-BE-NEXT: @ %bb.5:
+; CHECK-BE-NEXT: .LCPI8_0:
+; CHECK-BE-NEXT: .long 0 @ 0x0
+; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff
+; CHECK-BE-NEXT: .long 0 @ 0x0
+; CHECK-BE-NEXT: .long 4294967295 @ 0xffffffff
+entry:
+ %0 = load <2 x i32>, <2 x i32>* %mask, align 4
+ %1 = icmp sgt <2 x i32> %0, zeroinitializer
+ %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef)
+ %3 = zext <2 x i32> %2 to <2 x i64>
+ call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1)
+ ret void
+}
+
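
A note on the v2i64 cases above: the new combine only fires when the target
reports the extending masked load as legal, and MVE has no masked load that
widens into 64-bit lanes, so these sext/zext v2i64 tests (and their unaligned
variants) keep the fully expanded, per-lane lowering. A minimal sketch of the
shape that intentionally stays expanded (function and value names here are
invented for illustration):

declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)

; Stays scalarised: no vldr*t form produces 64-bit lanes.
define <2 x i64> @sketch_sext_v2i64(<2 x i32>* %p, <2 x i1> %m) {
entry:
  %l = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %p, i32 4, <2 x i1> %m, <2 x i32> undef)
  %e = sext <2 x i32> %l to <2 x i64>
  ret <2 x i64> %e
}
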
define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
; CHECK-LABEL: foo_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
@@ -255,77 +735,11 @@ entry:
define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
; CHECK-LABEL: foo_sext_v8i16_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vcmp.s16 gt, q0, zr
-; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: vmrs lr, p0
-; CHECK-NEXT: and r3, lr, #1
-; CHECK-NEXT: ubfx r1, lr, #2, #1
-; CHECK-NEXT: rsb.w r12, r3, #0
-; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r12, #0, #1
-; CHECK-NEXT: bfi r3, r1, #1, #1
-; CHECK-NEXT: ubfx r1, lr, #4, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #2, #1
-; CHECK-NEXT: ubfx r1, lr, #6, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #3, #1
-; CHECK-NEXT: ubfx r1, lr, #8, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #4, #1
-; CHECK-NEXT: ubfx r1, lr, #10, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #5, #1
-; CHECK-NEXT: ubfx r1, lr, #12, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #6, #1
-; CHECK-NEXT: ubfx r1, lr, #14, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #7, #1
-; CHECK-NEXT: uxtb r1, r3
-; CHECK-NEXT: lsls r3, r3, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r3, [r2]
-; CHECK-NEXT: vmovne.16 q0[0], r3
-; CHECK-NEXT: lsls r3, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #1]
-; CHECK-NEXT: vmovmi.16 q0[1], r3
-; CHECK-NEXT: lsls r3, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #2]
-; CHECK-NEXT: vmovmi.16 q0[2], r3
-; CHECK-NEXT: lsls r3, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #3]
-; CHECK-NEXT: vmovmi.16 q0[3], r3
-; CHECK-NEXT: lsls r3, r1, #27
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #4]
-; CHECK-NEXT: vmovmi.16 q0[4], r3
-; CHECK-NEXT: lsls r3, r1, #26
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #5]
-; CHECK-NEXT: vmovmi.16 q0[5], r3
-; CHECK-NEXT: lsls r3, r1, #25
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #6]
-; CHECK-NEXT: vmovmi.16 q0[6], r3
-; CHECK-NEXT: lsls r1, r1, #24
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r1, [r2, #7]
-; CHECK-NEXT: vmovmi.16 q0[7], r1
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vptt.s16 gt, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r2]
; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
%1 = icmp sgt <8 x i16> %0, zeroinitializer
@@ -338,77 +752,11 @@ entry:
define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
; CHECK-LABEL: foo_zext_v8i16_v8i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vcmp.s16 gt, q0, zr
-; CHECK-NEXT: @ implicit-def: $q0
-; CHECK-NEXT: vmrs lr, p0
-; CHECK-NEXT: and r3, lr, #1
-; CHECK-NEXT: ubfx r1, lr, #2, #1
-; CHECK-NEXT: rsb.w r12, r3, #0
-; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r12, #0, #1
-; CHECK-NEXT: bfi r3, r1, #1, #1
-; CHECK-NEXT: ubfx r1, lr, #4, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #2, #1
-; CHECK-NEXT: ubfx r1, lr, #6, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #3, #1
-; CHECK-NEXT: ubfx r1, lr, #8, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #4, #1
-; CHECK-NEXT: ubfx r1, lr, #10, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #5, #1
-; CHECK-NEXT: ubfx r1, lr, #12, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #6, #1
-; CHECK-NEXT: ubfx r1, lr, #14, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #7, #1
-; CHECK-NEXT: uxtb r1, r3
-; CHECK-NEXT: lsls r3, r3, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrbne r3, [r2]
-; CHECK-NEXT: vmovne.16 q0[0], r3
-; CHECK-NEXT: lsls r3, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #1]
-; CHECK-NEXT: vmovmi.16 q0[1], r3
-; CHECK-NEXT: lsls r3, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #2]
-; CHECK-NEXT: vmovmi.16 q0[2], r3
-; CHECK-NEXT: lsls r3, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #3]
-; CHECK-NEXT: vmovmi.16 q0[3], r3
-; CHECK-NEXT: lsls r3, r1, #27
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #4]
-; CHECK-NEXT: vmovmi.16 q0[4], r3
-; CHECK-NEXT: lsls r3, r1, #26
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #5]
-; CHECK-NEXT: vmovmi.16 q0[5], r3
-; CHECK-NEXT: lsls r3, r1, #25
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r3, [r2, #6]
-; CHECK-NEXT: vmovmi.16 q0[6], r3
-; CHECK-NEXT: lsls r1, r1, #24
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: ldrbmi r1, [r2, #7]
-; CHECK-NEXT: vmovmi.16 q0[7], r1
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vptt.s16 gt, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r2]
; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
%1 = icmp sgt <8 x i16> %0, zeroinitializer
@@ -435,74 +783,23 @@ entry:
}
define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
-; CHECK-LABEL: foo_trunc_v8i8_v8i16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .pad #8
-; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vpt.s16 gt, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r2]
-; CHECK-NEXT: vmrs r1, p0
-; CHECK-NEXT: and r2, r1, #1
-; CHECK-NEXT: rsbs r3, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r3, #0, #1
-; CHECK-NEXT: ubfx r3, r1, #2, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #1, #1
-; CHECK-NEXT: ubfx r3, r1, #4, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #2, #1
-; CHECK-NEXT: ubfx r3, r1, #6, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #3, #1
-; CHECK-NEXT: ubfx r3, r1, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #4, #1
-; CHECK-NEXT: ubfx r3, r1, #10, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #5, #1
-; CHECK-NEXT: ubfx r3, r1, #12, #1
-; CHECK-NEXT: ubfx r1, r1, #14, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r2, r3, #6, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r2, r1, #7, #1
-; CHECK-NEXT: uxtb r1, r2
-; CHECK-NEXT: lsls r2, r2, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: vmovne.u16 r2, q0[0]
-; CHECK-NEXT: strbne r2, [r0]
-; CHECK-NEXT: lsls r2, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r2, q0[1]
-; CHECK-NEXT: strbmi r2, [r0, #1]
-; CHECK-NEXT: lsls r2, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r2, q0[2]
-; CHECK-NEXT: strbmi r2, [r0, #2]
-; CHECK-NEXT: lsls r2, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r2, q0[3]
-; CHECK-NEXT: strbmi r2, [r0, #3]
-; CHECK-NEXT: lsls r2, r1, #27
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r2, q0[4]
-; CHECK-NEXT: strbmi r2, [r0, #4]
-; CHECK-NEXT: lsls r2, r1, #26
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r2, q0[5]
-; CHECK-NEXT: strbmi r2, [r0, #5]
-; CHECK-NEXT: lsls r2, r1, #25
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r2, q0[6]
-; CHECK-NEXT: strbmi r2, [r0, #6]
-; CHECK-NEXT: lsls r1, r1, #24
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi.u16 r1, q0[7]
-; CHECK-NEXT: strbmi r1, [r0, #7]
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: foo_trunc_v8i8_v8i16:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrh.u16 q0, [r1]
+; CHECK-LE-NEXT: vptt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u16 q0, [r2]
+; CHECK-LE-NEXT: vstrbt.16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: foo_trunc_v8i8_v8i16:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrh.u16 q0, [r1]
+; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT: vldrht.u16 q0, [r2]
+; CHECK-BE-NEXT: vpst
+; CHECK-BE-NEXT: vstrbt.16 q0, [r0]
+; CHECK-BE-NEXT: bx lr
entry:
%0 = load <8 x i16>, <8 x i16>* %mask, align 2
%1 = icmp sgt <8 x i16> %0, zeroinitializer
@@ -513,45 +810,23 @@ entry:
}
define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
-; CHECK-LABEL: foo_trunc_v4i8_v4i32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vpt.s32 gt, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r2]
-; CHECK-NEXT: vmrs r2, p0
-; CHECK-NEXT: and r1, r2, #1
-; CHECK-NEXT: rsbs r3, r1, #0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: bfi r1, r3, #0, #1
-; CHECK-NEXT: ubfx r3, r2, #4, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #1, #1
-; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: ubfx r2, r2, #12, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #2, #1
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: bfi r1, r2, #3, #1
-; CHECK-NEXT: lsls r2, r1, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: vmovne r2, s0
-; CHECK-NEXT: strbne r2, [r0]
-; CHECK-NEXT: lsls r2, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi r2, s1
-; CHECK-NEXT: strbmi r2, [r0, #1]
-; CHECK-NEXT: lsls r2, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi r2, s2
-; CHECK-NEXT: strbmi r2, [r0, #2]
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi r1, s3
-; CHECK-NEXT: strbmi r1, [r0, #3]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: foo_trunc_v4i8_v4i32:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrw.u32 q0, [r1]
+; CHECK-LE-NEXT: vptt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrwt.u32 q0, [r2]
+; CHECK-LE-NEXT: vstrbt.32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: foo_trunc_v4i8_v4i32:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrw.u32 q0, [r1]
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrwt.u32 q0, [r2]
+; CHECK-BE-NEXT: vpst
+; CHECK-BE-NEXT: vstrbt.32 q0, [r0]
+; CHECK-BE-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
%1 = icmp sgt <4 x i32> %0, zeroinitializer
@@ -562,45 +837,23 @@ entry:
}
define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
-; CHECK-LABEL: foo_trunc_v4i16_v4i32:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vpt.s32 gt, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r2]
-; CHECK-NEXT: vmrs r2, p0
-; CHECK-NEXT: and r1, r2, #1
-; CHECK-NEXT: rsbs r3, r1, #0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: bfi r1, r3, #0, #1
-; CHECK-NEXT: ubfx r3, r2, #4, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #1, #1
-; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: ubfx r2, r2, #12, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: bfi r1, r3, #2, #1
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: bfi r1, r2, #3, #1
-; CHECK-NEXT: lsls r2, r1, #31
-; CHECK-NEXT: itt ne
-; CHECK-NEXT: vmovne r2, s0
-; CHECK-NEXT: strhne r2, [r0]
-; CHECK-NEXT: lsls r2, r1, #30
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi r2, s1
-; CHECK-NEXT: strhmi r2, [r0, #2]
-; CHECK-NEXT: lsls r2, r1, #29
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi r2, s2
-; CHECK-NEXT: strhmi r2, [r0, #4]
-; CHECK-NEXT: lsls r1, r1, #28
-; CHECK-NEXT: itt mi
-; CHECK-NEXT: vmovmi r1, s3
-; CHECK-NEXT: strhmi r1, [r0, #6]
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: foo_trunc_v4i16_v4i32:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrw.u32 q0, [r1]
+; CHECK-LE-NEXT: vptt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrwt.u32 q0, [r2]
+; CHECK-LE-NEXT: vstrht.32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: foo_trunc_v4i16_v4i32:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrw.u32 q0, [r1]
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrwt.u32 q0, [r2]
+; CHECK-BE-NEXT: vpst
+; CHECK-BE-NEXT: vstrht.32 q0, [r0]
+; CHECK-BE-NEXT: bx lr
entry:
%0 = load <4 x i32>, <4 x i32>* %mask, align 4
%1 = icmp sgt <4 x i32> %0, zeroinitializer
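
As the trunc test diffs above show, the masked-store side collapses as well:
a trunc feeding a masked store is now selected as a single predicated
narrowing store (vstrbt.16, vstrbt.32 or vstrht.32) instead of a per-lane
IT-block expansion. Roughly (sketch only, names invented for illustration):

declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)

; trunc + masked store of the narrowed lanes; selected as vstrbt.16.
define void @sketch_trunc_store(<8 x i8>* %p, <8 x i16> %v, <8 x i1> %m) {
entry:
  %t = trunc <8 x i16> %v to <8 x i8>
  call void @llvm.masked.store.v8i8(<8 x i8> %t, <8 x i8>* %p, i32 1, <8 x i1> %m)
  ret void
}
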
@@ -642,6 +895,270 @@ entry:
ret void
}
+define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%src) {
+; CHECK-LABEL: foo_v4f32_v4f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: vldrh.s32 q0, [r1]
+; CHECK-NEXT: mov.w lr, #0
+; CHECK-NEXT: @ implicit-def: $q1
+; CHECK-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-NEXT: vmrs r3, p0
+; CHECK-NEXT: and r1, r3, #1
+; CHECK-NEXT: rsb.w r12, r1, #0
+; CHECK-NEXT: ubfx r1, r3, #4, #1
+; CHECK-NEXT: bfi lr, r12, #0, #1
+; CHECK-NEXT: rsbs r1, r1, #0
+; CHECK-NEXT: bfi lr, r1, #1, #1
+; CHECK-NEXT: ubfx r1, r3, #8, #1
+; CHECK-NEXT: rsbs r1, r1, #0
+; CHECK-NEXT: bfi lr, r1, #2, #1
+; CHECK-NEXT: ubfx r1, r3, #12, #1
+; CHECK-NEXT: rsbs r1, r1, #0
+; CHECK-NEXT: bfi lr, r1, #3, #1
+; CHECK-NEXT: lsls.w r1, lr, #31
+; CHECK-NEXT: beq .LBB18_2
+; CHECK-NEXT: @ %bb.1: @ %cond.load
+; CHECK-NEXT: vldr.16 s4, [r2]
+; CHECK-NEXT: .LBB18_2: @ %else
+; CHECK-NEXT: lsls.w r1, lr, #30
+; CHECK-NEXT: bpl .LBB18_6
+; CHECK-NEXT: @ %bb.3: @ %cond.load1
+; CHECK-NEXT: vldr.16 s0, [r2, #2]
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmovx.f16 s4, s5
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q0[0], r3
+; CHECK-NEXT: vmov.16 q0[1], r1
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmov.16 q0[2], r1
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vmov.16 q0[3], r1
+; CHECK-NEXT: lsls.w r1, lr, #29
+; CHECK-NEXT: bmi .LBB18_7
+; CHECK-NEXT: .LBB18_4:
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: lsls.w r1, lr, #28
+; CHECK-NEXT: bmi .LBB18_8
+; CHECK-NEXT: .LBB18_5:
+; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: b .LBB18_9
+; CHECK-NEXT: .LBB18_6:
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: lsls.w r1, lr, #29
+; CHECK-NEXT: bpl .LBB18_4
+; CHECK-NEXT: .LBB18_7: @ %cond.load4
+; CHECK-NEXT: vmovx.f16 s4, s0
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vldr.16 s4, [r2, #4]
+; CHECK-NEXT: vmov.16 q2[0], r1
+; CHECK-NEXT: vmovx.f16 s0, s1
+; CHECK-NEXT: vmov.16 q2[1], r3
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vmov.16 q2[2], r1
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q2[3], r1
+; CHECK-NEXT: lsls.w r1, lr, #28
+; CHECK-NEXT: bpl .LBB18_5
+; CHECK-NEXT: .LBB18_8: @ %cond.load7
+; CHECK-NEXT: vmovx.f16 s0, s8
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q1[0], r3
+; CHECK-NEXT: vldr.16 s0, [r2, #6]
+; CHECK-NEXT: vmov.16 q1[1], r1
+; CHECK-NEXT: vmov r1, s9
+; CHECK-NEXT: vmov.16 q1[2], r1
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q1[3], r1
+; CHECK-NEXT: .LBB18_9: @ %else8
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: vmovx.f16 s0, s5
+; CHECK-NEXT: vcvtb.f32.f16 s3, s0
+; CHECK-NEXT: vmovx.f16 s8, s4
+; CHECK-NEXT: vcvtb.f32.f16 s2, s5
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vcvtb.f32.f16 s1, s8
+; CHECK-NEXT: vcvtb.f32.f16 s0, s4
+; CHECK-NEXT: and r3, r2, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: bfi r1, r3, #0, #1
+; CHECK-NEXT: ubfx r3, r2, #4, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: bfi r1, r3, #1, #1
+; CHECK-NEXT: ubfx r3, r2, #8, #1
+; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: bfi r1, r3, #2, #1
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: bfi r1, r2, #3, #1
+; CHECK-NEXT: lsls r2, r1, #31
+; CHECK-NEXT: ittt ne
+; CHECK-NEXT: vstrne s0, [sp, #12]
+; CHECK-NEXT: ldrne r2, [sp, #12]
+; CHECK-NEXT: strne r2, [r0]
+; CHECK-NEXT: lsls r2, r1, #30
+; CHECK-NEXT: ittt mi
+; CHECK-NEXT: vstrmi s1, [sp, #8]
+; CHECK-NEXT: ldrmi r2, [sp, #8]
+; CHECK-NEXT: strmi r2, [r0, #4]
+; CHECK-NEXT: lsls r2, r1, #29
+; CHECK-NEXT: ittt mi
+; CHECK-NEXT: vstrmi s2, [sp, #4]
+; CHECK-NEXT: ldrmi r2, [sp, #4]
+; CHECK-NEXT: strmi r2, [r0, #8]
+; CHECK-NEXT: lsls r1, r1, #28
+; CHECK-NEXT: ittt mi
+; CHECK-NEXT: vstrmi s3, [sp]
+; CHECK-NEXT: ldrmi r1, [sp]
+; CHECK-NEXT: strmi r1, [r0, #12]
+; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %0 = load <4 x i16>, <4 x i16>* %mask, align 2
+ %1 = icmp sgt <4 x i16> %0, zeroinitializer
+ %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef)
+ %3 = fpext <4 x half> %2 to <4 x float>
+ call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%src) {
+; CHECK-LABEL: foo_v4f32_v4f16_unaligned:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: vldrh.s32 q0, [r1]
+; CHECK-NEXT: mov.w lr, #0
+; CHECK-NEXT: @ implicit-def: $q1
+; CHECK-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-NEXT: vmrs r3, p0
+; CHECK-NEXT: and r1, r3, #1
+; CHECK-NEXT: rsb.w r12, r1, #0
+; CHECK-NEXT: ubfx r1, r3, #4, #1
+; CHECK-NEXT: bfi lr, r12, #0, #1
+; CHECK-NEXT: rsbs r1, r1, #0
+; CHECK-NEXT: bfi lr, r1, #1, #1
+; CHECK-NEXT: ubfx r1, r3, #8, #1
+; CHECK-NEXT: rsbs r1, r1, #0
+; CHECK-NEXT: bfi lr, r1, #2, #1
+; CHECK-NEXT: ubfx r1, r3, #12, #1
+; CHECK-NEXT: rsbs r1, r1, #0
+; CHECK-NEXT: bfi lr, r1, #3, #1
+; CHECK-NEXT: lsls.w r1, lr, #31
+; CHECK-NEXT: beq .LBB19_2
+; CHECK-NEXT: @ %bb.1: @ %cond.load
+; CHECK-NEXT: vldr.16 s4, [r2]
+; CHECK-NEXT: .LBB19_2: @ %else
+; CHECK-NEXT: lsls.w r1, lr, #30
+; CHECK-NEXT: bpl .LBB19_6
+; CHECK-NEXT: @ %bb.3: @ %cond.load1
+; CHECK-NEXT: vldr.16 s0, [r2, #2]
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmovx.f16 s4, s5
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q0[0], r3
+; CHECK-NEXT: vmov.16 q0[1], r1
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmov.16 q0[2], r1
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vmov.16 q0[3], r1
+; CHECK-NEXT: lsls.w r1, lr, #29
+; CHECK-NEXT: bmi .LBB19_7
+; CHECK-NEXT: .LBB19_4:
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: lsls.w r1, lr, #28
+; CHECK-NEXT: bmi .LBB19_8
+; CHECK-NEXT: .LBB19_5:
+; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: b .LBB19_9
+; CHECK-NEXT: .LBB19_6:
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: lsls.w r1, lr, #29
+; CHECK-NEXT: bpl .LBB19_4
+; CHECK-NEXT: .LBB19_7: @ %cond.load4
+; CHECK-NEXT: vmovx.f16 s4, s0
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vldr.16 s4, [r2, #4]
+; CHECK-NEXT: vmov.16 q2[0], r1
+; CHECK-NEXT: vmovx.f16 s0, s1
+; CHECK-NEXT: vmov.16 q2[1], r3
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vmov.16 q2[2], r1
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q2[3], r1
+; CHECK-NEXT: lsls.w r1, lr, #28
+; CHECK-NEXT: bpl .LBB19_5
+; CHECK-NEXT: .LBB19_8: @ %cond.load7
+; CHECK-NEXT: vmovx.f16 s0, s8
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q1[0], r3
+; CHECK-NEXT: vldr.16 s0, [r2, #6]
+; CHECK-NEXT: vmov.16 q1[1], r1
+; CHECK-NEXT: vmov r1, s9
+; CHECK-NEXT: vmov.16 q1[2], r1
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vmov.16 q1[3], r1
+; CHECK-NEXT: .LBB19_9: @ %else8
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: vmovx.f16 s0, s5
+; CHECK-NEXT: vcvtb.f32.f16 s3, s0
+; CHECK-NEXT: vmovx.f16 s8, s4
+; CHECK-NEXT: vcvtb.f32.f16 s2, s5
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vcvtb.f32.f16 s1, s8
+; CHECK-NEXT: vcvtb.f32.f16 s0, s4
+; CHECK-NEXT: and r3, r2, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: bfi r1, r3, #0, #1
+; CHECK-NEXT: ubfx r3, r2, #4, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: bfi r1, r3, #1, #1
+; CHECK-NEXT: ubfx r3, r2, #8, #1
+; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: bfi r1, r3, #2, #1
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: bfi r1, r2, #3, #1
+; CHECK-NEXT: lsls r2, r1, #31
+; CHECK-NEXT: ittt ne
+; CHECK-NEXT: vstrne s0, [sp, #12]
+; CHECK-NEXT: ldrne r2, [sp, #12]
+; CHECK-NEXT: strne r2, [r0]
+; CHECK-NEXT: lsls r2, r1, #30
+; CHECK-NEXT: ittt mi
+; CHECK-NEXT: vstrmi s1, [sp, #8]
+; CHECK-NEXT: ldrmi r2, [sp, #8]
+; CHECK-NEXT: strmi r2, [r0, #4]
+; CHECK-NEXT: lsls r2, r1, #29
+; CHECK-NEXT: ittt mi
+; CHECK-NEXT: vstrmi s2, [sp, #4]
+; CHECK-NEXT: ldrmi r2, [sp, #4]
+; CHECK-NEXT: strmi r2, [r0, #8]
+; CHECK-NEXT: lsls r1, r1, #28
+; CHECK-NEXT: ittt mi
+; CHECK-NEXT: vstrmi s3, [sp]
+; CHECK-NEXT: ldrmi r1, [sp]
+; CHECK-NEXT: strmi r1, [r0, #12]
+; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %0 = load <4 x i16>, <4 x i16>* %mask, align 2
+ %1 = icmp sgt <4 x i16> %0, zeroinitializer
+ %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef)
+ %3 = fpext <4 x half> %2 to <4 x float>
+ call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 1, <4 x i1> %1)
+ ret void
+}
+
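
The v4f32/v4f16 tests above stay fully expanded: the new fold is wired into
the integer sign/zero-extend combines, so an fpext of a masked load is not
rewritten, and MVE has no widening f16-to-f32 masked load in any case. The
shape that stays expanded (sketch, names invented for illustration):

declare <4 x half> @llvm.masked.load.v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)

; fpext of a masked load: not covered by the combine, stays expanded.
define <4 x float> @sketch_fpext(<4 x half>* %p, <4 x i1> %m) {
entry:
  %l = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %p, i32 2, <4 x i1> %m, <4 x half> undef)
  %e = fpext <4 x half> %l to <4 x float>
  ret <4 x float> %e
}
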
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
@@ -649,13 +1166,16 @@ declare void @llvm.masked.store.v8f16(<8
declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <4 x half> @llvm.masked.load.v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
declare <4 x i16> @llvm.masked.load.v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
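
Taken together, the tests in this file pin down the shape of the new DAG
combine: a sign or zero extend of a plain masked load with a single use
becomes one extending masked load, which MVE selects directly as
vldrbt.s16/u16, vldrbt.s32/u32 or vldrht.s32/u32. In IR terms the input
pattern is simply (illustrative sketch, names invented):

declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)

; sext(masked.load) -> a single vldrbt.s16 under a VPT block.
define <8 x i16> @sketch_ext_of_masked_load(<8 x i8>* %p, <8 x i1> %m) {
entry:
  %l = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %p, i32 1, <8 x i1> %m, <8 x i8> undef)
  %e = sext <8 x i8> %l to <8 x i16>
  ret <8 x i16> %e
}
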
Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-load.ll Thu Oct 17 00:55:55 2019
@@ -45,17 +45,82 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT: @ implicit-def: $q0
+; CHECK-LE-NEXT: vmrs r2, p0
+; CHECK-LE-NEXT: and r1, r2, #1
+; CHECK-LE-NEXT: rsbs r3, r1, #0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrne r2, [r0]
+; CHECK-LE-NEXT: vmovne.32 q0[0], r2
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r2, [r0, #4]
+; CHECK-LE-NEXT: vmovmi.32 q0[1], r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r2, [r0, #8]
+; CHECK-LE-NEXT: vmovmi.32 q0[2], r2
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r0, [r0, #12]
+; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
+; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i32_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-BE-NEXT: vrev32.8 q1, q0
+; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT: @ implicit-def: $q1
+; CHECK-BE-NEXT: vmrs r2, p0
+; CHECK-BE-NEXT: and r1, r2, #1
+; CHECK-BE-NEXT: rsbs r3, r1, #0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: bfi r1, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: ldrne r2, [r0]
+; CHECK-BE-NEXT: vmovne.32 q1[0], r2
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrmi r2, [r0, #4]
+; CHECK-BE-NEXT: vmovmi.32 q1[1], r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrmi r2, [r0, #8]
+; CHECK-BE-NEXT: vmovmi.32 q1[2], r2
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrmi r0, [r0, #12]
+; CHECK-BE-NEXT: vmovmi.32 q1[3], r0
; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
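
The regression in the hunk above is the flip side of the new handling: a
v4i32 masked load with align 1 was previously emitted as vldrbt.u8 and
reinterpreted, but the predicated load forms are now selected against the
element alignment, so the under-aligned case falls back to the expanded
per-lane sequence (the align1 tests below show the same for the extending
forms). Only the naturally aligned variant of this sketch would now fold
(illustrative, names invented):

declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)

; align 2 (natural for i16): folds to vldrht.u32. With i32 1 instead,
; the load stays expanded, as in the align1 tests below.
define <4 x i32> @sketch_aligned_fold(<4 x i16>* %p, <4 x i1> %m) {
entry:
  %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %p, i32 2, <4 x i1> %m, <4 x i16> undef)
  %e = zext <4 x i16> %l to <4 x i32>
  ret <4 x i32> %e
}
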
@@ -85,6 +150,320 @@ entry:
ret <4 x i32> %l
}
+define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_zero(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: zext16_masked_v4i32_align2_zero:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext16_masked_v4i32_align2_zero:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer)
+ %ext = zext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_undef(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: zext16_masked_v4i32_align2_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext16_masked_v4i32_align2_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %ext = zext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align1_undef(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: zext16_masked_v4i32_align1_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT: @ implicit-def: $q0
+; CHECK-LE-NEXT: vmrs r2, p0
+; CHECK-LE-NEXT: and r1, r2, #1
+; CHECK-LE-NEXT: rsbs r3, r1, #0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrhne r2, [r0]
+; CHECK-LE-NEXT: vmovne.32 q0[0], r2
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-LE-NEXT: vmovmi.32 q0[1], r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-LE-NEXT: vmovmi.32 q0[2], r2
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r0, [r0, #6]
+; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
+; CHECK-LE-NEXT: vmovlb.u16 q0, q0
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext16_masked_v4i32_align1_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: @ implicit-def: $q0
+; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT: vmrs r2, p0
+; CHECK-BE-NEXT: and r1, r2, #1
+; CHECK-BE-NEXT: rsbs r3, r1, #0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: bfi r1, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: ldrhne r2, [r0]
+; CHECK-BE-NEXT: vmovne.32 q0[0], r2
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-BE-NEXT: vmovmi.32 q0[1], r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-BE-NEXT: vmovmi.32 q0[2], r2
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r0, [r0, #6]
+; CHECK-BE-NEXT: vmovmi.32 q0[3], r0
+; CHECK-BE-NEXT: vmovlb.u16 q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 1, <4 x i1> %c, <4 x i16> undef)
+ %ext = zext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_other(<4 x i16> *%dest, <4 x i16> %a) {
+; CHECK-LE-LABEL: zext16_masked_v4i32_align2_other:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.u16 q1, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u32 q0, [r0]
+; CHECK-LE-NEXT: vpsel q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext16_masked_v4i32_align2_other:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.u16 q0, q1
+; CHECK-BE-NEXT: vmovlb.s16 q1, q1
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u32 q1, [r0]
+; CHECK-BE-NEXT: vpsel q1, q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i16> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> %a)
+ %ext = zext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align2_zero(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: sext16_masked_v4i32_align2_zero:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.s32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext16_masked_v4i32_align2_zero:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.s32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer)
+ %sext = sext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %sext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align2_undef(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: sext16_masked_v4i32_align2_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.s32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext16_masked_v4i32_align2_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.s32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %sext = sext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %sext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align1_undef(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: sext16_masked_v4i32_align1_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT: @ implicit-def: $q0
+; CHECK-LE-NEXT: vmrs r2, p0
+; CHECK-LE-NEXT: and r1, r2, #1
+; CHECK-LE-NEXT: rsbs r3, r1, #0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrhne r2, [r0]
+; CHECK-LE-NEXT: vmovne.32 q0[0], r2
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-LE-NEXT: vmovmi.32 q0[1], r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-LE-NEXT: vmovmi.32 q0[2], r2
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r0, [r0, #6]
+; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext16_masked_v4i32_align1_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: @ implicit-def: $q0
+; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT: vmrs r2, p0
+; CHECK-BE-NEXT: and r1, r2, #1
+; CHECK-BE-NEXT: rsbs r3, r1, #0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: bfi r1, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: ldrhne r2, [r0]
+; CHECK-BE-NEXT: vmovne.32 q0[0], r2
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-BE-NEXT: vmovmi.32 q0[1], r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-BE-NEXT: vmovmi.32 q0[2], r2
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r0, [r0, #6]
+; CHECK-BE-NEXT: vmovmi.32 q0[3], r0
+; CHECK-BE-NEXT: vmovlb.s16 q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 1, <4 x i1> %c, <4 x i16> undef)
+ %sext = sext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %sext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_masked_v4i32_align2_other(<4 x i16> *%dest, <4 x i16> %a) {
+; CHECK-LE-LABEL: sext16_masked_v4i32_align2_other:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.s32 q1, [r0]
+; CHECK-LE-NEXT: vpsel q0, q1, q0
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext16_masked_v4i32_align2_other:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.s16 q0, q1
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrht.s32 q1, [r0]
+; CHECK-BE-NEXT: vpsel q1, q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i16> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> %a)
+ %sext = sext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %sext
+}
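; Note the contrast between the two tests above: with only align 1 known,
; the v4i16 sext masked load is scalarized into conditional ldrh loads
; followed by a vmovlb.s16, whereas align 2 permits a single extending
; vldrht.s32 under the predicate.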
+
define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_preinc:
; CHECK-LE: @ %bb.0: @ %entry
@@ -139,25 +518,18 @@ entry:
ret i8* %z
}
-
-
define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_align4_zero:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
; CHECK-LE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_align4_zero:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vmov.i32 q1, #0x0
-; CHECK-BE-NEXT: vrev64.16 q2, q0
-; CHECK-BE-NEXT: vrev32.16 q1, q1
-; CHECK-BE-NEXT: vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-BE-NEXT: vpsel q1, q0, q1
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u16 q1, [r0]
; CHECK-BE-NEXT: vrev64.16 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
@@ -166,14 +538,14 @@ entry:
ret <8 x i16> %l
}
-define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 x i16> %a) {
-; CHECK-LE-LABEL: masked_v8i16_align4_undef:
+define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align2_undef(<8 x i16> *%dest, <8 x i16> %a) {
+; CHECK-LE-LABEL: masked_v8i16_align2_undef:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
; CHECK-LE-NEXT: vldrht.u16 q0, [r0]
; CHECK-LE-NEXT: bx lr
;
-; CHECK-BE-LABEL: masked_v8i16_align4_undef:
+; CHECK-BE-LABEL: masked_v8i16_align2_undef:
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
@@ -189,17 +561,140 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT: .pad #8
+; CHECK-LE-NEXT: sub sp, #8
+; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr
+; CHECK-LE-NEXT: @ implicit-def: $q0
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r2, r1, #1
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: bfi r2, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #6, #1
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: bfi r2, r1, #7, #1
+; CHECK-LE-NEXT: uxtb r1, r2
+; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrhne r2, [r0]
+; CHECK-LE-NEXT: vmovne.16 q0[0], r2
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-LE-NEXT: vmovmi.16 q0[1], r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-LE-NEXT: vmovmi.16 q0[2], r2
+; CHECK-LE-NEXT: lsls r2, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #6]
+; CHECK-LE-NEXT: vmovmi.16 q0[3], r2
+; CHECK-LE-NEXT: lsls r2, r1, #27
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #8]
+; CHECK-LE-NEXT: vmovmi.16 q0[4], r2
+; CHECK-LE-NEXT: lsls r2, r1, #26
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #10]
+; CHECK-LE-NEXT: vmovmi.16 q0[5], r2
+; CHECK-LE-NEXT: lsls r2, r1, #25
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #12]
+; CHECK-LE-NEXT: vmovmi.16 q0[6], r2
+; CHECK-LE-NEXT: lsls r1, r1, #24
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r0, [r0, #14]
+; CHECK-LE-NEXT: vmovmi.16 q0[7], r0
+; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #8
+; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: vrev64.16 q1, q0
-; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-BE-NEXT: vrev16.8 q1, q0
+; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr
+; CHECK-BE-NEXT: @ implicit-def: $q1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: bfi r2, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #6, #1
+; CHECK-BE-NEXT: rsbs r1, r1, #0
+; CHECK-BE-NEXT: bfi r2, r1, #7, #1
+; CHECK-BE-NEXT: uxtb r1, r2
+; CHECK-BE-NEXT: lsls r2, r2, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: ldrhne r2, [r0]
+; CHECK-BE-NEXT: vmovne.16 q1[0], r2
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-BE-NEXT: vmovmi.16 q1[1], r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-BE-NEXT: vmovmi.16 q1[2], r2
+; CHECK-BE-NEXT: lsls r2, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #6]
+; CHECK-BE-NEXT: vmovmi.16 q1[3], r2
+; CHECK-BE-NEXT: lsls r2, r1, #27
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #8]
+; CHECK-BE-NEXT: vmovmi.16 q1[4], r2
+; CHECK-BE-NEXT: lsls r2, r1, #26
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #10]
+; CHECK-BE-NEXT: vmovmi.16 q1[5], r2
+; CHECK-BE-NEXT: lsls r2, r1, #25
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #12]
+; CHECK-BE-NEXT: vmovmi.16 q1[6], r2
+; CHECK-BE-NEXT: lsls r1, r1, #24
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r0, [r0, #14]
+; CHECK-BE-NEXT: vmovmi.16 q1[7], r0
; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -229,6 +724,308 @@ entry:
ret <8 x i16> %l
}
+define arm_aapcs_vfpcc <8 x i16> @sext8_masked_v8i16_align1_zero(<8 x i8> *%dest, <8 x i8> %a) {
+; CHECK-LE-LABEL: sext8_masked_v8i16_align1_zero:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.s16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext8_masked_v8i16_align1_zero:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.s16 q1, [r0]
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i8> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> zeroinitializer)
+ %ext = sext <8 x i8> %l to <8 x i16>
+ ret <8 x i16> %ext
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_masked_v8i16_align1_undef(<8 x i8> *%dest, <8 x i8> %a) {
+; CHECK-LE-LABEL: sext8_masked_v8i16_align1_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.s16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext8_masked_v8i16_align1_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.s16 q1, [r0]
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i8> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %ext = sext <8 x i8> %l to <8 x i16>
+ ret <8 x i16> %ext
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_masked_v8i16_align1_other(<8 x i8> *%dest, <8 x i8> %a) {
+; CHECK-LE-LABEL: sext8_masked_v8i16_align1_other:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.s16 q1, [r0]
+; CHECK-LE-NEXT: vpsel q0, q1, q0
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext8_masked_v8i16_align1_other:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.s16 q1, [r0]
+; CHECK-BE-NEXT: vpsel q1, q1, q0
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i8> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> %a)
+ %ext = sext <8 x i8> %l to <8 x i16>
+ ret <8 x i16> %ext
+}
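; All three v8i8 sext tests above select the extending vldrbt.s16. The
; vmovlb.s8 extends the v8i8 compare operand, and only the variant with a
; non-zero/undef passthru needs a trailing vpsel to merge it back in.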
+
+define arm_aapcs_vfpcc <4 x i32> @sext8_masked_v4i32_align1_zero(<4 x i8> *%dest, <4 x i8> %a) {
+; CHECK-LE-LABEL: sext8_masked_v4i32_align1_zero:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.s32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext8_masked_v4i32_align1_zero:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vmovlb.s16 q0, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.s32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i8> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> zeroinitializer)
+ %ext = sext <4 x i8> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext8_masked_v4i32_align1_undef(<4 x i8> *%dest, <4 x i8> %a) {
+; CHECK-LE-LABEL: sext8_masked_v4i32_align1_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.s32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext8_masked_v4i32_align1_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vmovlb.s16 q0, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.s32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i8> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %ext = sext <4 x i8> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext8_masked_v4i32_align1_other(<4 x i8> *%dest, <4 x i8> %a) {
+; CHECK-LE-LABEL: sext8_masked_v4i32_align1_other:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.s32 q1, [r0]
+; CHECK-LE-NEXT: vpsel q0, q1, q0
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: sext8_masked_v4i32_align1_other:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vmovlb.s16 q0, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.s32 q1, [r0]
+; CHECK-BE-NEXT: vpsel q1, q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i8> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> %a)
+ %ext = sext <4 x i8> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
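; Likewise for v4i8 -> v4i32: the sext masked load becomes a single
; vldrbt.s32, with the passthru merged via vpsel only in the _other case.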
+
+define arm_aapcs_vfpcc <4 x i32> @zext8_masked_v4i32_align1_zero(<4 x i8> *%dest, <4 x i8> %a) {
+; CHECK-LE-LABEL: zext8_masked_v4i32_align1_zero:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext8_masked_v4i32_align1_zero:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vmovlb.s16 q0, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.u32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i8> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> zeroinitializer)
+ %ext = zext <4 x i8> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext8_masked_v4i32_align1_undef(<4 x i8> *%dest, <4 x i8> %a) {
+; CHECK-LE-LABEL: zext8_masked_v4i32_align1_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext8_masked_v4i32_align1_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vmovlb.s16 q0, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.u32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i8> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %ext = zext <4 x i8> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext8_masked_v4i32_align1_other(<4 x i8> *%dest, <4 x i8> %a) {
+; CHECK-LE-LABEL: zext8_masked_v4i32_align1_other:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmov.i32 q1, #0xff
+; CHECK-LE-NEXT: vand q1, q0, q1
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vmovlb.s16 q0, q0
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-LE-NEXT: vpsel q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext8_masked_v4i32_align1_other:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vmov.i32 q1, #0xff
+; CHECK-BE-NEXT: vrev64.32 q2, q0
+; CHECK-BE-NEXT: vand q0, q2, q1
+; CHECK-BE-NEXT: vmovlb.s8 q1, q2
+; CHECK-BE-NEXT: vmovlb.s16 q1, q1
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrbt.u32 q1, [r0]
+; CHECK-BE-NEXT: vpsel q1, q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i8> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> %a)
+ %ext = zext <4 x i8> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_zero(<8 x i8> *%dest, <8 x i8> %a) {
+; CHECK-LE-LABEL: zext8_masked_v8i16_align1_zero:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext8_masked_v8i16_align1_zero:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.u16 q1, [r0]
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i8> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> zeroinitializer)
+ %ext = zext <8 x i8> %l to <8 x i16>
+ ret <8 x i16> %ext
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_undef(<8 x i8> *%dest, <8 x i8> %a) {
+; CHECK-LE-LABEL: zext8_masked_v8i16_align1_undef:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext8_masked_v8i16_align1_undef:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vmovlb.s8 q0, q1
+; CHECK-BE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT: vldrbt.u16 q1, [r0]
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i8> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %ext = zext <8 x i8> %l to <8 x i16>
+ ret <8 x i16> %ext
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_other(<8 x i8> *%dest, <8 x i8> %a) {
+; CHECK-LE-LABEL: zext8_masked_v8i16_align1_other:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmovlb.u8 q1, q0
+; CHECK-LE-NEXT: vmovlb.s8 q0, q0
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u16 q0, [r0]
+; CHECK-LE-NEXT: vpsel q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: zext8_masked_v8i16_align1_other:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vmovlb.u8 q0, q1
+; CHECK-BE-NEXT: vmovlb.s8 q1, q1
+; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
+; CHECK-BE-NEXT: vldrbt.u16 q1, [r0]
+; CHECK-BE-NEXT: vpsel q1, q1, q0
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i8> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> %a)
+ %ext = zext <8 x i8> %l to <8 x i16>
+ ret <8 x i16> %ext
+}
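; The zext tests mirror the sext ones with the unsigned vldrbt.u16/.u32
; forms. In the v4i32 _other case the passthru is zero-extended with a
; vand against #0xff before the vpsel, while the v8i16 case uses
; vmovlb.u8.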
+
define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_preinc:
; CHECK-LE: @ %bb.0: @ %entry
@@ -291,20 +1088,15 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 x i8> %a) {
; CHECK-LE-LABEL: masked_v16i8_align4_zero:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vpt.s8 gt, q0, zr
; CHECK-LE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v16i8_align4_zero:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vmov.i32 q1, #0x0
-; CHECK-BE-NEXT: vrev64.8 q2, q0
-; CHECK-BE-NEXT: vrev32.8 q1, q1
-; CHECK-BE-NEXT: vpt.s8 gt, q2, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-BE-NEXT: vpsel q1, q0, q1
+; CHECK-BE-NEXT: vrev64.8 q1, q0
+; CHECK-BE-NEXT: vpt.s8 gt, q1, zr
+; CHECK-BE-NEXT: vldrbt.u8 q1, [r0]
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
@@ -413,19 +1205,15 @@ entry:
define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_align4_zero:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
; CHECK-LE-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_align4_zero:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vmov.i32 q1, #0x0
-; CHECK-BE-NEXT: vrev64.32 q2, q0
-; CHECK-BE-NEXT: vpt.s32 gt, q2, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-BE-NEXT: vpsel q1, q0, q1
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrwt.u32 q1, [r0]
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
@@ -457,17 +1245,82 @@ entry:
define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT: @ implicit-def: $q0
+; CHECK-LE-NEXT: vmrs r2, p0
+; CHECK-LE-NEXT: and r1, r2, #1
+; CHECK-LE-NEXT: rsbs r3, r1, #0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: ldrne r2, [r0]
+; CHECK-LE-NEXT: vmovne s0, r2
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r2, [r0, #4]
+; CHECK-LE-NEXT: vmovmi s1, r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r2, [r0, #8]
+; CHECK-LE-NEXT: vmovmi s2, r2
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrmi r0, [r0, #12]
+; CHECK-LE-NEXT: vmovmi s3, r0
+; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-BE-NEXT: vrev32.8 q1, q0
+; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT: @ implicit-def: $q1
+; CHECK-BE-NEXT: vmrs r2, p0
+; CHECK-BE-NEXT: and r1, r2, #1
+; CHECK-BE-NEXT: rsbs r3, r1, #0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: bfi r1, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: ldrne r2, [r0]
+; CHECK-BE-NEXT: vmovne s4, r2
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrmi r2, [r0, #4]
+; CHECK-BE-NEXT: vmovmi s5, r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrmi r2, [r0, #8]
+; CHECK-BE-NEXT: vmovmi s6, r2
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrmi r0, [r0, #12]
+; CHECK-BE-NEXT: vmovmi s7, r0
; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -556,20 +1409,15 @@ entry:
define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_align4_zero:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
; CHECK-LE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_align4_zero:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vmov.i32 q1, #0x0
-; CHECK-BE-NEXT: vrev64.16 q2, q0
-; CHECK-BE-NEXT: vrev32.16 q1, q1
-; CHECK-BE-NEXT: vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-BE-NEXT: vpsel q1, q0, q1
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u16 q1, [r0]
; CHECK-BE-NEXT: vrev64.16 q0, q1
; CHECK-BE-NEXT: bx lr
entry:
@@ -601,18 +1449,248 @@ entry:
define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_align1_undef:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrbt.u8 q0, [r0]
+; CHECK-LE-NEXT: .pad #40
+; CHECK-LE-NEXT: sub sp, #40
+; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr
+; CHECK-LE-NEXT: @ implicit-def: $q0
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r2, r1, #1
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: bfi r2, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #6, #1
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: bfi r2, r1, #7, #1
+; CHECK-LE-NEXT: uxtb r1, r2
+; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: bne .LBB45_9
+; CHECK-LE-NEXT: @ %bb.1: @ %else
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: bmi .LBB45_10
+; CHECK-LE-NEXT: .LBB45_2: @ %else2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: bmi .LBB45_11
+; CHECK-LE-NEXT: .LBB45_3: @ %else5
+; CHECK-LE-NEXT: lsls r2, r1, #28
+; CHECK-LE-NEXT: bmi .LBB45_12
+; CHECK-LE-NEXT: .LBB45_4: @ %else8
+; CHECK-LE-NEXT: lsls r2, r1, #27
+; CHECK-LE-NEXT: bmi .LBB45_13
+; CHECK-LE-NEXT: .LBB45_5: @ %else11
+; CHECK-LE-NEXT: lsls r2, r1, #26
+; CHECK-LE-NEXT: bmi .LBB45_14
+; CHECK-LE-NEXT: .LBB45_6: @ %else14
+; CHECK-LE-NEXT: lsls r2, r1, #25
+; CHECK-LE-NEXT: bmi .LBB45_15
+; CHECK-LE-NEXT: .LBB45_7: @ %else17
+; CHECK-LE-NEXT: lsls r1, r1, #24
+; CHECK-LE-NEXT: bmi .LBB45_16
+; CHECK-LE-NEXT: .LBB45_8: @ %else20
+; CHECK-LE-NEXT: add sp, #40
+; CHECK-LE-NEXT: bx lr
+; CHECK-LE-NEXT: .LBB45_9: @ %cond.load
+; CHECK-LE-NEXT: ldrh r2, [r0]
+; CHECK-LE-NEXT: strh.w r2, [sp, #28]
+; CHECK-LE-NEXT: vldr.16 s0, [sp, #28]
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: bpl .LBB45_2
+; CHECK-LE-NEXT: .LBB45_10: @ %cond.load1
+; CHECK-LE-NEXT: ldrh r2, [r0, #2]
+; CHECK-LE-NEXT: strh.w r2, [sp, #24]
+; CHECK-LE-NEXT: vldr.16 s4, [sp, #24]
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov.16 q0[1], r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: bpl .LBB45_3
+; CHECK-LE-NEXT: .LBB45_11: @ %cond.load4
+; CHECK-LE-NEXT: ldrh r2, [r0, #4]
+; CHECK-LE-NEXT: strh.w r2, [sp, #20]
+; CHECK-LE-NEXT: vldr.16 s4, [sp, #20]
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov.16 q0[2], r2
+; CHECK-LE-NEXT: lsls r2, r1, #28
+; CHECK-LE-NEXT: bpl .LBB45_4
+; CHECK-LE-NEXT: .LBB45_12: @ %cond.load7
+; CHECK-LE-NEXT: ldrh r2, [r0, #6]
+; CHECK-LE-NEXT: strh.w r2, [sp, #16]
+; CHECK-LE-NEXT: vldr.16 s4, [sp, #16]
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov.16 q0[3], r2
+; CHECK-LE-NEXT: lsls r2, r1, #27
+; CHECK-LE-NEXT: bpl .LBB45_5
+; CHECK-LE-NEXT: .LBB45_13: @ %cond.load10
+; CHECK-LE-NEXT: ldrh r2, [r0, #8]
+; CHECK-LE-NEXT: strh.w r2, [sp, #12]
+; CHECK-LE-NEXT: vldr.16 s4, [sp, #12]
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov.16 q0[4], r2
+; CHECK-LE-NEXT: lsls r2, r1, #26
+; CHECK-LE-NEXT: bpl .LBB45_6
+; CHECK-LE-NEXT: .LBB45_14: @ %cond.load13
+; CHECK-LE-NEXT: ldrh r2, [r0, #10]
+; CHECK-LE-NEXT: strh.w r2, [sp, #8]
+; CHECK-LE-NEXT: vldr.16 s4, [sp, #8]
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov.16 q0[5], r2
+; CHECK-LE-NEXT: lsls r2, r1, #25
+; CHECK-LE-NEXT: bpl .LBB45_7
+; CHECK-LE-NEXT: .LBB45_15: @ %cond.load16
+; CHECK-LE-NEXT: ldrh r2, [r0, #12]
+; CHECK-LE-NEXT: strh.w r2, [sp, #4]
+; CHECK-LE-NEXT: vldr.16 s4, [sp, #4]
+; CHECK-LE-NEXT: vmov r2, s4
+; CHECK-LE-NEXT: vmov.16 q0[6], r2
+; CHECK-LE-NEXT: lsls r1, r1, #24
+; CHECK-LE-NEXT: bpl .LBB45_8
+; CHECK-LE-NEXT: .LBB45_16: @ %cond.load19
+; CHECK-LE-NEXT: ldrh r0, [r0, #14]
+; CHECK-LE-NEXT: strh.w r0, [sp]
+; CHECK-LE-NEXT: vldr.16 s4, [sp]
+; CHECK-LE-NEXT: vmov r0, s4
+; CHECK-LE-NEXT: vmov.16 q0[7], r0
+; CHECK-LE-NEXT: add sp, #40
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_align1_undef:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #40
+; CHECK-BE-NEXT: sub sp, #40
; CHECK-BE-NEXT: vrev64.16 q1, q0
-; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-BE-NEXT: vrev16.8 q1, q0
+; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr
+; CHECK-BE-NEXT: @ implicit-def: $q1
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: bfi r2, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #6, #1
+; CHECK-BE-NEXT: rsbs r1, r1, #0
+; CHECK-BE-NEXT: bfi r2, r1, #7, #1
+; CHECK-BE-NEXT: uxtb r1, r2
+; CHECK-BE-NEXT: lsls r2, r2, #31
+; CHECK-BE-NEXT: bne .LBB45_10
+; CHECK-BE-NEXT: @ %bb.1: @ %else
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: bmi .LBB45_11
+; CHECK-BE-NEXT: .LBB45_2: @ %else2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: bmi .LBB45_12
+; CHECK-BE-NEXT: .LBB45_3: @ %else5
+; CHECK-BE-NEXT: lsls r2, r1, #28
+; CHECK-BE-NEXT: bmi .LBB45_13
+; CHECK-BE-NEXT: .LBB45_4: @ %else8
+; CHECK-BE-NEXT: lsls r2, r1, #27
+; CHECK-BE-NEXT: bmi .LBB45_14
+; CHECK-BE-NEXT: .LBB45_5: @ %else11
+; CHECK-BE-NEXT: lsls r2, r1, #26
+; CHECK-BE-NEXT: bmi .LBB45_15
+; CHECK-BE-NEXT: .LBB45_6: @ %else14
+; CHECK-BE-NEXT: lsls r2, r1, #25
+; CHECK-BE-NEXT: bmi .LBB45_16
+; CHECK-BE-NEXT: .LBB45_7: @ %else17
+; CHECK-BE-NEXT: lsls r1, r1, #24
+; CHECK-BE-NEXT: bpl .LBB45_9
+; CHECK-BE-NEXT: .LBB45_8: @ %cond.load19
+; CHECK-BE-NEXT: ldrh r0, [r0, #14]
+; CHECK-BE-NEXT: strh.w r0, [sp]
+; CHECK-BE-NEXT: vldr.16 s0, [sp]
+; CHECK-BE-NEXT: vmov r0, s0
+; CHECK-BE-NEXT: vmov.16 q1[7], r0
+; CHECK-BE-NEXT: .LBB45_9: @ %else20
; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: add sp, #40
; CHECK-BE-NEXT: bx lr
+; CHECK-BE-NEXT: .LBB45_10: @ %cond.load
+; CHECK-BE-NEXT: ldrh r2, [r0]
+; CHECK-BE-NEXT: strh.w r2, [sp, #28]
+; CHECK-BE-NEXT: vldr.16 s4, [sp, #28]
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: bpl .LBB45_2
+; CHECK-BE-NEXT: .LBB45_11: @ %cond.load1
+; CHECK-BE-NEXT: ldrh r2, [r0, #2]
+; CHECK-BE-NEXT: strh.w r2, [sp, #24]
+; CHECK-BE-NEXT: vldr.16 s0, [sp, #24]
+; CHECK-BE-NEXT: vmov r2, s0
+; CHECK-BE-NEXT: vmov.16 q1[1], r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: bpl .LBB45_3
+; CHECK-BE-NEXT: .LBB45_12: @ %cond.load4
+; CHECK-BE-NEXT: ldrh r2, [r0, #4]
+; CHECK-BE-NEXT: strh.w r2, [sp, #20]
+; CHECK-BE-NEXT: vldr.16 s0, [sp, #20]
+; CHECK-BE-NEXT: vmov r2, s0
+; CHECK-BE-NEXT: vmov.16 q1[2], r2
+; CHECK-BE-NEXT: lsls r2, r1, #28
+; CHECK-BE-NEXT: bpl .LBB45_4
+; CHECK-BE-NEXT: .LBB45_13: @ %cond.load7
+; CHECK-BE-NEXT: ldrh r2, [r0, #6]
+; CHECK-BE-NEXT: strh.w r2, [sp, #16]
+; CHECK-BE-NEXT: vldr.16 s0, [sp, #16]
+; CHECK-BE-NEXT: vmov r2, s0
+; CHECK-BE-NEXT: vmov.16 q1[3], r2
+; CHECK-BE-NEXT: lsls r2, r1, #27
+; CHECK-BE-NEXT: bpl .LBB45_5
+; CHECK-BE-NEXT: .LBB45_14: @ %cond.load10
+; CHECK-BE-NEXT: ldrh r2, [r0, #8]
+; CHECK-BE-NEXT: strh.w r2, [sp, #12]
+; CHECK-BE-NEXT: vldr.16 s0, [sp, #12]
+; CHECK-BE-NEXT: vmov r2, s0
+; CHECK-BE-NEXT: vmov.16 q1[4], r2
+; CHECK-BE-NEXT: lsls r2, r1, #26
+; CHECK-BE-NEXT: bpl .LBB45_6
+; CHECK-BE-NEXT: .LBB45_15: @ %cond.load13
+; CHECK-BE-NEXT: ldrh r2, [r0, #10]
+; CHECK-BE-NEXT: strh.w r2, [sp, #8]
+; CHECK-BE-NEXT: vldr.16 s0, [sp, #8]
+; CHECK-BE-NEXT: vmov r2, s0
+; CHECK-BE-NEXT: vmov.16 q1[5], r2
+; CHECK-BE-NEXT: lsls r2, r1, #25
+; CHECK-BE-NEXT: bpl .LBB45_7
+; CHECK-BE-NEXT: .LBB45_16: @ %cond.load16
+; CHECK-BE-NEXT: ldrh r2, [r0, #12]
+; CHECK-BE-NEXT: strh.w r2, [sp, #4]
+; CHECK-BE-NEXT: vldr.16 s0, [sp, #4]
+; CHECK-BE-NEXT: vmov r2, s0
+; CHECK-BE-NEXT: vmov.16 q1[6], r2
+; CHECK-BE-NEXT: lsls r1, r1, #24
+; CHECK-BE-NEXT: bmi .LBB45_8
+; CHECK-BE-NEXT: b .LBB45_9
entry:
%c = icmp sgt <8 x i16> %a, zeroinitializer
%l = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %dest, i32 1, <8 x i1> %c, <8 x half> undef)
@@ -722,14 +1800,14 @@ define arm_aapcs_vfpcc <2 x i64> @masked
; CHECK-LE-NEXT: bfi r2, r1, #0, #1
; CHECK-LE-NEXT: and r1, r2, #3
; CHECK-LE-NEXT: lsls r2, r2, #31
-; CHECK-LE-NEXT: beq .LBB29_2
+; CHECK-LE-NEXT: beq .LBB49_2
; CHECK-LE-NEXT: @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT: vldr d1, .LCPI29_0
+; CHECK-LE-NEXT: vldr d1, .LCPI49_0
; CHECK-LE-NEXT: vldr d0, [r0]
-; CHECK-LE-NEXT: b .LBB29_3
-; CHECK-LE-NEXT: .LBB29_2:
+; CHECK-LE-NEXT: b .LBB49_3
+; CHECK-LE-NEXT: .LBB49_2:
; CHECK-LE-NEXT: vmov.i32 q0, #0x0
-; CHECK-LE-NEXT: .LBB29_3: @ %else
+; CHECK-LE-NEXT: .LBB49_3: @ %else
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vldrmi d1, [r0, #8]
@@ -737,7 +1815,7 @@ define arm_aapcs_vfpcc <2 x i64> @masked
; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .p2align 3
; CHECK-LE-NEXT: @ %bb.4:
-; CHECK-LE-NEXT: .LCPI29_0:
+; CHECK-LE-NEXT: .LCPI49_0:
; CHECK-LE-NEXT: .long 0 @ double 0
; CHECK-LE-NEXT: .long 0
;
@@ -766,15 +1844,15 @@ define arm_aapcs_vfpcc <2 x i64> @masked
; CHECK-BE-NEXT: bfi r2, r1, #0, #1
; CHECK-BE-NEXT: and r1, r2, #3
; CHECK-BE-NEXT: lsls r2, r2, #31
-; CHECK-BE-NEXT: beq .LBB29_2
+; CHECK-BE-NEXT: beq .LBB49_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT: vldr d1, .LCPI29_0
+; CHECK-BE-NEXT: vldr d1, .LCPI49_0
; CHECK-BE-NEXT: vldr d0, [r0]
-; CHECK-BE-NEXT: b .LBB29_3
-; CHECK-BE-NEXT: .LBB29_2:
+; CHECK-BE-NEXT: b .LBB49_3
+; CHECK-BE-NEXT: .LBB49_2:
; CHECK-BE-NEXT: vmov.i32 q1, #0x0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: .LBB29_3: @ %else
+; CHECK-BE-NEXT: .LBB49_3: @ %else
; CHECK-BE-NEXT: lsls r1, r1, #30
; CHECK-BE-NEXT: it mi
; CHECK-BE-NEXT: vldrmi d1, [r0, #8]
@@ -782,7 +1860,7 @@ define arm_aapcs_vfpcc <2 x i64> @masked
; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .p2align 3
; CHECK-BE-NEXT: @ %bb.4:
-; CHECK-BE-NEXT: .LCPI29_0:
+; CHECK-BE-NEXT: .LCPI49_0:
; CHECK-BE-NEXT: .long 0 @ double 0
; CHECK-BE-NEXT: .long 0
entry:
@@ -816,14 +1894,14 @@ define arm_aapcs_vfpcc <2 x double> @mas
; CHECK-LE-NEXT: bfi r2, r1, #0, #1
; CHECK-LE-NEXT: and r1, r2, #3
; CHECK-LE-NEXT: lsls r2, r2, #31
-; CHECK-LE-NEXT: beq .LBB30_2
+; CHECK-LE-NEXT: beq .LBB50_2
; CHECK-LE-NEXT: @ %bb.1: @ %cond.load
-; CHECK-LE-NEXT: vldr d1, .LCPI30_0
+; CHECK-LE-NEXT: vldr d1, .LCPI50_0
; CHECK-LE-NEXT: vldr d0, [r0]
-; CHECK-LE-NEXT: b .LBB30_3
-; CHECK-LE-NEXT: .LBB30_2:
+; CHECK-LE-NEXT: b .LBB50_3
+; CHECK-LE-NEXT: .LBB50_2:
; CHECK-LE-NEXT: vmov.i32 q0, #0x0
-; CHECK-LE-NEXT: .LBB30_3: @ %else
+; CHECK-LE-NEXT: .LBB50_3: @ %else
; CHECK-LE-NEXT: lsls r1, r1, #30
; CHECK-LE-NEXT: it mi
; CHECK-LE-NEXT: vldrmi d1, [r0, #8]
@@ -831,7 +1909,7 @@ define arm_aapcs_vfpcc <2 x double> @mas
; CHECK-LE-NEXT: bx lr
; CHECK-LE-NEXT: .p2align 3
; CHECK-LE-NEXT: @ %bb.4:
-; CHECK-LE-NEXT: .LCPI30_0:
+; CHECK-LE-NEXT: .LCPI50_0:
; CHECK-LE-NEXT: .long 0 @ double 0
; CHECK-LE-NEXT: .long 0
;
@@ -860,15 +1938,15 @@ define arm_aapcs_vfpcc <2 x double> @mas
; CHECK-BE-NEXT: bfi r2, r1, #0, #1
; CHECK-BE-NEXT: and r1, r2, #3
; CHECK-BE-NEXT: lsls r2, r2, #31
-; CHECK-BE-NEXT: beq .LBB30_2
+; CHECK-BE-NEXT: beq .LBB50_2
; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
-; CHECK-BE-NEXT: vldr d1, .LCPI30_0
+; CHECK-BE-NEXT: vldr d1, .LCPI50_0
; CHECK-BE-NEXT: vldr d0, [r0]
-; CHECK-BE-NEXT: b .LBB30_3
-; CHECK-BE-NEXT: .LBB30_2:
+; CHECK-BE-NEXT: b .LBB50_3
+; CHECK-BE-NEXT: .LBB50_2:
; CHECK-BE-NEXT: vmov.i32 q1, #0x0
; CHECK-BE-NEXT: vrev64.32 q0, q1
-; CHECK-BE-NEXT: .LBB30_3: @ %else
+; CHECK-BE-NEXT: .LBB50_3: @ %else
; CHECK-BE-NEXT: lsls r1, r1, #30
; CHECK-BE-NEXT: it mi
; CHECK-BE-NEXT: vldrmi d1, [r0, #8]
@@ -876,7 +1954,7 @@ define arm_aapcs_vfpcc <2 x double> @mas
; CHECK-BE-NEXT: bx lr
; CHECK-BE-NEXT: .p2align 3
; CHECK-BE-NEXT: @ %bb.4:
-; CHECK-BE-NEXT: .LCPI30_0:
+; CHECK-BE-NEXT: .LCPI50_0:
; CHECK-BE-NEXT: .long 0 @ double 0
; CHECK-BE-NEXT: .long 0
entry:
@@ -885,10 +1963,254 @@ entry:
ret <2 x double> %l
}
+define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: anyext_v4i16:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: anyext_v4i16:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer)
+ ret <4 x i16> %l
+}
+
+define arm_aapcs_vfpcc <4 x i16> @anyext_v4i16_align1(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: anyext_v4i16_align1:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT: mov.w r12, #0
+; CHECK-LE-NEXT: vmrs r3, p0
+; CHECK-LE-NEXT: and r1, r3, #1
+; CHECK-LE-NEXT: rsbs r2, r1, #0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: bfi r1, r2, #0, #1
+; CHECK-LE-NEXT: ubfx r2, r3, #4, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #1, #1
+; CHECK-LE-NEXT: ubfx r2, r3, #8, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #2, #1
+; CHECK-LE-NEXT: ubfx r2, r3, #12, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: beq .LBB52_2
+; CHECK-LE-NEXT: @ %bb.1: @ %cond.load
+; CHECK-LE-NEXT: ldrh r2, [r0]
+; CHECK-LE-NEXT: vdup.32 q0, r12
+; CHECK-LE-NEXT: vmov.32 q0[0], r2
+; CHECK-LE-NEXT: b .LBB52_3
+; CHECK-LE-NEXT: .LBB52_2:
+; CHECK-LE-NEXT: vmov.i32 q0, #0x0
+; CHECK-LE-NEXT: .LBB52_3: @ %else
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-LE-NEXT: vmovmi.32 q0[1], r2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-LE-NEXT: vmovmi.32 q0[2], r2
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: ldrhmi r0, [r0, #6]
+; CHECK-LE-NEXT: vmovmi.32 q0[3], r0
+; CHECK-LE-NEXT: add sp, #4
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: anyext_v4i16_align1:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: mov.w r12, #0
+; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT: vmrs r3, p0
+; CHECK-BE-NEXT: and r1, r3, #1
+; CHECK-BE-NEXT: rsbs r2, r1, #0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: bfi r1, r2, #0, #1
+; CHECK-BE-NEXT: ubfx r2, r3, #4, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #1, #1
+; CHECK-BE-NEXT: ubfx r2, r3, #8, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #2, #1
+; CHECK-BE-NEXT: ubfx r2, r3, #12, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: beq .LBB52_2
+; CHECK-BE-NEXT: @ %bb.1: @ %cond.load
+; CHECK-BE-NEXT: ldrh r2, [r0]
+; CHECK-BE-NEXT: vdup.32 q1, r12
+; CHECK-BE-NEXT: vmov.32 q1[0], r2
+; CHECK-BE-NEXT: b .LBB52_3
+; CHECK-BE-NEXT: .LBB52_2:
+; CHECK-BE-NEXT: vmov.i32 q1, #0x0
+; CHECK-BE-NEXT: .LBB52_3: @ %else
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #2]
+; CHECK-BE-NEXT: vmovmi.32 q1[1], r2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r2, [r0, #4]
+; CHECK-BE-NEXT: vmovmi.32 q1[2], r2
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: ldrhmi r0, [r0, #6]
+; CHECK-BE-NEXT: vmovmi.32 q1[3], r0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 1, <4 x i1> %c, <4 x i16> zeroinitializer)
+ ret <4 x i16> %l
+}
+
+define arm_aapcs_vfpcc <4 x i8> @anyext_v4i8(<4 x i8> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: anyext_v4i8:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: anyext_v4i8:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrbt.u32 q1, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %dest, i32 1, <4 x i1> %c, <4 x i8> zeroinitializer)
+ ret <4 x i8> %l
+}
+
+define arm_aapcs_vfpcc <8 x i8> @anyext_v8i8(<8 x i8> *%dest, <8 x i16> %a) {
+; CHECK-LE-LABEL: anyext_v8i8:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
+; CHECK-LE-NEXT: vldrbt.u16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: anyext_v8i8:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
+; CHECK-BE-NEXT: vldrbt.u16 q1, [r0]
+; CHECK-BE-NEXT: vrev64.16 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+ %c = icmp sgt <8 x i16> %a, zeroinitializer
+ %l = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %dest, i32 1, <8 x i1> %c, <8 x i8> zeroinitializer)
+ ret <8 x i8> %l
+}
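; The anyext tests load narrow vectors (v4i16, v4i8, v8i8) whose lanes
; are wider in the register file; when the alignment allows it they use
; the zero-extending vldrht.u32/vldrbt.u32/vldrbt.u16 forms, and the
; align-1 v4i16 case is scalarized as above.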
+
+define arm_aapcs_vfpcc <4 x i32> @multi_user_zext(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: multi_user_zext:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r7, lr}
+; CHECK-LE-NEXT: push {r7, lr}
+; CHECK-LE-NEXT: .vsave {d8, d9}
+; CHECK-LE-NEXT: vpush {d8, d9}
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u32 q4, [r0]
+; CHECK-LE-NEXT: vmov r0, r1, d8
+; CHECK-LE-NEXT: vmov r2, r3, d9
+; CHECK-LE-NEXT: bl foo
+; CHECK-LE-NEXT: vmovlb.u16 q0, q4
+; CHECK-LE-NEXT: vpop {d8, d9}
+; CHECK-LE-NEXT: pop {r7, pc}
+;
+; CHECK-BE-LABEL: multi_user_zext:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r7, lr}
+; CHECK-BE-NEXT: push {r7, lr}
+; CHECK-BE-NEXT: .vsave {d8, d9}
+; CHECK-BE-NEXT: vpush {d8, d9}
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u32 q4, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q4
+; CHECK-BE-NEXT: vmov r1, r0, d0
+; CHECK-BE-NEXT: vmov r3, r2, d1
+; CHECK-BE-NEXT: bl foo
+; CHECK-BE-NEXT: vmovlb.u16 q1, q4
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: vpop {d8, d9}
+; CHECK-BE-NEXT: pop {r7, pc}
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer)
+ call void @foo(<4 x i16> %l)
+ %ext = zext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
+
+define arm_aapcs_vfpcc <4 x i32> @multi_user_sext(<4 x i16> *%dest, <4 x i32> %a) {
+; CHECK-LE-LABEL: multi_user_sext:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r7, lr}
+; CHECK-LE-NEXT: push {r7, lr}
+; CHECK-LE-NEXT: .vsave {d8, d9}
+; CHECK-LE-NEXT: vpush {d8, d9}
+; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
+; CHECK-LE-NEXT: vldrht.u32 q4, [r0]
+; CHECK-LE-NEXT: vmov r0, r1, d8
+; CHECK-LE-NEXT: vmov r2, r3, d9
+; CHECK-LE-NEXT: bl foo
+; CHECK-LE-NEXT: vmovlb.s16 q0, q4
+; CHECK-LE-NEXT: vpop {d8, d9}
+; CHECK-LE-NEXT: pop {r7, pc}
+;
+; CHECK-BE-LABEL: multi_user_sext:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r7, lr}
+; CHECK-BE-NEXT: push {r7, lr}
+; CHECK-BE-NEXT: .vsave {d8, d9}
+; CHECK-BE-NEXT: vpush {d8, d9}
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
+; CHECK-BE-NEXT: vldrht.u32 q4, [r0]
+; CHECK-BE-NEXT: vrev64.32 q0, q4
+; CHECK-BE-NEXT: vmov r1, r0, d0
+; CHECK-BE-NEXT: vmov r3, r2, d1
+; CHECK-BE-NEXT: bl foo
+; CHECK-BE-NEXT: vmovlb.s16 q1, q4
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: vpop {d8, d9}
+; CHECK-BE-NEXT: pop {r7, pc}
+entry:
+ %c = icmp sgt <4 x i32> %a, zeroinitializer
+ %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %dest, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer)
+ call void @foo(<4 x i16> %l)
+ %ext = sext <4 x i16> %l to <4 x i32>
+ ret <4 x i32> %ext
+}
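; multi_user_zext/multi_user_sext check that the fold is not applied when
; the loaded value has a user other than the extend: the load stays an
; unextended vldrht.u32 and a vmovlb.u16/.s16 performs the extension
; after the call.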
+
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @foo(<4 x i16>)
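For reference, the pattern the tests in this file exercise reduces to a
masked load whose only use is an extend. Below is a minimal standalone
sketch (illustrative only, not taken from the patch; the function and
value names are invented):

  declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)

  define arm_aapcs_vfpcc <4 x i32> @sketch(<4 x i16>* %p, <4 x i1> %c) {
  entry:
    ; Non-extending masked load of four halfwords, align 2.
    %l = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %p, i32 2, <4 x i1> %c, <4 x i16> zeroinitializer)
    ; The sole use is a sign extend, so the pair can be selected as one
    ; extending masked load (vldrht.s32 on MVE) instead of a vldrht.u32
    ; followed by a vmovlb.s16.
    %ext = sext <4 x i16> %l to <4 x i32>
    ret <4 x i32> %ext
  }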
Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll Thu Oct 17 00:55:55 2019
@@ -24,16 +24,79 @@ entry:
define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-LE-NEXT: .pad #4
+; CHECK-LE-NEXT: sub sp, #4
+; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr
+; CHECK-LE-NEXT: vmrs r2, p0
+; CHECK-LE-NEXT: and r1, r2, #1
+; CHECK-LE-NEXT: rsbs r3, r1, #0
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: vmovne r2, s0
+; CHECK-LE-NEXT: strne r2, [r0]
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi r2, s1
+; CHECK-LE-NEXT: strmi r2, [r0, #4]
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi r2, s2
+; CHECK-LE-NEXT: strmi r2, [r0, #8]
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi r1, s3
+; CHECK-LE-NEXT: strmi r1, [r0, #12]
+; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i32_align1:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vrev32.8 q0, q1
-; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr
+; CHECK-BE-NEXT: vmrs r2, p0
+; CHECK-BE-NEXT: and r1, r2, #1
+; CHECK-BE-NEXT: rsbs r3, r1, #0
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: bfi r1, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: vmovne r2, s4
+; CHECK-BE-NEXT: strne r2, [r0]
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi r2, s5
+; CHECK-BE-NEXT: strmi r2, [r0, #4]
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi r2, s6
+; CHECK-BE-NEXT: strmi r2, [r0, #8]
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi r1, s7
+; CHECK-BE-NEXT: strmi r1, [r0, #12]
+; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <4 x i32> %a, zeroinitializer
@@ -126,16 +189,137 @@ entry:
define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-LE-NEXT: .pad #8
+; CHECK-LE-NEXT: sub sp, #8
+; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r2, r1, #1
+; CHECK-LE-NEXT: rsbs r3, r2, #0
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: bfi r2, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #6, #1
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: bfi r2, r1, #7, #1
+; CHECK-LE-NEXT: uxtb r1, r2
+; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: itt ne
+; CHECK-LE-NEXT: vmovne.u16 r2, q0[0]
+; CHECK-LE-NEXT: strhne r2, [r0]
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r2, q0[1]
+; CHECK-LE-NEXT: strhmi r2, [r0, #2]
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r2, q0[2]
+; CHECK-LE-NEXT: strhmi r2, [r0, #4]
+; CHECK-LE-NEXT: lsls r2, r1, #28
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r2, q0[3]
+; CHECK-LE-NEXT: strhmi r2, [r0, #6]
+; CHECK-LE-NEXT: lsls r2, r1, #27
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r2, q0[4]
+; CHECK-LE-NEXT: strhmi r2, [r0, #8]
+; CHECK-LE-NEXT: lsls r2, r1, #26
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r2, q0[5]
+; CHECK-LE-NEXT: strhmi r2, [r0, #10]
+; CHECK-LE-NEXT: lsls r2, r1, #25
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r2, q0[6]
+; CHECK-LE-NEXT: strhmi r2, [r0, #12]
+; CHECK-LE-NEXT: lsls r1, r1, #24
+; CHECK-LE-NEXT: itt mi
+; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7]
+; CHECK-LE-NEXT: strhmi r1, [r0, #14]
+; CHECK-LE-NEXT: add sp, #8
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_align1:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #8
+; CHECK-BE-NEXT: sub sp, #8
; CHECK-BE-NEXT: vrev64.16 q1, q0
-; CHECK-BE-NEXT: vrev16.8 q0, q1
-; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: bfi r2, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #6, #1
+; CHECK-BE-NEXT: rsbs r1, r1, #0
+; CHECK-BE-NEXT: bfi r2, r1, #7, #1
+; CHECK-BE-NEXT: uxtb r1, r2
+; CHECK-BE-NEXT: lsls r2, r2, #31
+; CHECK-BE-NEXT: itt ne
+; CHECK-BE-NEXT: vmovne.u16 r2, q1[0]
+; CHECK-BE-NEXT: strhne r2, [r0]
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r2, q1[1]
+; CHECK-BE-NEXT: strhmi r2, [r0, #2]
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r2, q1[2]
+; CHECK-BE-NEXT: strhmi r2, [r0, #4]
+; CHECK-BE-NEXT: lsls r2, r1, #28
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r2, q1[3]
+; CHECK-BE-NEXT: strhmi r2, [r0, #6]
+; CHECK-BE-NEXT: lsls r2, r1, #27
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r2, q1[4]
+; CHECK-BE-NEXT: strhmi r2, [r0, #8]
+; CHECK-BE-NEXT: lsls r2, r1, #26
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r2, q1[5]
+; CHECK-BE-NEXT: strhmi r2, [r0, #10]
+; CHECK-BE-NEXT: lsls r2, r1, #25
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r2, q1[6]
+; CHECK-BE-NEXT: strhmi r2, [r0, #12]
+; CHECK-BE-NEXT: lsls r1, r1, #24
+; CHECK-BE-NEXT: itt mi
+; CHECK-BE-NEXT: vmovmi.u16 r1, q1[7]
+; CHECK-BE-NEXT: strhmi r1, [r0, #14]
+; CHECK-BE-NEXT: add sp, #8
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp sgt <8 x i16> %a, zeroinitializer
@@ -311,17 +495,88 @@ entry:
define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
; CHECK-LE-LABEL: masked_v4f32_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.i32 ne, q1, zr
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-LE-NEXT: .pad #20
+; CHECK-LE-NEXT: sub sp, #20
+; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-LE-NEXT: movs r1, #0
+; CHECK-LE-NEXT: vmrs r2, p0
+; CHECK-LE-NEXT: and r3, r2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-LE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r1, r3, #2, #1
+; CHECK-LE-NEXT: rsbs r2, r2, #0
+; CHECK-LE-NEXT: bfi r1, r2, #3, #1
+; CHECK-LE-NEXT: lsls r2, r1, #31
+; CHECK-LE-NEXT: ittt ne
+; CHECK-LE-NEXT: vstrne s0, [sp, #12]
+; CHECK-LE-NEXT: ldrne r2, [sp, #12]
+; CHECK-LE-NEXT: strne r2, [r0]
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: ittt mi
+; CHECK-LE-NEXT: vstrmi s1, [sp, #8]
+; CHECK-LE-NEXT: ldrmi r2, [sp, #8]
+; CHECK-LE-NEXT: strmi r2, [r0, #4]
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: ittt mi
+; CHECK-LE-NEXT: vstrmi s2, [sp, #4]
+; CHECK-LE-NEXT: ldrmi r2, [sp, #4]
+; CHECK-LE-NEXT: strmi r2, [r0, #8]
+; CHECK-LE-NEXT: lsls r1, r1, #28
+; CHECK-LE-NEXT: ittt mi
+; CHECK-LE-NEXT: vstrmi s3, [sp]
+; CHECK-LE-NEXT: ldrmi r1, [sp]
+; CHECK-LE-NEXT: strmi r1, [r0, #12]
+; CHECK-LE-NEXT: add sp, #20
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_align1:
; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .pad #20
+; CHECK-BE-NEXT: sub sp, #20
; CHECK-BE-NEXT: vrev64.32 q2, q1
+; CHECK-BE-NEXT: movs r1, #0
+; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr
; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vrev32.8 q0, q1
-; CHECK-BE-NEXT: vpt.i32 ne, q2, zr
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-BE-NEXT: vmrs r2, p0
+; CHECK-BE-NEXT: and r3, r2, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r2, #8, #1
+; CHECK-BE-NEXT: ubfx r2, r2, #12, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1
+; CHECK-BE-NEXT: rsbs r2, r2, #0
+; CHECK-BE-NEXT: bfi r1, r2, #3, #1
+; CHECK-BE-NEXT: lsls r2, r1, #31
+; CHECK-BE-NEXT: ittt ne
+; CHECK-BE-NEXT: vstrne s4, [sp, #12]
+; CHECK-BE-NEXT: ldrne r2, [sp, #12]
+; CHECK-BE-NEXT: strne r2, [r0]
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: ittt mi
+; CHECK-BE-NEXT: vstrmi s5, [sp, #8]
+; CHECK-BE-NEXT: ldrmi r2, [sp, #8]
+; CHECK-BE-NEXT: strmi r2, [r0, #4]
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: ittt mi
+; CHECK-BE-NEXT: vstrmi s6, [sp, #4]
+; CHECK-BE-NEXT: ldrmi r2, [sp, #4]
+; CHECK-BE-NEXT: strmi r2, [r0, #8]
+; CHECK-BE-NEXT: lsls r1, r1, #28
+; CHECK-BE-NEXT: ittt mi
+; CHECK-BE-NEXT: vstrmi s7, [sp]
+; CHECK-BE-NEXT: ldrmi r1, [sp]
+; CHECK-BE-NEXT: strmi r1, [r0, #12]
+; CHECK-BE-NEXT: add sp, #20
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp ugt <4 x i32> %b, zeroinitializer
@@ -415,17 +670,226 @@ entry:
define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
; CHECK-LE-LABEL: masked_v8f16_align1:
; CHECK-LE: @ %bb.0: @ %entry
-; CHECK-LE-NEXT: vpt.i16 ne, q1, zr
-; CHECK-LE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-LE-NEXT: .pad #40
+; CHECK-LE-NEXT: sub sp, #40
+; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr
+; CHECK-LE-NEXT: movs r2, #0
+; CHECK-LE-NEXT: vmrs r1, p0
+; CHECK-LE-NEXT: and r3, r1, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #0, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #1, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #2, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #3, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #4, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #5, #1
+; CHECK-LE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-LE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-LE-NEXT: rsbs r3, r3, #0
+; CHECK-LE-NEXT: bfi r2, r3, #6, #1
+; CHECK-LE-NEXT: rsbs r1, r1, #0
+; CHECK-LE-NEXT: bfi r2, r1, #7, #1
+; CHECK-LE-NEXT: uxtb r1, r2
+; CHECK-LE-NEXT: lsls r2, r2, #31
+; CHECK-LE-NEXT: bne .LBB16_9
+; CHECK-LE-NEXT: @ %bb.1: @ %else
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: bmi .LBB16_10
+; CHECK-LE-NEXT: .LBB16_2: @ %else2
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: bmi .LBB16_11
+; CHECK-LE-NEXT: .LBB16_3: @ %else4
+; CHECK-LE-NEXT: lsls r2, r1, #28
+; CHECK-LE-NEXT: bmi .LBB16_12
+; CHECK-LE-NEXT: .LBB16_4: @ %else6
+; CHECK-LE-NEXT: lsls r2, r1, #27
+; CHECK-LE-NEXT: bmi .LBB16_13
+; CHECK-LE-NEXT: .LBB16_5: @ %else8
+; CHECK-LE-NEXT: lsls r2, r1, #26
+; CHECK-LE-NEXT: bmi .LBB16_14
+; CHECK-LE-NEXT: .LBB16_6: @ %else10
+; CHECK-LE-NEXT: lsls r2, r1, #25
+; CHECK-LE-NEXT: bmi .LBB16_15
+; CHECK-LE-NEXT: .LBB16_7: @ %else12
+; CHECK-LE-NEXT: lsls r1, r1, #24
+; CHECK-LE-NEXT: bmi .LBB16_16
+; CHECK-LE-NEXT: .LBB16_8: @ %else14
+; CHECK-LE-NEXT: add sp, #40
+; CHECK-LE-NEXT: bx lr
+; CHECK-LE-NEXT: .LBB16_9: @ %cond.store
+; CHECK-LE-NEXT: vstr.16 s0, [sp, #28]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #28]
+; CHECK-LE-NEXT: strh r2, [r0]
+; CHECK-LE-NEXT: lsls r2, r1, #30
+; CHECK-LE-NEXT: bpl .LBB16_2
+; CHECK-LE-NEXT: .LBB16_10: @ %cond.store1
+; CHECK-LE-NEXT: vmovx.f16 s4, s0
+; CHECK-LE-NEXT: vstr.16 s4, [sp, #24]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #24]
+; CHECK-LE-NEXT: strh r2, [r0, #2]
+; CHECK-LE-NEXT: lsls r2, r1, #29
+; CHECK-LE-NEXT: bpl .LBB16_3
+; CHECK-LE-NEXT: .LBB16_11: @ %cond.store3
+; CHECK-LE-NEXT: vstr.16 s1, [sp, #20]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #20]
+; CHECK-LE-NEXT: strh r2, [r0, #4]
+; CHECK-LE-NEXT: lsls r2, r1, #28
+; CHECK-LE-NEXT: bpl .LBB16_4
+; CHECK-LE-NEXT: .LBB16_12: @ %cond.store5
+; CHECK-LE-NEXT: vmovx.f16 s4, s1
+; CHECK-LE-NEXT: vstr.16 s4, [sp, #16]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #16]
+; CHECK-LE-NEXT: strh r2, [r0, #6]
+; CHECK-LE-NEXT: lsls r2, r1, #27
+; CHECK-LE-NEXT: bpl .LBB16_5
+; CHECK-LE-NEXT: .LBB16_13: @ %cond.store7
+; CHECK-LE-NEXT: vstr.16 s2, [sp, #12]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #12]
+; CHECK-LE-NEXT: strh r2, [r0, #8]
+; CHECK-LE-NEXT: lsls r2, r1, #26
+; CHECK-LE-NEXT: bpl .LBB16_6
+; CHECK-LE-NEXT: .LBB16_14: @ %cond.store9
+; CHECK-LE-NEXT: vmovx.f16 s4, s2
+; CHECK-LE-NEXT: vstr.16 s4, [sp, #8]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #8]
+; CHECK-LE-NEXT: strh r2, [r0, #10]
+; CHECK-LE-NEXT: lsls r2, r1, #25
+; CHECK-LE-NEXT: bpl .LBB16_7
+; CHECK-LE-NEXT: .LBB16_15: @ %cond.store11
+; CHECK-LE-NEXT: vstr.16 s3, [sp, #4]
+; CHECK-LE-NEXT: ldrh.w r2, [sp, #4]
+; CHECK-LE-NEXT: strh r2, [r0, #12]
+; CHECK-LE-NEXT: lsls r1, r1, #24
+; CHECK-LE-NEXT: bpl .LBB16_8
+; CHECK-LE-NEXT: .LBB16_16: @ %cond.store13
+; CHECK-LE-NEXT: vmovx.f16 s0, s3
+; CHECK-LE-NEXT: vstr.16 s0, [sp]
+; CHECK-LE-NEXT: ldrh.w r1, [sp]
+; CHECK-LE-NEXT: strh r1, [r0, #14]
+; CHECK-LE-NEXT: add sp, #40
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_align1:
; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: vrev64.16 q2, q0
-; CHECK-BE-NEXT: vrev16.8 q0, q2
+; CHECK-BE-NEXT: .pad #40
+; CHECK-BE-NEXT: sub sp, #40
; CHECK-BE-NEXT: vrev64.16 q2, q1
-; CHECK-BE-NEXT: vpt.i16 ne, q2, zr
-; CHECK-BE-NEXT: vstrbt.8 q0, [r0]
+; CHECK-BE-NEXT: vrev64.16 q1, q0
+; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr
+; CHECK-BE-NEXT: vmrs r1, p0
+; CHECK-BE-NEXT: and r2, r1, #1
+; CHECK-BE-NEXT: rsbs r3, r2, #0
+; CHECK-BE-NEXT: movs r2, #0
+; CHECK-BE-NEXT: bfi r2, r3, #0, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #2, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #1, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #4, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #2, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #6, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #3, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #8, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #4, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #10, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #5, #1
+; CHECK-BE-NEXT: ubfx r3, r1, #12, #1
+; CHECK-BE-NEXT: ubfx r1, r1, #14, #1
+; CHECK-BE-NEXT: rsbs r3, r3, #0
+; CHECK-BE-NEXT: bfi r2, r3, #6, #1
+; CHECK-BE-NEXT: rsbs r1, r1, #0
+; CHECK-BE-NEXT: bfi r2, r1, #7, #1
+; CHECK-BE-NEXT: uxtb r1, r2
+; CHECK-BE-NEXT: lsls r2, r2, #31
+; CHECK-BE-NEXT: bne .LBB16_9
+; CHECK-BE-NEXT: @ %bb.1: @ %else
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: bmi .LBB16_10
+; CHECK-BE-NEXT: .LBB16_2: @ %else2
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: bmi .LBB16_11
+; CHECK-BE-NEXT: .LBB16_3: @ %else4
+; CHECK-BE-NEXT: lsls r2, r1, #28
+; CHECK-BE-NEXT: bmi .LBB16_12
+; CHECK-BE-NEXT: .LBB16_4: @ %else6
+; CHECK-BE-NEXT: lsls r2, r1, #27
+; CHECK-BE-NEXT: bmi .LBB16_13
+; CHECK-BE-NEXT: .LBB16_5: @ %else8
+; CHECK-BE-NEXT: lsls r2, r1, #26
+; CHECK-BE-NEXT: bmi .LBB16_14
+; CHECK-BE-NEXT: .LBB16_6: @ %else10
+; CHECK-BE-NEXT: lsls r2, r1, #25
+; CHECK-BE-NEXT: bmi .LBB16_15
+; CHECK-BE-NEXT: .LBB16_7: @ %else12
+; CHECK-BE-NEXT: lsls r1, r1, #24
+; CHECK-BE-NEXT: bmi .LBB16_16
+; CHECK-BE-NEXT: .LBB16_8: @ %else14
+; CHECK-BE-NEXT: add sp, #40
+; CHECK-BE-NEXT: bx lr
+; CHECK-BE-NEXT: .LBB16_9: @ %cond.store
+; CHECK-BE-NEXT: vstr.16 s4, [sp, #28]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #28]
+; CHECK-BE-NEXT: strh r2, [r0]
+; CHECK-BE-NEXT: lsls r2, r1, #30
+; CHECK-BE-NEXT: bpl .LBB16_2
+; CHECK-BE-NEXT: .LBB16_10: @ %cond.store1
+; CHECK-BE-NEXT: vmovx.f16 s0, s4
+; CHECK-BE-NEXT: vstr.16 s0, [sp, #24]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #24]
+; CHECK-BE-NEXT: strh r2, [r0, #2]
+; CHECK-BE-NEXT: lsls r2, r1, #29
+; CHECK-BE-NEXT: bpl .LBB16_3
+; CHECK-BE-NEXT: .LBB16_11: @ %cond.store3
+; CHECK-BE-NEXT: vstr.16 s5, [sp, #20]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #20]
+; CHECK-BE-NEXT: strh r2, [r0, #4]
+; CHECK-BE-NEXT: lsls r2, r1, #28
+; CHECK-BE-NEXT: bpl .LBB16_4
+; CHECK-BE-NEXT: .LBB16_12: @ %cond.store5
+; CHECK-BE-NEXT: vmovx.f16 s0, s5
+; CHECK-BE-NEXT: vstr.16 s0, [sp, #16]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #16]
+; CHECK-BE-NEXT: strh r2, [r0, #6]
+; CHECK-BE-NEXT: lsls r2, r1, #27
+; CHECK-BE-NEXT: bpl .LBB16_5
+; CHECK-BE-NEXT: .LBB16_13: @ %cond.store7
+; CHECK-BE-NEXT: vstr.16 s6, [sp, #12]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #12]
+; CHECK-BE-NEXT: strh r2, [r0, #8]
+; CHECK-BE-NEXT: lsls r2, r1, #26
+; CHECK-BE-NEXT: bpl .LBB16_6
+; CHECK-BE-NEXT: .LBB16_14: @ %cond.store9
+; CHECK-BE-NEXT: vmovx.f16 s0, s6
+; CHECK-BE-NEXT: vstr.16 s0, [sp, #8]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #8]
+; CHECK-BE-NEXT: strh r2, [r0, #10]
+; CHECK-BE-NEXT: lsls r2, r1, #25
+; CHECK-BE-NEXT: bpl .LBB16_7
+; CHECK-BE-NEXT: .LBB16_15: @ %cond.store11
+; CHECK-BE-NEXT: vstr.16 s7, [sp, #4]
+; CHECK-BE-NEXT: ldrh.w r2, [sp, #4]
+; CHECK-BE-NEXT: strh r2, [r0, #12]
+; CHECK-BE-NEXT: lsls r1, r1, #24
+; CHECK-BE-NEXT: bpl .LBB16_8
+; CHECK-BE-NEXT: .LBB16_16: @ %cond.store13
+; CHECK-BE-NEXT: vmovx.f16 s0, s7
+; CHECK-BE-NEXT: vstr.16 s0, [sp]
+; CHECK-BE-NEXT: ldrh.w r1, [sp]
+; CHECK-BE-NEXT: strh r1, [r0, #14]
+; CHECK-BE-NEXT: add sp, #40
; CHECK-BE-NEXT: bx lr
entry:
%c = icmp ugt <8 x i16> %b, zeroinitializer
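The v8f16 case is the same story with twice the lanes: the fallback no longer fits in short IT blocks, so it branches between per-lane blocks (cond.store / else), each spilling a half through the stack with vstr.16, and using vmovx.f16 to reach the odd lane of each s register. As above, the IR is only partly visible in the hunk; a reconstruction under the same assumptions (the call itself is not shown in the diff):

declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)

define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half>* %dest, <8 x half> %a, <8 x i16> %b) {
entry:
  %c = icmp ugt <8 x i16> %b, zeroinitializer
  ; align 1 again forces the branchy scalarized lowering; this call is
  ; reconstructed for illustration.
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %a, <8 x half>* %dest, i32 1, <8 x i1> %c)
  ret void
}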
Modified: llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll?rev=375085&r1=375084&r2=375085&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll Thu Oct 17 00:55:55 2019
@@ -3,9 +3,9 @@
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1-m.main-none-eabi"
-; CHECK-LABEL: test
-; CHECK: llvm.masked.store.v4i32.p0v4i32
-define void @test(i32* nocapture %A, i32 %n) #0 {
+; CHECK-LABEL: test_i32_align4
+; CHECK: call void @llvm.masked.store.v4i32.p0v4i32
+define void @test_i32_align4(i32* nocapture %A, i32 %n) #0 {
entry:
%cmp12 = icmp sgt i32 %n, 0
br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
@@ -26,6 +26,142 @@ if.then:
br label %for.inc
for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+}
+
+; CHECK-LABEL: test_i32_align2
+; CHECK-NOT: call void @llvm.masked.store
+define void @test_i32_align2(i32* nocapture %A, i32 %n) #0 {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.013
+ %0 = load i32, i32* %arrayidx, align 2
+ %.off = add i32 %0, 9
+ %1 = icmp ult i32 %.off, 19
+ br i1 %1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ store i32 0, i32* %arrayidx, align 2
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+}
+
+; CHECK-LABEL: test_i32_noalign
+; CHECK: call void @llvm.masked.store.v4i32.p0v4i32
+define void @test_i32_noalign(i32* nocapture %A, i32 %n) #0 {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.013
+ %0 = load i32, i32* %arrayidx
+ %.off = add i32 %0, 9
+ %1 = icmp ult i32 %.off, 19
+ br i1 %1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ store i32 0, i32* %arrayidx
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+}
+
+; CHECK-LABEL: test_i16_align2
+; CHECK: call void @llvm.masked.store.v8i16.p0v8i16
+define void @test_i16_align2(i16* nocapture %A, i32 %n) #0 {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.013
+ %0 = load i16, i16* %arrayidx, align 2
+ %.off = add i16 %0, 9
+ %1 = icmp ult i16 %.off, 19
+ br i1 %1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ store i16 0, i16* %arrayidx, align 2
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+}
+
+; CHECK-LABEL: test_i16_align1
+; CHECK-NOT: call void @llvm.masked.store
+define void @test_i16_align1(i16* nocapture %A, i32 %n) #0 {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.013
+ %0 = load i16, i16* %arrayidx, align 1
+ %.off = add i16 %0, 9
+ %1 = icmp ult i16 %.off, 19
+ br i1 %1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ store i16 0, i16* %arrayidx, align 1
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i32 %i.013, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
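The vectorizer tests above pin down the alignment rules on the cost-model side: an i32 loop with natural (align 4) or default alignment still produces llvm.masked.store.v4i32, an i16 loop needs only align 2 for v8i16, and the under-aligned variants (align 2 on i32, align 1 on i16) must not use a masked store. All five loops share the same unsigned range-check idiom: (A[i] + 9) u< 19, i.e. -9 <= A[i] <= 9. A hand-written sketch of the vector body that the first CHECK line is matching (value names are invented; only the store, not the load, is masked, since the scalar load executes unconditionally):

declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)

define void @vector_body_sketch(<4 x i32>* %ptr) {
entry:
  ; Unconditional wide load of four consecutive i32 elements.
  %wide = load <4 x i32>, <4 x i32>* %ptr, align 4
  ; Vectorized form of the scalar range check: lanes with -9 <= x <= 9.
  %off = add <4 x i32> %wide, <i32 9, i32 9, i32 9, i32 9>
  %mask = icmp ult <4 x i32> %off, <i32 19, i32 19, i32 19, i32 19>
  ; Zero out only the lanes that passed the check.
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %ptr, i32 4, <4 x i1> %mask)
  ret void
}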