[llvm] f6b9836 - [ARM][NEON] Combine base address updates for vld1Ndup intrinsics
Kristina Bessonova via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 13 02:19:29 PDT 2021
Author: Kristina Bessonova
Date: 2021-06-13T11:18:32+02:00
New Revision: f6b9836b09c78dc05abb6dfd4ad39345bc4d9f09
URL: https://github.com/llvm/llvm-project/commit/f6b9836b09c78dc05abb6dfd4ad39345bc4d9f09
DIFF: https://github.com/llvm/llvm-project/commit/f6b9836b09c78dc05abb6dfd4ad39345bc4d9f09.diff
LOG: [ARM][NEON] Combine base address updates for vld1Ndup intrinsics
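
This enables base address update combining for the
llvm.arm.neon.vld2dup/vld3dup/vld4dup intrinsics, which
CombineBaseUpdate previously skipped (the old TODO is removed below).
The writeback forms of the q-register variants are selected through new
pseudo instructions (VLD2DUPq{8,16,32}OddPseudoWB_fixed/_register and
VLD{3,4}DUPq{8,16,32}OddPseudo_UPD), and SelectVLDDup is reworked so
that the d-register, VLD1-style and q-register paths share a single
operand-building sequence. Since a duplicating load reads only one
element per vector, the foldable increment is NumVecs * element-size
rather than NumVecs * vector-size, so CombineBaseUpdate now divides
NumBytes by the number of vector elements for VLDxDUP nodes, as it
already did for lane operations.

As a minimal sketch of the source-level pattern this affects (the
function name is hypothetical; vld2_dup_u32 is the standard ACLE
intrinsic that lowers to llvm.arm.neon.vld2dup):

    #include <arm_neon.h>

    /* Loads two u32 values, each duplicated across all lanes of its
       d-register, then advances the pointer past the 8 bytes read. */
    uint32x2x2_t load_and_advance(const uint32_t **p) {
      uint32x2x2_t v = vld2_dup_u32(*p); /* llvm.arm.neon.vld2dup */
      *p += 2;                           /* base update, now foldable */
      return v;
    }

With this change the load and the increment can be selected as a single
post-indexed instruction, e.g. "vld2.32 {d16[], d17[]}, [r1]!", as the
updated tests below check for both immediate and register increments.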
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D103836
Added:
Modified:
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMInstrNEON.td
llvm/test/CodeGen/ARM/arm-vlddup-update.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 8d6bc063d6efb..2167ad5d74676 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -247,10 +247,16 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
{ ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudoWB_fixed, ARM::VLD2DUPd16x2wb_fixed, true, true, false, OddDblSpc, 2, 4 ,false},
+{ ARM::VLD2DUPq16OddPseudoWB_register, ARM::VLD2DUPd16x2wb_register, true, true, true, OddDblSpc, 2, 4 ,false},
{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
{ ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudoWB_fixed, ARM::VLD2DUPd32x2wb_fixed, true, true, false, OddDblSpc, 2, 2 ,false},
+{ ARM::VLD2DUPq32OddPseudoWB_register, ARM::VLD2DUPd32x2wb_register, true, true, true, OddDblSpc, 2, 2 ,false},
{ ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false},
{ ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudoWB_fixed, ARM::VLD2DUPd8x2wb_fixed, true, true, false, OddDblSpc, 2, 8 ,false},
+{ ARM::VLD2DUPq8OddPseudoWB_register, ARM::VLD2DUPd8x2wb_register, true, true, true, OddDblSpc, 2, 8 ,false},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
@@ -281,10 +287,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
{ ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true},
+{ ARM::VLD3DUPq16OddPseudo_UPD, ARM::VLD3DUPq16_UPD, true, true, true, OddDblSpc, 3, 4 ,true},
{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
{ ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true},
+{ ARM::VLD3DUPq32OddPseudo_UPD, ARM::VLD3DUPq32_UPD, true, true, true, OddDblSpc, 3, 2 ,true},
{ ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true},
{ ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true},
+{ ARM::VLD3DUPq8OddPseudo_UPD, ARM::VLD3DUPq8_UPD, true, true, true, OddDblSpc, 3, 8 ,true},
{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
@@ -322,10 +331,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
{ ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true},
+{ ARM::VLD4DUPq16OddPseudo_UPD, ARM::VLD4DUPq16_UPD, true, true, true, OddDblSpc, 4, 4 ,true},
{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
{ ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true},
+{ ARM::VLD4DUPq32OddPseudo_UPD, ARM::VLD4DUPq32_UPD, true, true, true, OddDblSpc, 4, 2 ,true},
{ ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true},
{ ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true},
+{ ARM::VLD4DUPq8OddPseudo_UPD, ARM::VLD4DUPq8_UPD, true, true, true, OddDblSpc, 4, 8 ,true},
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
@@ -567,9 +579,18 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
bool DstIsDead = MI.getOperand(OpIdx).isDead();
Register DstReg = MI.getOperand(OpIdx++).getReg();
- if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
- TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
- TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
+
+ bool IsVLD2DUP = TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2 ||
+ TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_register ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_register ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_register;
+
+ if (IsVLD2DUP) {
unsigned SubRegIndex;
if (RegSpc == EvenDblSpc) {
SubRegIndex = ARM::dsub_0;
@@ -617,7 +638,10 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
TableEntry->RealOpc == ARM::VLD1d8Twb_fixed ||
TableEntry->RealOpc == ARM::VLD1d16Twb_fixed ||
TableEntry->RealOpc == ARM::VLD1d32Twb_fixed ||
- TableEntry->RealOpc == ARM::VLD1d64Twb_fixed) {
+ TableEntry->RealOpc == ARM::VLD1d64Twb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd8x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd16x2wb_fixed ||
+ TableEntry->RealOpc == ARM::VLD2DUPd32x2wb_fixed) {
assert(AM6Offset.getReg() == 0 &&
"A fixed writing-back pseudo instruction provides an offset "
"register!");
@@ -630,9 +654,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
- if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
- TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
- TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
+ if (!IsVLD2DUP) {
if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
RegSpc == SingleHighTSpc)
@@ -2697,18 +2719,30 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::VLD2DUPq16OddPseudo:
case ARM::VLD2DUPq32EvenPseudo:
case ARM::VLD2DUPq32OddPseudo:
+ case ARM::VLD2DUPq8OddPseudoWB_fixed:
+ case ARM::VLD2DUPq8OddPseudoWB_register:
+ case ARM::VLD2DUPq16OddPseudoWB_fixed:
+ case ARM::VLD2DUPq16OddPseudoWB_register:
+ case ARM::VLD2DUPq32OddPseudoWB_fixed:
+ case ARM::VLD2DUPq32OddPseudoWB_register:
case ARM::VLD3DUPq8EvenPseudo:
case ARM::VLD3DUPq8OddPseudo:
case ARM::VLD3DUPq16EvenPseudo:
case ARM::VLD3DUPq16OddPseudo:
case ARM::VLD3DUPq32EvenPseudo:
case ARM::VLD3DUPq32OddPseudo:
+ case ARM::VLD3DUPq8OddPseudo_UPD:
+ case ARM::VLD3DUPq16OddPseudo_UPD:
+ case ARM::VLD3DUPq32OddPseudo_UPD:
case ARM::VLD4DUPq8EvenPseudo:
case ARM::VLD4DUPq8OddPseudo:
case ARM::VLD4DUPq16EvenPseudo:
case ARM::VLD4DUPq16OddPseudo:
case ARM::VLD4DUPq32EvenPseudo:
case ARM::VLD4DUPq32OddPseudo:
+ case ARM::VLD4DUPq8OddPseudo_UPD:
+ case ARM::VLD4DUPq16OddPseudo_UPD:
+ case ARM::VLD4DUPq32OddPseudo_UPD:
ExpandVLD(MBBI);
return true;
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index e737b648017e0..be9d383fa0913 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1972,6 +1972,9 @@ static bool isVLDfixed(unsigned Opc)
case ARM::VLD2DUPd8wb_fixed : return true;
case ARM::VLD2DUPd16wb_fixed : return true;
case ARM::VLD2DUPd32wb_fixed : return true;
+ case ARM::VLD2DUPq8OddPseudoWB_fixed: return true;
+ case ARM::VLD2DUPq16OddPseudoWB_fixed: return true;
+ case ARM::VLD2DUPq32OddPseudoWB_fixed: return true;
}
}
@@ -2035,6 +2038,9 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
+ case ARM::VLD2DUPq8OddPseudoWB_fixed: return ARM::VLD2DUPq8OddPseudoWB_register;
+ case ARM::VLD2DUPq16OddPseudoWB_fixed: return ARM::VLD2DUPq16OddPseudoWB_register;
+ case ARM::VLD2DUPq32OddPseudoWB_fixed: return ARM::VLD2DUPq32OddPseudoWB_register;
case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
@@ -2987,52 +2993,48 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
SDValue Pred = getAL(CurDAG, dl);
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
- SDNode *VLdDup;
- if (is64BitVector || NumVecs == 1) {
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(MemAddr);
- Ops.push_back(Align);
- unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
- QOpcodes0[OpcodeIndex];
- if (isUpdating) {
- // fixed-stride update instructions don't have an explicit writeback
- // operand. It's implicit in the opcode itself.
- SDValue Inc = N->getOperand(2);
- bool IsImmUpdate =
- isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
- if (NumVecs <= 2 && !IsImmUpdate)
- Opc = getVLDSTRegisterUpdateOpcode(Opc);
- if (!IsImmUpdate)
- Ops.push_back(Inc);
- // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
- else if (NumVecs > 2)
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex]
+ : (NumVecs == 1) ? QOpcodes0[OpcodeIndex]
+ : QOpcodes1[OpcodeIndex];
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(2);
+ bool IsImmUpdate =
+ isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+ if (IsImmUpdate) {
+ if (!isVLDfixed(Opc))
Ops.push_back(Reg0);
+ } else {
+ if (isVLDfixed(Opc))
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
}
- Ops.push_back(Pred);
- Ops.push_back(Reg0);
- Ops.push_back(Chain);
- VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ }
+ if (is64BitVector || NumVecs == 1) {
+ // Double registers and VLD1 quad registers are directly supported.
} else if (NumVecs == 2) {
- const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
- SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
- dl, ResTys, OpsA);
-
+ const SDValue OpsA[] = {MemAddr, Align, Pred, Reg0, Chain};
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
+ MVT::Other, OpsA);
Chain = SDValue(VLdA, 1);
- const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
- VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
} else {
- SDValue ImplDef =
- SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
- const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
- SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
- dl, ResTys, OpsA);
-
- SDValue SuperReg = SDValue(VLdA, 0);
+ SDValue ImplDef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+ const SDValue OpsA[] = {MemAddr, Align, ImplDef, Pred, Reg0, Chain};
+ SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
+ MVT::Other, OpsA);
+ Ops.push_back(SDValue(VLdA, 0));
Chain = SDValue(VLdA, 1);
- const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
- VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
}
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+
+ SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(VLdDup), {MemOp});
@@ -4192,26 +4194,47 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VLD2DUP_UPD: {
- static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
- ARM::VLD2DUPd16wb_fixed,
- ARM::VLD2DUPd32wb_fixed };
- SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
+ static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8wb_fixed,
+ ARM::VLD2DUPd16wb_fixed,
+ ARM::VLD2DUPd32wb_fixed,
+ ARM::VLD1q64wb_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+ ARM::VLD2DUPq16EvenPseudo,
+ ARM::VLD2DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudoWB_fixed,
+ ARM::VLD2DUPq16OddPseudoWB_fixed,
+ ARM::VLD2DUPq32OddPseudoWB_fixed };
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, DOpcodes, QOpcodes0, QOpcodes1);
return;
}
case ARMISD::VLD3DUP_UPD: {
- static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
- ARM::VLD3DUPd16Pseudo_UPD,
- ARM::VLD3DUPd32Pseudo_UPD };
- SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
+ static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+ ARM::VLD3DUPd16Pseudo_UPD,
+ ARM::VLD3DUPd32Pseudo_UPD,
+ ARM::VLD1d64TPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+ ARM::VLD3DUPq16EvenPseudo,
+ ARM::VLD3DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo_UPD,
+ ARM::VLD3DUPq16OddPseudo_UPD,
+ ARM::VLD3DUPq32OddPseudo_UPD };
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
return;
}
case ARMISD::VLD4DUP_UPD: {
- static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
- ARM::VLD4DUPd16Pseudo_UPD,
- ARM::VLD4DUPd32Pseudo_UPD };
- SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
+ static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+ ARM::VLD4DUPd16Pseudo_UPD,
+ ARM::VLD4DUPd32Pseudo_UPD,
+ ARM::VLD1d64QPseudoWB_fixed };
+ static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+ ARM::VLD4DUPq16EvenPseudo,
+ ARM::VLD4DUPq32EvenPseudo };
+ static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo_UPD,
+ ARM::VLD4DUPq16OddPseudo_UPD,
+ ARM::VLD4DUPq32OddPseudo_UPD };
+ SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
return;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 81c0565f4813f..d0fd80be10ec9 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14774,12 +14774,12 @@ static SDValue CombineBaseUpdate(SDNode *N,
NumVecs = 3; hasAlignment = false; break;
case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
NumVecs = 4; hasAlignment = false; break;
- case Intrinsic::arm_neon_vld2dup:
- case Intrinsic::arm_neon_vld3dup:
- case Intrinsic::arm_neon_vld4dup:
- // TODO: Support updating VLDxDUP nodes. For now, we just skip
- // combining base updates for such intrinsics.
- continue;
+ case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
+ NumVecs = 4; break;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
@@ -14833,8 +14833,12 @@ static SDValue CombineBaseUpdate(SDNode *N,
VecTy = N->getOperand(1).getValueType();
}
+ bool isVLDDUPOp =
+ NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
+ NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
+
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
- if (isLaneOp)
+ if (isLaneOp || isVLDDUPOp)
NumBytes /= VecTy.getVectorNumElements();
// If the increment is a constant, it must match the memory ref size.
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index ba637be05d389..5cafe85b72399 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -1534,6 +1534,13 @@ defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
+def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
class VLD3DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
@@ -1587,6 +1594,10 @@ def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPq8OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPq16OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPq32OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
class VLD4DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1111, op7_4,
@@ -1641,6 +1652,10 @@ def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPq8OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPq16OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPq32OddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
diff --git a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
index c39dfc764dde2..28740fa1953a8 100644
--- a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
+++ b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
@@ -1,43 +1,495 @@
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \
; RUN: -asm-verbose=false | FileCheck %s
+%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.uint16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.uint16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+
%struct.uint32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.uint32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.uint32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.uint64x1x2_t = type { <1 x i64>, <1 x i64> }
+%struct.uint64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
+%struct.uint64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
+
+%struct.uint8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.uint8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.uint8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+
+%struct.uint16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.uint16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.uint16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+
+%struct.uint32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.uint32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.uint32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+
+%struct.uint8x16x2_t = type { <16 x i8>, <16 x i8> }
+%struct.uint8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.uint8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }
+
+declare %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8*, i32)
+declare %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8*, i32)
declare %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8*, i32)
+declare %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8*, i32)
+
+declare %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8*, i32)
+declare %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8*, i32)
declare %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8*, i32)
+declare %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8*, i32)
+
+declare %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8*, i32)
+declare %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8*, i32)
declare %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8*, i32)
+declare %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8*, i32)
+
+declare %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8*, i32)
+declare %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8*, i32)
+declare %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8*, i32)
+
+declare %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8*, i32)
+declare %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8*, i32)
+declare %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8*, i32)
+
+declare %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8*, i32)
+declare %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8*, i32)
+declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8*, i32)
+
+define i8* @test_vld2_dup_u16_update(%struct.uint16x4x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2_dup_u16_update:
+; CHECK: vld2.16 {d16[], d17[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x4x2_t %tmp, %struct.uint16x4x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2_dup_u16_update_reg(%struct.uint16x4x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2_dup_u16_update_reg:
+; CHECK: vld2.16 {d16[], d17[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x4x2_t %tmp, %struct.uint16x4x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
-; CHECK-LABEL: test_vld2_dup_update
-; CHECK: vld2.32 {d16[], d17[]}, {{\[}}[[SRC_R:r[0-9]+]]]
-; CHECK: add {{r[0-9]+|lr}}, [[SRC_R]], #4
define i8* @test_vld2_dup_update(%struct.uint32x2x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2_dup_update:
+; CHECK: vld2.32 {d16[], d17[]}, [r1]!
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* %src, i32 4)
store %struct.uint32x2x2_t %tmp, %struct.uint32x2x2_t* %dest, align 8
- %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 8
ret i8* %updated_src
}
-; CHECK-LABEL: test_vld3_dup_update
-; CHECK: vld3.32 {d16[], d17[], d18[]}, {{\[}}[[SRC_R:r[0-9]+]]]
-; CHECK: add {{r[0-9]+|lr}}, [[SRC_R]], #4
-define i8* @test_vld3_dup_update(%struct.uint32x2x3_t* %dest, i8* %src) {
+define i8* @test_vld2_dup_update_reg(%struct.uint32x2x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2_dup_update_reg:
+; CHECK: vld2.32 {d16[], d17[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x2x2_t %tmp, %struct.uint32x2x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2_dup_u64_update(%struct.uint64x1x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2_dup_u64_update:
+; CHECK: vld1.64 {d16, d17}, [r1:64]!
+entry:
+ %tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* %src, i32 8)
+ store %struct.uint64x1x2_t %tmp, %struct.uint64x1x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 16
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2_dup_u64_update_reg(%struct.uint64x1x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2_dup_u64_update_reg:
+; CHECK: vld1.64 {d16, d17}, [r1:64], r2
+entry:
+ %tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0i8(i8* %src, i32 8)
+ store %struct.uint64x1x2_t %tmp, %struct.uint64x1x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2_dup_u8_update(%struct.uint8x8x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2_dup_u8_update:
+; CHECK: vld2.8 {d16[], d17[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x8x2_t %tmp, %struct.uint8x8x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 2
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2_dup_u8_update_reg(%struct.uint8x8x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2_dup_u8_update_reg:
+; CHECK: vld2.8 {d16[], d17[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x8x2_t %tmp, %struct.uint8x8x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u16_update(%struct.uint16x4x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3_dup_u16_update:
+; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x4x3_t %tmp, %struct.uint16x4x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 6
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u16_update_reg(%struct.uint16x4x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_u16_update_reg:
+; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x4x3_t %tmp, %struct.uint16x4x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u32_update(%struct.uint32x2x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3_dup_u32_update:
+; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]!
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* %src, i32 4)
store %struct.uint32x2x3_t %tmp, %struct.uint32x2x3_t* %dest, align 8
- %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 12
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u32_update_reg(%struct.uint32x2x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_u32_update_reg:
+; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x2x3_t %tmp, %struct.uint32x2x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u64_update(%struct.uint64x1x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3_dup_u64_update:
+; CHECK: vld1.64 {d16, d17, d18}, [r1]!
+entry:
+ %tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* %src, i32 8)
+ store %struct.uint64x1x3_t %tmp, %struct.uint64x1x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 24
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u64_update_reg(%struct.uint64x1x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_u64_update_reg:
+; CHECK: vld1.64 {d16, d17, d18}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0i8(i8* %src, i32 8)
+ store %struct.uint64x1x3_t %tmp, %struct.uint64x1x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u8_update(%struct.uint8x8x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3_dup_u8_update:
+; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x8x3_t %tmp, %struct.uint8x8x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 3
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3_dup_u8_update_reg(%struct.uint8x8x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_u8_update_reg:
+; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x8x3_t %tmp, %struct.uint8x8x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4_dup_u16_update(%struct.uint16x4x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4_dup_u16_update:
+; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x4x4_t %tmp, %struct.uint16x4x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 8
ret i8* %updated_src
}
-; CHECK-LABEL: test_vld4_dup_update
-; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, {{\[}}[[SRC_R:r[0-9]+]]]
-; CHECK: add {{r[0-9]+|lr}}, [[SRC_R]], #4
-define i8* @test_vld4_dup_update(%struct.uint32x2x4_t* %dest, i8* %src) {
+define i8* @test_vld4_dup_u16_update_reg(%struct.uint16x4x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_u16_update_reg:
+; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x4x4_t %tmp, %struct.uint16x4x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4_dup_u32_update(%struct.uint32x2x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4_dup_u32_update:
+; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]!
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* %src, i32 4)
store %struct.uint32x2x4_t %tmp, %struct.uint32x2x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 16
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4_dup_u32_update_reg(%struct.uint32x2x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_u32_update_reg:
+; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x2x4_t %tmp, %struct.uint32x2x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4_dup_u64_update(%struct.uint64x1x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4_dup_u64_update:
+; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]!
+entry:
+ %tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* %src, i32 8)
+ store %struct.uint64x1x4_t %tmp, %struct.uint64x1x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 32
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4_dup_u64_update_reg(%struct.uint64x1x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_u64_update_reg:
+; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64], r2
+entry:
+ %tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0i8(i8* %src, i32 8)
+ store %struct.uint64x1x4_t %tmp, %struct.uint64x1x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4_dup_u8_update(%struct.uint8x8x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4_dup_u8_update:
+; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x8x4_t %tmp, %struct.uint8x8x4_t* %dest, align 8
%updated_src = getelementptr inbounds i8, i8* %src, i32 4
ret i8* %updated_src
}
+
+define i8* @test_vld4_dup_u8_update_reg(%struct.uint8x8x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_u8_update_reg:
+; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x8x4_t %tmp, %struct.uint8x8x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2q_dup_u16_update(%struct.uint16x8x2_t* %dest, i8* %src, <8 x i16>* %dest0) {
+; CHECK-LABEL: test_vld2q_dup_u16_update:
+; CHECK: vld2.16 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x8x2_t %tmp, %struct.uint16x8x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2q_dup_u16_update_reg(%struct.uint16x8x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_u16_update_reg:
+; CHECK: vld2.16 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x8x2_t %tmp, %struct.uint16x8x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2q_dup_u32_update(%struct.uint32x4x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2q_dup_u32_update:
+; CHECK: vld2.32 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x4x2_t %tmp, %struct.uint32x4x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 8
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2q_dup_u32_update_reg(%struct.uint32x4x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_u32_update_reg:
+; CHECK: vld2.32 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x4x2_t %tmp, %struct.uint32x4x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2q_dup_u8_update(%struct.uint8x16x2_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld2q_dup_u8_update:
+; CHECK: vld2.8 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x16x2_t %tmp, %struct.uint8x16x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 2
+ ret i8* %updated_src
+}
+
+define i8* @test_vld2q_dup_u8_update_reg(%struct.uint8x16x2_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_u8_update_reg:
+; CHECK: vld2.8 {d16[], d18[]}, [r1]
+; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x16x2_t %tmp, %struct.uint8x16x2_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3q_dup_u16_update(%struct.uint16x8x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3q_dup_u16_update:
+; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x8x3_t %tmp, %struct.uint16x8x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 6
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3q_dup_u16_update_reg(%struct.uint16x8x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3q_dup_u16_update_reg:
+; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
+; CHECK-NEXT: vld3.16 {d17[], d19[], d21[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x8x3_t %tmp, %struct.uint16x8x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3q_dup_u32_update(%struct.uint32x4x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3q_dup_u32_update:
+; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x4x3_t %tmp, %struct.uint32x4x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 12
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3q_dup_u32_update_reg(%struct.uint32x4x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3q_dup_u32_update_reg:
+; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
+; CHECK-NEXT: vld3.32 {d17[], d19[], d21[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x4x3_t %tmp, %struct.uint32x4x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3q_dup_u8_update(%struct.uint8x16x3_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld3q_dup_u8_update:
+; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
+; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x16x3_t %tmp, %struct.uint8x16x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 3
+ ret i8* %updated_src
+}
+
+define i8* @test_vld3q_dup_u8_update_reg(%struct.uint8x16x3_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld3q_dup_u8_update_reg:
+; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
+; CHECK-NEXT: vld3.8 {d17[], d19[], d21[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x16x3_t %tmp, %struct.uint8x16x3_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4q_dup_u16_update(%struct.uint16x8x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4q_dup_u16_update:
+; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x8x4_t %tmp, %struct.uint16x8x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 8
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4q_dup_u16_update_reg(%struct.uint16x8x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4q_dup_u16_update_reg:
+; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK-NEXT: vld4.16 {d17[], d19[], d21[], d23[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0i8(i8* %src, i32 2)
+ store %struct.uint16x8x4_t %tmp, %struct.uint16x8x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4q_dup_u32_update(%struct.uint32x4x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4q_dup_u32_update:
+; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x4x4_t %tmp, %struct.uint32x4x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 16
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4q_dup_u32_update_reg(%struct.uint32x4x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4q_dup_u32_update_reg:
+; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK-NEXT: vld4.32 {d17[], d19[], d21[], d23[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0i8(i8* %src, i32 4)
+ store %struct.uint32x4x4_t %tmp, %struct.uint32x4x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4q_dup_u8_update(%struct.uint8x16x4_t* %dest, i8* %src) {
+; CHECK-LABEL: test_vld4q_dup_u8_update:
+; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]!
+entry:
+ %tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x16x4_t %tmp, %struct.uint8x16x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 4
+ ret i8* %updated_src
+}
+
+define i8* @test_vld4q_dup_u8_update_reg(%struct.uint8x16x4_t* %dest, i8* %src, i32 %inc) {
+; CHECK-LABEL: test_vld4q_dup_u8_update_reg:
+; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
+; CHECK-NEXT: vld4.8 {d17[], d19[], d21[], d23[]}, [r1], r2
+entry:
+ %tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0i8(i8* %src, i32 1)
+ store %struct.uint8x16x4_t %tmp, %struct.uint8x16x4_t* %dest, align 8
+ %updated_src = getelementptr inbounds i8, i8* %src, i32 %inc
+ ret i8* %updated_src
+}