[llvm] [AArch64][GlobalISel] Select llvm.aarch64.neon.ld* intrinsics (PR #65630)
Vladislav Dzhidzhoev via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 7 09:12:04 PDT 2023
https://github.com/dzhidzhoev created https://github.com/llvm/llvm-project/pull/65630:
Selection is done similarly to llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp.
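For reference, a minimal IR example of the kind of call this patch teaches GlobalISel to select, taken from the updated llvm/test/CodeGen/AArch64/arm64-ld1.ll below; the expected assembly comes from the CHECK-GI lines added in that test (which uses Apple NEON syntax):

%struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> }

define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr %A) {
  ; With this change (and -global-isel-abort=1 in the RUN line), GlobalISel is
  ; expected to select this call directly to "ld2.b { v0, v1 }[1], [x0]"
  ; rather than relying on a fallback.
  %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, i64 1, ptr %A)
  ret %struct.__neon_int8x16x2_t %tmp2
}

declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8>, <16 x i8>, i64, ptr)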
From 97dddd4f770d4f82233a0ad1b79ef13ad6b301be Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev <vdzhidzhoev at accesssoftek.com>
Date: Mon, 14 Aug 2023 11:16:04 +0200
Subject: [PATCH] [AArch64][GlobalISel] Select llvm.aarch64.neon.ld* intrinsics
Similar to llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp.
---
.../lib/Target/AArch64/AArch64InstrFormats.td | 3 +
llvm/lib/Target/AArch64/AArch64InstrGISel.td | 53 +++
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 +
.../GISel/AArch64InstructionSelector.cpp | 336 +++++++++++++-
.../AArch64/GISel/AArch64RegisterBankInfo.cpp | 53 ++-
llvm/test/CodeGen/AArch64/arm64-ld1.ll | 431 ++++++++++++------
6 files changed, 717 insertions(+), 161 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 57d69ae05c47ff..b51504db2dd5cb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -484,6 +484,9 @@ def UImmS8XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
}]>;
+def gi_UImmS1XForm : GICustomOperandRenderer<"renderUImmS1">,
+ GISDNodeXFormEquiv<UImmS1XForm>;
+
// uimm5sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 32 * N].
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index f9f860607b5877..db720bf614a39e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -255,6 +255,7 @@ def : GINodeEquiv<G_FCMLTZ, AArch64fcmltz>;
def : GINodeEquiv<G_BIT, AArch64bit>;
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
+def : GINodeEquiv<G_INSERT_VECTOR_ELT, insertelt>;
def : GINodeEquiv<G_PREFETCH, AArch64Prefetch>;
@@ -450,3 +451,55 @@ def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(UMINPv2i32 V64:$Rn, V64:$Rn), dsub),
ssub))>;
+
+def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))),
+ (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
+ (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i16 (load GPR64sp:$Rn)))),
+ (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i16 (load GPR64sp:$Rn)))),
+ (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+ (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+ (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+ (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+ (LD1Rv1d GPR64sp:$Rn)>;
+
+class Ld1Lane64PatGISel<SDPatternOperator scalar_load, Operand VecIndex,
+ ValueType VTy, ValueType STy, Instruction LD1>
+ : Pat<(insertelt (VTy VecListOne64:$Rd),
+ (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (EXTRACT_SUBREG
+ (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+ (UImmS1XForm VecIndex:$idx), GPR64sp:$Rn),
+ dsub)>;
+
+class Ld1Lane128PatGISel<Operand VecIndex, ValueType VTy,
+ ValueType STy, Instruction LD1>
+ : Pat<(insertelt (VTy VecListOne128:$Rd),
+ (STy (load GPR64sp:$Rn)), VecIndex:$idx),
+ (LD1 VecListOne128:$Rd, (UImmS1XForm VecIndex:$idx), GPR64sp:$Rn)>;
+
+// Enable these patterns only for GlobalISel, since
+// SelectionDAG analogues only select insertelt with i32 indices.
+let Predicates = [OnlyGISel] in {
+ def : Ld1Lane64PatGISel<load, VectorIndexB, v8i8, i8, LD1i8>;
+ def : Ld1Lane64PatGISel<load, VectorIndexB32b, v8i8, i8, LD1i8>;
+ def : Ld1Lane64PatGISel<load, VectorIndexH, v4i16, i16, LD1i16>;
+ def : Ld1Lane64PatGISel<load, VectorIndexH32b, v4i16, i16, LD1i16>;
+ def : Ld1Lane64PatGISel<load, VectorIndexS, v2i32, i32, LD1i32>;
+ def : Ld1Lane64PatGISel<load, VectorIndexS32b, v2i32, i32, LD1i32>;
+
+ def : Ld1Lane128PatGISel<VectorIndexB, v16i8, i8, LD1i8>;
+ def : Ld1Lane128PatGISel<VectorIndexB32b, v16i8, i8, LD1i8>;
+ def : Ld1Lane128PatGISel<VectorIndexH, v8i16, i16, LD1i16>;
+ def : Ld1Lane128PatGISel<VectorIndexH32b, v8i16, i16, LD1i16>;
+ def : Ld1Lane128PatGISel<VectorIndexS, v4i32, i32, LD1i32>;
+ def : Ld1Lane128PatGISel<VectorIndexS32b, v4i32, i32, LD1i32>;
+ def : Ld1Lane128PatGISel<VectorIndexD, v2i64, i64, LD1i64>;
+ def : Ld1Lane128PatGISel<VectorIndexD32b, v2i64, i64, LD1i64>;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4a1f46f2576aec..dbfb7f63a78acd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -869,6 +869,8 @@ let RecomputePerFunction = 1 in {
def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
+
+ def OnlyGISel : Predicate<"MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">;
// Toggles patterns which aren't beneficial in GlobalISel when we aren't
// optimizing. This allows us to selectively use patterns without impacting
// SelectionDAG's behaviour.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 3f55adef3e05ff..737f01b7a3ae65 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -143,6 +143,10 @@ class AArch64InstructionSelector : public InstructionSelector {
const TargetRegisterClass *DstRC,
Register Scalar,
MachineIRBuilder &MIRBuilder) const;
+ // Helper to narrow vector that was widened by emitScalarToVector.
+ MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
+ MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI) const;
/// Emit a lane insert into \p DstReg, or a new vector register if
/// std::nullopt is provided.
@@ -186,6 +190,8 @@ class AArch64InstructionSelector : public InstructionSelector {
/// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
MachineInstr &I);
+ bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
+ MachineInstr &I);
bool selectIntrinsicWithSideEffects(MachineInstr &I,
MachineRegisterInfo &MRI);
bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -452,6 +458,9 @@ class AArch64InstructionSelector : public InstructionSelector {
const MachineInstr &MI,
int OpIdx = -1) const;
+ void renderUImmS1(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx = -1) const;
+
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
@@ -3897,6 +3906,31 @@ MachineInstr *AArch64InstructionSelector::emitScalarToVector(
}
}
+MachineInstr *
+AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
+ MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) const {
+ LLT DstTy = MRI.getType(DstReg);
+ const TargetRegisterClass *RC =
+ getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
+ if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
+ LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
+ return nullptr;
+ }
+ unsigned SubReg = 0;
+ if (!getSubRegForClass(RC, TRI, SubReg))
+ return nullptr;
+ if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
+ LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
+ << DstTy.getSizeInBits() << "\n");
+ return nullptr;
+ }
+ auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+ .addReg(SrcReg, 0, SubReg);
+ RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ return Copy;
+}
+
bool AArch64InstructionSelector::selectMergeValues(
MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
@@ -5384,24 +5418,8 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
if (VecSize < 128) {
// If we had to widen to perform the insert, then we have to demote back to
// the original size to get the result we want.
- Register DemoteVec = InsMI->getOperand(0).getReg();
- const TargetRegisterClass *RC =
- getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
- if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
- LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
- return false;
- }
- unsigned SubReg = 0;
- if (!getSubRegForClass(RC, TRI, SubReg))
- return false;
- if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
- LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
- << "\n");
+ if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
return false;
- }
- MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
- .addReg(DemoteVec, 0, SubReg);
- RBI.constrainGenericRegister(DstReg, *RC, MRI);
} else {
// No widening needed.
InsMI->getOperand(0).setReg(DstReg);
@@ -5630,6 +5648,60 @@ bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
return true;
}
+bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
+ unsigned Opc, unsigned NumVecs, MachineInstr &I) {
+ assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+ assert(Opc && "Expected an opcode?");
+ assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
+ auto &MRI = *MIB.getMRI();
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ bool Narrow = Ty.getSizeInBits() == 64;
+
+ auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
+ SmallVector<Register, 4> Regs(NumVecs);
+ std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
+ [](auto MO) { return MO.getReg(); });
+
+ if (Narrow) {
+ transform(Regs, Regs.begin(), [this](Register Reg) {
+ return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
+ ->getOperand(0)
+ .getReg();
+ });
+ Ty = Ty.multiplyElements(2);
+ }
+
+ Register Tuple = createQTuple(Regs, MIB);
+ auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
+ if (!LaneNo)
+ return false;
+
+ Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
+ auto Load = MIB.buildInstr(Opc, {Ty}, {})
+ .addReg(Tuple)
+ .addImm(LaneNo->getZExtValue())
+ .addReg(Ptr);
+ Load.cloneMemRefs(I);
+ constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
+ Register SelectedLoadDst = Load->getOperand(0).getReg();
+ unsigned SubReg = AArch64::qsub0;
+ for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+ auto Vec = MIB.buildInstr(TargetOpcode::COPY,
+ {Narrow ? DstOp(&AArch64::FPR128RegClass)
+ : DstOp(I.getOperand(Idx).getReg())},
+ {})
+ .addReg(SelectedLoadDst, 0, SubReg + Idx);
+ Register WideReg = Vec.getReg(0);
+ // Emit the subreg copies and immediately select them.
+ selectCopy(*Vec, TII, MRI, TRI, RBI);
+ if (Narrow &&
+ !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
+ return false;
+ }
+
+ return true;
+}
+
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MachineInstr &I, MachineRegisterInfo &MRI) {
// Find the intrinsic ID.
@@ -5664,6 +5736,78 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MIB.buildInstr(AArch64::BRK, {}, {})
.addImm(I.getOperand(1).getImm() | ('U' << 8));
break;
+ case Intrinsic::aarch64_neon_ld1x2: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD1Twov8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD1Twov16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD1Twov4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD1Twov8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD1Twov2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD1Twov4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD1Twov2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD1Twov1d;
+ else
+ llvm_unreachable("Unexpected type for ld1x2!");
+ selectVectorLoadIntrinsic(Opc, 2, I);
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld1x3: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD1Threev8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD1Threev16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD1Threev4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD1Threev8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD1Threev2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD1Threev4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD1Threev2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD1Threev1d;
+ else
+ llvm_unreachable("Unexpected type for ld1x3!");
+ selectVectorLoadIntrinsic(Opc, 3, I);
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld1x4: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD1Fourv8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD1Fourv16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD1Fourv4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD1Fourv8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD1Fourv2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD1Fourv4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD1Fourv2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD1Fourv1d;
+ else
+ llvm_unreachable("Unexpected type for ld1x4!");
+ selectVectorLoadIntrinsic(Opc, 4, I);
+ break;
+ }
case Intrinsic::aarch64_neon_ld2: {
LLT Ty = MRI.getType(I.getOperand(0).getReg());
unsigned Opc = 0;
@@ -5688,6 +5832,114 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
selectVectorLoadIntrinsic(Opc, 2, I);
break;
}
+ case Intrinsic::aarch64_neon_ld2lane: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc;
+ if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD2i8;
+ else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD2i16;
+ else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD2i32;
+ else if (Ty == LLT::fixed_vector(2, S64) ||
+ Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+ Opc = AArch64::LD2i64;
+ else
+ llvm_unreachable("Unexpected type for st2lane!");
+ if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
+ return false;
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld2r: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD2Rv8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD2Rv16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD2Rv4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD2Rv8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD2Rv2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD2Rv4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD2Rv2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD2Rv1d;
+ else
+ llvm_unreachable("Unexpected type for ld2r!");
+ selectVectorLoadIntrinsic(Opc, 2, I);
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld3: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD3Threev8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD3Threev16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD3Threev4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD3Threev8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD3Threev2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD3Threev4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD3Threev2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD1Threev1d;
+ else
+ llvm_unreachable("Unexpected type for ld3!");
+ selectVectorLoadIntrinsic(Opc, 3, I);
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld3lane: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc;
+ if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD3i8;
+ else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD3i16;
+ else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD3i32;
+ else if (Ty == LLT::fixed_vector(2, S64) ||
+ Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+ Opc = AArch64::LD3i64;
+ else
+ llvm_unreachable("Unexpected type for st3lane!");
+ if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
+ return false;
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld3r: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD3Rv8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD3Rv16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD3Rv4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD3Rv8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD3Rv2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD3Rv4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD3Rv2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD3Rv1d;
+ else
+ llvm_unreachable("Unexpected type for ld3r!");
+ selectVectorLoadIntrinsic(Opc, 3, I);
+ break;
+ }
case Intrinsic::aarch64_neon_ld4: {
LLT Ty = MRI.getType(I.getOperand(0).getReg());
unsigned Opc = 0;
@@ -5712,6 +5964,48 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
selectVectorLoadIntrinsic(Opc, 4, I);
break;
}
+ case Intrinsic::aarch64_neon_ld4lane: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc;
+ if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD4i8;
+ else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD4i16;
+ else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD4i32;
+ else if (Ty == LLT::fixed_vector(2, S64) ||
+ Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+ Opc = AArch64::LD4i64;
+ else
+ llvm_unreachable("Unexpected type for st4lane!");
+ if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
+ return false;
+ break;
+ }
+ case Intrinsic::aarch64_neon_ld4r: {
+ LLT Ty = MRI.getType(I.getOperand(0).getReg());
+ unsigned Opc = 0;
+ if (Ty == LLT::fixed_vector(8, S8))
+ Opc = AArch64::LD4Rv8b;
+ else if (Ty == LLT::fixed_vector(16, S8))
+ Opc = AArch64::LD4Rv16b;
+ else if (Ty == LLT::fixed_vector(4, S16))
+ Opc = AArch64::LD4Rv4h;
+ else if (Ty == LLT::fixed_vector(8, S16))
+ Opc = AArch64::LD4Rv8h;
+ else if (Ty == LLT::fixed_vector(2, S32))
+ Opc = AArch64::LD4Rv2s;
+ else if (Ty == LLT::fixed_vector(4, S32))
+ Opc = AArch64::LD4Rv4s;
+ else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+ Opc = AArch64::LD4Rv2d;
+ else if (Ty == S64 || Ty == P0)
+ Opc = AArch64::LD4Rv1d;
+ else
+ llvm_unreachable("Unexpected type for ld4r!");
+ selectVectorLoadIntrinsic(Opc, 4, I);
+ break;
+ }
case Intrinsic::aarch64_neon_st2: {
Register Src1 = I.getOperand(1).getReg();
Register Src2 = I.getOperand(2).getReg();
@@ -6873,6 +7167,14 @@ void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
.getZExtValue()));
}
+void AArch64InstructionSelector::renderUImmS1(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
+ "Expected G_CONSTANT");
+ MIB.addImm(MI.getOperand(1).getCImm()->getZExtValue());
+}
+
bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
const MachineInstr &MI, unsigned NumBytes) const {
if (!MI.mayLoadOrStore())
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 8ca2bc641b14a7..26954c62e03f1f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -580,6 +580,25 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_BUILD_VECTOR_TRUNC:
return true;
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+ case Intrinsic::aarch64_neon_ld1x2:
+ case Intrinsic::aarch64_neon_ld1x3:
+ case Intrinsic::aarch64_neon_ld1x4:
+ case Intrinsic::aarch64_neon_ld2:
+ case Intrinsic::aarch64_neon_ld2lane:
+ case Intrinsic::aarch64_neon_ld2r:
+ case Intrinsic::aarch64_neon_ld3:
+ case Intrinsic::aarch64_neon_ld3lane:
+ case Intrinsic::aarch64_neon_ld3r:
+ case Intrinsic::aarch64_neon_ld4:
+ case Intrinsic::aarch64_neon_ld4lane:
+ case Intrinsic::aarch64_neon_ld4r:
+ return true;
+ default:
+ break;
+ }
+ break;
default:
break;
}
@@ -722,10 +741,13 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
Register ScalarReg = MI.getOperand(1).getReg();
LLT ScalarTy = MRI.getType(ScalarReg);
auto ScalarDef = MRI.getVRegDef(ScalarReg);
+ // We want to select dup(load) into LD1R.
+ if (ScalarDef->getOpcode() == TargetOpcode::G_LOAD)
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
// s8 is an exception for G_DUP, which we always want on gpr.
- if (ScalarTy.getSizeInBits() != 8 &&
- (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
- onlyDefinesFP(*ScalarDef, MRI, TRI)))
+ else if (ScalarTy.getSizeInBits() != 8 &&
+ (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ onlyDefinesFP(*ScalarDef, MRI, TRI)))
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
else
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
@@ -1015,17 +1037,26 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Assign them FPR for now.
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR, PMI_FirstFPR};
break;
- case TargetOpcode::G_INTRINSIC: {
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: {
// Check if we know that the intrinsic has any constraints on its register
// banks. If it does, then update the mapping accordingly.
unsigned Idx = 0;
- if (!isFPIntrinsic(MRI, MI))
- break;
- for (const auto &Op : MI.explicit_operands()) {
- if (Op.isReg())
- OpRegBankIdx[Idx] = PMI_FirstFPR;
- ++Idx;
- }
+ if (onlyDefinesFP(MI, MRI, TRI))
+ for (const auto &Op : MI.defs()) {
+ if (Op.isReg())
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+ ++Idx;
+ }
+ else
+ Idx += MI.getNumExplicitDefs();
+
+ if (onlyUsesFP(MI, MRI, TRI))
+ for (const auto &Op : MI.explicit_uses()) {
+ if (Op.isReg())
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+ ++Idx;
+ }
break;
}
case TargetOpcode::G_LROUND:
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 47fb3308175b02..5b5ced1097e441 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -global-isel=1 -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
@@ -350,39 +351,63 @@ declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr) nounwi
define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld2lane_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ld2.b { v0, v1 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld2lane_16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: ld2.b { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld2lane_16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: ld2.b { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, i64 1, ptr %A)
ret %struct.__neon_int8x16x2_t %tmp2
}
define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld3lane_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: ld3.b { v0, v1, v2 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld3lane_16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld3lane_16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, ptr %A)
ret %struct.__neon_int8x16x3_t %tmp2
}
define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld4lane_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: ld4.b { v0, v1, v2, v3 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld4lane_16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld4lane_16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, ptr %A)
ret %struct.__neon_int8x16x4_t %tmp2
}
@@ -393,39 +418,63 @@ declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8>
define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld2lane_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ld2.h { v0, v1 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld2lane_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: ld2.h { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld2lane_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: ld2.h { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, i64 1, ptr %A)
ret %struct.__neon_int16x8x2_t %tmp2
}
define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld3lane_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: ld3.h { v0, v1, v2 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld3lane_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld3lane_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, ptr %A)
ret %struct.__neon_int16x8x3_t %tmp2
}
define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld4lane_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: ld4.h { v0, v1, v2, v3 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld4lane_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld4lane_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, ptr %A)
ret %struct.__neon_int16x8x4_t %tmp2
}
@@ -436,39 +485,63 @@ declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16>
define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld2lane_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ld2.s { v0, v1 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld2lane_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: ld2.s { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld2lane_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: ld2.s { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, i64 1, ptr %A)
ret %struct.__neon_int32x4x2_t %tmp2
}
define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld3lane_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: ld3.s { v0, v1, v2 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld3lane_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld3lane_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, ptr %A)
ret %struct.__neon_int32x4x3_t %tmp2
}
define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld4lane_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: ld4.s { v0, v1, v2, v3 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld4lane_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld4lane_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, ptr %A)
ret %struct.__neon_int32x4x4_t %tmp2
}
@@ -479,39 +552,63 @@ declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32>
define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld2lane_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ld2.d { v0, v1 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld2lane_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT: ld2.d { v0, v1 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld2lane_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT: ld2.d { v0, v1 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, i64 1, ptr %A)
ret %struct.__neon_int64x2x2_t %tmp2
}
define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld3lane_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT: ld3.d { v0, v1, v2 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld3lane_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT: ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld3lane_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT: ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, ptr %A)
ret %struct.__neon_int64x2x3_t %tmp2
}
define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, ptr %A) nounwind {
; Make sure we are using the operands defined by the ABI
-; CHECK-LABEL: ld4lane_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT: ld4.d { v0, v1, v2, v3 }[1], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld4lane_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT: ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld4lane_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT: ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-GI-NEXT: ret
%tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, ptr %A)
ret %struct.__neon_int64x2x4_t %tmp2
}
@@ -907,10 +1004,16 @@ declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr) nounwin
declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr) nounwind readonly
define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) {
-; CHECK-LABEL: ld1_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1.b { v0 }[0], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr b1, [x0]
+; CHECK-GI-NEXT: mov.b v0[0], v1[0]
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i8, ptr %bar
%tmp2 = insertelement <16 x i8> %V, i8 %tmp1, i32 0
@@ -918,10 +1021,16 @@ define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) {
}
define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) {
-; CHECK-LABEL: ld1_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1.h { v0 }[0], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1.h { v0 }[0], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h1, [x0]
+; CHECK-GI-NEXT: mov.h v0[0], v1[0]
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i16, ptr %bar
%tmp2 = insertelement <8 x i16> %V, i16 %tmp1, i32 0
@@ -929,10 +1038,16 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) {
}
define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) {
-; CHECK-LABEL: ld1_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr s1, [x0]
+; CHECK-GI-NEXT: mov.s v0[0], v1[0]
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i32, ptr %bar
%tmp2 = insertelement <4 x i32> %V, i32 %tmp1, i32 0
@@ -940,10 +1055,16 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) {
}
define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) {
-; CHECK-LABEL: ld1_4s_float:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_4s_float:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_4s_float:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr s1, [x0]
+; CHECK-GI-NEXT: mov.s v0[0], v1[0]
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load float, ptr %bar
%tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0
@@ -951,10 +1072,16 @@ define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) {
}
define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) {
-; CHECK-LABEL: ld1_2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1.d { v0 }[0], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_2d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1.d { v0 }[0], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_2d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: mov.d v0[0], v1[0]
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i64, ptr %bar
%tmp2 = insertelement <2 x i64> %V, i64 %tmp1, i32 0
@@ -962,10 +1089,16 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) {
}
define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) {
-; CHECK-LABEL: ld1_2d_double:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1.d { v0 }[0], [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_2d_double:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1.d { v0 }[0], [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_2d_double:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: mov.d v0[0], v1[0]
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load double, ptr %bar
%tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0
@@ -983,12 +1116,20 @@ define <1 x i64> @ld1_1d(ptr %p) {
}
define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) {
-; CHECK-LABEL: ld1_8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.b { v0 }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr b1, [x0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov.b v0[0], v1[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i8, ptr %bar
%tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0
@@ -996,12 +1137,20 @@ define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) {
}
define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) {
-; CHECK-LABEL: ld1_4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.h { v0 }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: ld1.h { v0 }[0], [x0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h1, [x0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov.h v0[0], v1[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i16, ptr %bar
%tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0
@@ -1009,12 +1158,20 @@ define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) {
}
define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) {
-; CHECK-LABEL: ld1_2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr s1, [x0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov.s v0[0], v1[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load i32, ptr %bar
%tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0
@@ -1022,12 +1179,20 @@ define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) {
}
define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) {
-; CHECK-LABEL: ld1_2s_float:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.s { v0 }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ld1_2s_float:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ld1_2s_float:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr s1, [x0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov.s v0[0], v1[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
; Make sure we are using the operands defined by the ABI
%tmp1 = load float, ptr %bar
%tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0