[llvm] [AArch64][GlobalISel] Legalize Insert vector element (PR #81453)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 12 00:51:59 PST 2024
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/81453
This attempts to standardize and extend some of the insert vector element lowering. Most notably:
- More types are handled by splitting illegal vectors
- The index type for G_INSERT_VECTOR_ELT is canonicalized to TLI.getVectorIdxTy(), similar to extract_vector_element.
- Some of the existing patterns now have the index type specified to make sure they can apply to GISel too.
- The C++ selection code has been removed, relying on tablegen patterns.
- G_INSERT_VECTOR_ELT with small GPR input elements is pre-selected to use an i32 type, allowing the existing patterns to apply.
- Variable index inserts are lowered in post-legalizer lowering, expanding into a stack store and reload.
>From 5fe4d0b7196ef89f8a528f6d51fdd302af2ce2bf Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 12 Feb 2024 08:47:55 +0000
Subject: [PATCH] [AArch64][GlobalISel] Legalize Insert vector element
This attempts to standardize and extend some of the insert vector element
lowering. Most notably:
- More types are handled by splitting illegal vectors
- The index type for G_INSERT_VECTOR_ELT is canonicalized to
TLI.getVectorIdxTy(), similar to extract_vector_element.
- Some of the existing patterns now have the index type specified to make sure
they can apply to GISel too.
- The C++ selection code has been removed, relying on tablegen patterns.
- G_INSERT_VECTOR_ELT with small GPR input elements is pre-selected to use an
i32 type, allowing the existing patterns to apply.
- Variable index inserts are lowered in post-legalizer lowering, expanding
into a stack store and reload.
---
.../Target/GlobalISel/SelectionDAGCompat.td | 1 +
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 17 +-
llvm/lib/Target/AArch64/AArch64Combine.td | 12 +-
.../lib/Target/AArch64/AArch64InstrAtomics.td | 4 +-
.../lib/Target/AArch64/AArch64InstrFormats.td | 6 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 39 +-
.../GISel/AArch64InstructionSelector.cpp | 87 +-
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 16 +-
.../GISel/AArch64PostLegalizerLowering.cpp | 54 +
.../AArch64/GlobalISel/arm64-fallback.ll | 16 -
.../AArch64/GlobalISel/arm64-irtranslator.ll | 3 +-
.../AArch64/GlobalISel/legalize-fcopysign.mir | 6 +-
.../GlobalISel/select-insert-vector-elt.mir | 50 +-
llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 88 +-
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 244 ++---
llvm/test/CodeGen/AArch64/insertextract.ll | 937 ++++++++++++------
16 files changed, 954 insertions(+), 626 deletions(-)
diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 6bc19421fb0169..1ed4f03fc9cd4f 100644
--- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -141,6 +141,7 @@ def : GINodeEquiv<G_CTLZ_ZERO_UNDEF, ctlz_zero_undef>;
def : GINodeEquiv<G_CTTZ_ZERO_UNDEF, cttz_zero_undef>;
def : GINodeEquiv<G_CTPOP, ctpop>;
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, extractelt>;
+def : GINodeEquiv<G_INSERT_VECTOR_ELT, vector_insert>;
def : GINodeEquiv<G_CONCAT_VECTORS, concat_vectors>;
def : GINodeEquiv<G_BUILD_VECTOR, build_vector>;
def : GINodeEquiv<G_FCEIL, fceil>;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index c1d8e890a66edb..8d122673888a0c 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2969,7 +2969,22 @@ bool IRTranslator::translateInsertElement(const User &U,
Register Res = getOrCreateVReg(U);
Register Val = getOrCreateVReg(*U.getOperand(0));
Register Elt = getOrCreateVReg(*U.getOperand(1));
- Register Idx = getOrCreateVReg(*U.getOperand(2));
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+ unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits();
+ Register Idx;
+ if (auto *CI = dyn_cast<ConstantInt>(U.getOperand(2))) {
+ if (CI->getBitWidth() != PreferredVecIdxWidth) {
+ APInt NewIdx = CI->getValue().zextOrTrunc(PreferredVecIdxWidth);
+ auto *NewIdxCI = ConstantInt::get(CI->getContext(), NewIdx);
+ Idx = getOrCreateVReg(*NewIdxCI);
+ }
+ }
+ if (!Idx)
+ Idx = getOrCreateVReg(*U.getOperand(2));
+ if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
+ const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ Idx = MIRBuilder.buildZExtOrTrunc(VecIdxTy, Idx).getReg(0);
+ }
MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx);
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index fdea974d4540a0..3e8c410c46c7e7 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -114,6 +114,13 @@ def ext: GICombineRule <
(apply [{ applyEXT(*${root}, ${matchinfo}); }])
>;
+def insertelt_nonconst: GICombineRule <
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
+ [{ return matchNonConstInsert(*${root}, MRI); }]),
+ (apply [{ applyNonConstInsert(*${root}, MRI, B); }])
+>;
+
def shuf_to_ins_matchdata : GIDefMatchData<"std::tuple<Register, int, Register, int>">;
def shuf_to_ins: GICombineRule <
(defs root:$root, shuf_to_ins_matchdata:$matchinfo),
@@ -140,8 +147,7 @@ def form_duplane : GICombineRule <
>;
def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
- form_duplane,
- shuf_to_ins]>;
+ form_duplane, shuf_to_ins]>;
// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
def vector_unmerge_lowering : GICombineRule <
@@ -269,7 +275,7 @@ def AArch64PostLegalizerLowering
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mull,
- vector_unmerge_lowering]> {
+ vector_unmerge_lowering, insertelt_nonconst]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 0002db52b1995c..de94cf64c9801c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -547,10 +547,10 @@ let Predicates = [HasLSE] in {
let Predicates = [HasRCPC3, HasNEON] in {
// LDAP1 loads
def : Pat<(vector_insert (v2i64 VecListOne128:$Rd),
- (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)), VectorIndexD:$idx),
+ (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)), (i64 VectorIndexD:$idx)),
(LDAP1 VecListOne128:$Rd, VectorIndexD:$idx, GPR64sp:$Rn)>;
def : Pat<(vector_insert (v2f64 VecListOne128:$Rd),
- (f64 (bitconvert (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))), VectorIndexD:$idx),
+ (f64 (bitconvert (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))), (i64 VectorIndexD:$idx)),
(LDAP1 VecListOne128:$Rd, VectorIndexD:$idx, GPR64sp:$Rn)>;
def : Pat<(v1i64 (scalar_to_vector
(i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 10ad5b1f8f2580..85722e25bfc970 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7982,7 +7982,7 @@ class SIMDInsFromMain<string size, ValueType vectype,
"|" # size # "\t$Rd$idx, $Rn}",
"$Rd = $dst",
[(set V128:$dst,
- (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+ (vector_insert (vectype V128:$Rd), regtype:$Rn, (i64 idxtype:$idx)))]> {
let Inst{14-11} = 0b0011;
}
@@ -7996,8 +7996,8 @@ class SIMDInsFromElement<string size, ValueType vectype,
[(set V128:$dst,
(vector_insert
(vectype V128:$Rd),
- (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
- idxtype:$idx))]>;
+ (elttype (vector_extract (vectype V128:$Rn), (i64 idxtype:$idx2))),
+ (i64 idxtype:$idx)))]>;
class SIMDInsMainMovAlias<string size, Instruction inst,
RegisterClass regtype, Operand idxtype>
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9c3a6927d043ba..51818192c6b773 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6540,6 +6540,15 @@ def : Pat<(v8i8 (vector_insert (v8i8 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndex
VectorIndexB:$imm, GPR32:$Rm),
dsub)>;
+def : Pat<(v8i8 (vector_insert (v8i8 V64:$Rn), (i8 FPR8:$Rm), (i64 VectorIndexB:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi8lane (v16i8 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexB:$imm, (v16i8 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR8:$Rm, bsub)), (i64 0)),
+ dsub)>;
+def : Pat<(v16i8 (vector_insert (v16i8 V128:$Rn), (i8 FPR8:$Rm), (i64 VectorIndexB:$imm))),
+ (INSvi8lane V128:$Rn, VectorIndexB:$imm,
+ (v16i8 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR8:$Rm, bsub)), (i64 0))>;
+
// Copy an element at a constant index in one vector into a constant indexed
// element of another.
// FIXME refactor to a shared class/dev parameterized on vector type, vector
@@ -6572,26 +6581,26 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
ValueType VTScal, Instruction INS> {
def : Pat<(VT128 (vector_insert V128:$src,
- (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
- imm:$Immd)),
+ (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
+ (i64 imm:$Immd))),
(INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
def : Pat<(VT128 (vector_insert V128:$src,
- (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
- imm:$Immd)),
+ (VTScal (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
+ (i64 imm:$Immd))),
(INS V128:$src, imm:$Immd,
(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
def : Pat<(VT64 (vector_insert V64:$src,
- (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
- imm:$Immd)),
+ (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
+ (i64 imm:$Immd))),
(EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
imm:$Immd, V128:$Rn, imm:$Immn),
dsub)>;
def : Pat<(VT64 (vector_insert V64:$src,
- (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
- imm:$Immd)),
+ (VTScal (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
+ (i64 imm:$Immd))),
(EXTRACT_SUBREG
(INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
@@ -6610,14 +6619,14 @@ defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi64lane>;
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
-def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
+def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
(INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0)>;
-def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
+def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), (i64 imm:$Immd))),
(EXTRACT_SUBREG
(INSvi32lane (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$src, dsub)),
imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0),
dsub)>;
-def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), imm:$Immd)),
+def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), (i64 imm:$Immd))),
(INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
// bitcast of an extract
@@ -7999,7 +8008,7 @@ def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
: Pat<(vector_insert (VTy VecListOne128:$Rd),
- (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (STy (scalar_load GPR64sp:$Rn)), (i64 VecIndex:$idx)),
(LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;
def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>;
@@ -8022,14 +8031,14 @@ class Ld1Lane128IdxOpPat<SDPatternOperator scalar_load, Operand
VecIndex, ValueType VTy, ValueType STy,
Instruction LD1, SDNodeXForm IdxOp>
: Pat<(vector_insert (VTy VecListOne128:$Rd),
- (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (STy (scalar_load GPR64sp:$Rn)), (i64 VecIndex:$idx)),
(LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>;
class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1,
SDNodeXForm IdxOp>
: Pat<(vector_insert (VTy VecListOne64:$Rd),
- (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (STy (scalar_load GPR64sp:$Rn)), (i64 VecIndex:$idx)),
(EXTRACT_SUBREG
(LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
(IdxOp VecIndex:$idx), GPR64sp:$Rn),
@@ -8069,7 +8078,7 @@ let Predicates = [IsNeonAvailable] in {
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
: Pat<(vector_insert (VTy VecListOne64:$Rd),
- (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+ (STy (scalar_load GPR64sp:$Rn)), (i64 VecIndex:$idx)),
(EXTRACT_SUBREG
(LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
VecIndex:$idx, GPR64sp:$Rn),
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 9d51a7f7616ddb..d6978bf90a4b21 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -191,7 +191,6 @@ class AArch64InstructionSelector : public InstructionSelector {
MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
MachineIRBuilder &MIRBuilder);
- bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
MachineRegisterInfo &MRI);
/// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
@@ -2121,6 +2120,31 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
}
return false;
}
+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
+ Register InsReg = I.getOperand(2).getReg();
+ LLT SrcTy = MRI.getType(InsReg);
+ if (RBI.getRegBank(InsReg, MRI, TRI)->getID() == AArch64::GPRRegBankID &&
+ SrcTy.getSizeInBits() < 32) {
+ if (auto *MI = MRI.getVRegDef(InsReg)) {
+ if (MI->getOpcode() == TargetOpcode::G_TRUNC &&
+ (MRI.getType(MI->getOperand(1).getReg()).getSizeInBits() == 32 ||
+ MRI.getType(MI->getOperand(1).getReg()).getSizeInBits() == 64)) {
+ I.getOperand(2).setReg(MI->getOperand(1).getReg());
+ return true;
+ }
+ }
+ auto Ext = MIB.buildAnyExt(LLT::scalar(32), InsReg);
+ Register ExtDst = Ext.getReg(0);
+ MRI.setRegBank(ExtDst, RBI.getRegBank(AArch64::GPRRegBankID));
+ if (!select(*Ext)) {
+ LLVM_DEBUG(dbgs() << "Failed to select G_ANYEXT in G_INSERT_VECTOR_ELT");
+ return false;
+ }
+ I.getOperand(2).setReg(ExtDst);
+ return true;
+ }
+ return false;
+ }
default:
return false;
}
@@ -3487,8 +3511,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return selectShuffleVector(I, MRI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return selectExtractElt(I, MRI);
- case TargetOpcode::G_INSERT_VECTOR_ELT:
- return selectInsertElt(I, MRI);
case TargetOpcode::G_CONCAT_VECTORS:
return selectConcatVectors(I, MRI);
case TargetOpcode::G_JUMP_TABLE:
@@ -5319,65 +5341,6 @@ bool AArch64InstructionSelector::selectUSMovFromExtend(
return true;
}
-bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
- MachineRegisterInfo &MRI) {
- assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
-
- // Get information on the destination.
- Register DstReg = I.getOperand(0).getReg();
- const LLT DstTy = MRI.getType(DstReg);
- unsigned VecSize = DstTy.getSizeInBits();
-
- // Get information on the element we want to insert into the destination.
- Register EltReg = I.getOperand(2).getReg();
- const LLT EltTy = MRI.getType(EltReg);
- unsigned EltSize = EltTy.getSizeInBits();
- if (EltSize < 8 || EltSize > 64)
- return false;
-
- // Find the definition of the index. Bail out if it's not defined by a
- // G_CONSTANT.
- Register IdxReg = I.getOperand(3).getReg();
- auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
- if (!VRegAndVal)
- return false;
- unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
-
- // Perform the lane insert.
- Register SrcReg = I.getOperand(1).getReg();
- const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
-
- if (VecSize < 128) {
- // If the vector we're inserting into is smaller than 128 bits, widen it
- // to 128 to do the insert.
- MachineInstr *ScalarToVec =
- emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
- if (!ScalarToVec)
- return false;
- SrcReg = ScalarToVec->getOperand(0).getReg();
- }
-
- // Create an insert into a new FPR128 register.
- // Note that if our vector is already 128 bits, we end up emitting an extra
- // register.
- MachineInstr *InsMI =
- emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB);
-
- if (VecSize < 128) {
- // If we had to widen to perform the insert, then we have to demote back to
- // the original size to get the result we want.
- if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
- return false;
- } else {
- // No widening needed.
- InsMI->getOperand(0).setReg(DstReg);
- constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
- }
-
- I.eraseFromParent();
- return true;
-}
-
MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
unsigned int Op;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index cbf5655706e694..b545c04e1652a4 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -870,8 +870,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(1, p0, 2);
getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
- .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64}))
- .widenVectorEltsToVectorMinSize(0, 64);
+ .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
+ .widenVectorEltsToVectorMinSize(0, 64)
+ .moreElementsToNextPow2(0)
+ .clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampMaxNumElements(0, s64, 2)
+ .clampMaxNumElements(0, p0, 2);
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.legalFor({{v8s8, s8},
@@ -2001,11 +2007,11 @@ bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
// Widen In1 and In2 to 128 bits. We want these to eventually become
// INSERT_SUBREGs.
auto Undef = MIRBuilder.buildUndef(VecTy);
- auto Zero = MIRBuilder.buildConstant(DstTy, 0);
+ auto ZeroIdx = MIRBuilder.buildConstant(LLT::scalar(64), 0);
auto Ins1 = MIRBuilder.buildInsertVectorElement(
- VecTy, Undef, MI.getOperand(1).getReg(), Zero);
+ VecTy, Undef, MI.getOperand(1).getReg(), ZeroIdx);
auto Ins2 = MIRBuilder.buildInsertVectorElement(
- VecTy, Undef, MI.getOperand(2).getReg(), Zero);
+ VecTy, Undef, MI.getOperand(2).getReg(), ZeroIdx);
// Construct the mask.
auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 9bc5815ae05371..a12cac010f036e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -475,6 +476,59 @@ void applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
MI.eraseFromParent();
}
+bool matchNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
+
+ auto ValAndVReg =
+ getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+ return !ValAndVReg;
+}
+
+void applyNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder) {
+ assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
+ bool InsertVal = true;
+ Builder.setInstrAndDebugLoc(MI);
+
+ Register Offset = MI.getOperand(3).getReg();
+ LLT VecTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT EltTy = MRI.getType(MI.getOperand(2).getReg());
+ LLT IdxTy = MRI.getType(MI.getOperand(3).getReg());
+
+ // Create a stack slot and store the vector into it
+ MachineFunction &MF = Builder.getMF();
+ int FrameIdx = MF.getFrameInfo().CreateStackObject(VecTy.getSizeInBytes(),
+ Align(8), false);
+ LLT FramePtrTy = LLT::pointer(0, 64);
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
+ auto StackTemp = Builder.buildFrameIndex(FramePtrTy, FrameIdx);
+
+ Builder.buildStore(MI.getOperand(1), StackTemp, PtrInfo, Align(8));
+
+ // Get the pointer to the element, and be sure not to hit undefined behavior
+ // if the index is out of bounds.
+ assert(isPowerOf2_64(VecTy.getNumElements()) &&
+ "Expected a power-2 vector size");
+ auto Mask = Builder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
+ Register And = Builder.buildAnd(IdxTy, Offset, Mask).getReg(0);
+ auto EltSize = Builder.buildConstant(IdxTy, EltTy.getSizeInBytes());
+ Register Mul = Builder.buildMul(IdxTy, And, EltSize).getReg(0);
+ Register EltPtr =
+ Builder.buildPtrAdd(MRI.getType(StackTemp.getReg(0)), StackTemp, Mul)
+ .getReg(0);
+
+ if (InsertVal) {
+ // Write the inserted element
+ Builder.buildStore(MI.getOperand(2).getReg(), EltPtr, PtrInfo, Align(1));
+
+ // Reload the whole vector.
+ Builder.buildLoad(MI.getOperand(0).getReg(), StackTemp, PtrInfo, Align(8));
+ } else {
+ Builder.buildLoad(MI.getOperand(0).getReg(), EltPtr, PtrInfo, Align(1));
+ }
+ MI.eraseFromParent();
+}
+
/// Match a G_SHUFFLE_VECTOR with a mask which corresponds to a
/// G_INSERT_VECTOR_ELT and G_EXTRACT_VECTOR_ELT pair.
///
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 0a2d695acb4e08..29c320da6c0a74 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -27,22 +27,6 @@ define void @test_write_register_intrin() {
@_ZTIi = external global ptr
declare i32 @__gxx_personality_v0(...)
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(<2 x p0>) = G_INSERT_VECTOR_ELT %0:_, %{{[0-9]+}}:_(p0), %{{[0-9]+}}:_(s32) (in function: vector_of_pointers_insertelement)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_insertelement
-; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_insertelement:
-define void @vector_of_pointers_insertelement() {
- br label %end
-
-block:
- %dummy = insertelement <2 x ptr> %vec, ptr null, i32 0
- store <2 x ptr> %dummy, ptr undef
- ret void
-
-end:
- %vec = load <2 x ptr>, ptr undef
- br label %block
-}
-
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: RET_ReallyLR implicit $x0 (in function: strict_align_feature)
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for strict_align_feature
; FALLBACK-WITH-REPORT-OUT-LABEL: strict_align_feature
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 92ddc6309546f3..a131f35e66d033 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1538,7 +1538,8 @@ define <2 x i32> @test_insertelement(<2 x i32> %vec, i32 %elt, i32 %idx){
; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = COPY $d0
; CHECK: [[ELT:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[IDX:%[0-9]+]]:_(s32) = COPY $w1
-; CHECK: [[RES:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[VEC]], [[ELT]](s32), [[IDX]](s32)
+; CHECK: [[IDX2:%[0-9]+]]:_(s64) = G_ZEXT [[IDX]]
+; CHECK: [[RES:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[VEC]], [[ELT]](s32), [[IDX2]](s64)
; CHECK: $d0 = COPY [[RES]](<2 x s32>)
%res = insertelement <2 x i32> %vec, i32 %elt, i32 %idx
ret <2 x i32> %res
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir
index 86824127132da2..776023376e7332 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fcopysign.mir
@@ -14,9 +14,9 @@ body: |
; CHECK-NEXT: %val:_(s32) = COPY $s0
; CHECK-NEXT: %sign:_(s32) = COPY $s1
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %val(s32), [[C]](s32)
- ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %sign(s32), [[C]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %val(s32), [[C]](s64)
+ ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %sign(s32), [[C]](s64)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
; CHECK-NEXT: [[BSP:%[0-9]+]]:_(<4 x s32>) = G_BSP [[BUILD_VECTOR]], [[IVEC1]], [[IVEC]]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-insert-vector-elt.mir
index d6618d440f42aa..9f9389f2362e07 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-insert-vector-elt.mir
@@ -21,8 +21,8 @@ body: |
%0:gpr(s32) = COPY $w0
%trunc:gpr(s8) = G_TRUNC %0
%1:fpr(<16 x s8>) = COPY $q1
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<16 x s8>) = G_INSERT_VECTOR_ELT %1, %trunc:gpr(s8), %3:gpr(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<16 x s8>) = G_INSERT_VECTOR_ELT %1, %trunc:gpr(s8), %3:gpr(s64)
$q0 = COPY %2(<16 x s8>)
RET_ReallyLR implicit $q0
@@ -51,8 +51,8 @@ body: |
%0:gpr(s32) = COPY $w0
%trunc:gpr(s8) = G_TRUNC %0
%1:fpr(<8 x s8>) = COPY $d0
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<8 x s8>) = G_INSERT_VECTOR_ELT %1, %trunc(s8), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<8 x s8>) = G_INSERT_VECTOR_ELT %1, %trunc(s8), %3(s64)
$d0 = COPY %2(<8 x s8>)
RET_ReallyLR implicit $d0
@@ -78,8 +78,8 @@ body: |
%0:gpr(s32) = COPY $w0
%trunc:gpr(s16) = G_TRUNC %0
%1:fpr(<8 x s16>) = COPY $q1
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<8 x s16>) = G_INSERT_VECTOR_ELT %1, %trunc:gpr(s16), %3:gpr(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<8 x s16>) = G_INSERT_VECTOR_ELT %1, %trunc:gpr(s16), %3:gpr(s64)
$q0 = COPY %2(<8 x s16>)
RET_ReallyLR implicit $q0
@@ -106,8 +106,8 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:fpr(s16) = COPY $h0
%1:fpr(<8 x s16>) = COPY $q1
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<8 x s16>) = G_INSERT_VECTOR_ELT %1, %0(s16), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<8 x s16>) = G_INSERT_VECTOR_ELT %1, %0(s16), %3(s64)
$q0 = COPY %2(<8 x s16>)
RET_ReallyLR implicit $q0
@@ -134,8 +134,8 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:fpr(s32) = COPY $s0
%1:fpr(<4 x s32>) = COPY $q1
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s64)
$q0 = COPY %2(<4 x s32>)
RET_ReallyLR implicit $q0
@@ -160,8 +160,8 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:gpr(s32) = COPY $w0
%1:fpr(<4 x s32>) = COPY $q0
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<4 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s64)
$q0 = COPY %2(<4 x s32>)
RET_ReallyLR implicit $q0
@@ -190,8 +190,8 @@ body: |
%0:gpr(s32) = COPY $w0
%trunc:gpr(s16) = G_TRUNC %0
%1:fpr(<4 x s16>) = COPY $d0
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<4 x s16>) = G_INSERT_VECTOR_ELT %1, %trunc(s16), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<4 x s16>) = G_INSERT_VECTOR_ELT %1, %trunc(s16), %3(s64)
$d0 = COPY %2(<4 x s16>)
RET_ReallyLR implicit $d0
@@ -218,8 +218,8 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:fpr(s64) = COPY $d0
%1:fpr(<2 x s64>) = COPY $q1
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %1, %0(s64), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %1, %0(s64), %3(s64)
$q0 = COPY %2(<2 x s64>)
RET_ReallyLR implicit $q0
@@ -244,8 +244,8 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:gpr(s64) = COPY $x0
%1:fpr(<2 x s64>) = COPY $q0
- %3:gpr(s32) = G_CONSTANT i32 0
- %2:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %1, %0(s64), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 0
+ %2:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %1, %0(s64), %3(s64)
$q0 = COPY %2(<2 x s64>)
RET_ReallyLR implicit $q0
@@ -266,17 +266,17 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr32 = COPY $s0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
- ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.dsub
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.ssub
; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
- ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY]], %subreg.ssub
- ; CHECK-NEXT: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
+ ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
+ ; CHECK-NEXT: [[INSvi32lane:%[0-9]+]]:fpr128 = INSvi32lane [[INSERT_SUBREG1]], 1, [[INSERT_SUBREG]], 0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[INSvi32lane]].dsub
; CHECK-NEXT: $d0 = COPY [[COPY2]]
; CHECK-NEXT: RET_ReallyLR implicit $d0
%0:fpr(s32) = COPY $s0
%1:fpr(<2 x s32>) = COPY $d1
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s64)
$d0 = COPY %2(<2 x s32>)
RET_ReallyLR implicit $d0
@@ -304,8 +304,8 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit $d0
%0:gpr(s32) = COPY $w0
%1:fpr(<2 x s32>) = COPY $d0
- %3:gpr(s32) = G_CONSTANT i32 1
- %2:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s32)
+ %3:gpr(s64) = G_CONSTANT i64 1
+ %2:fpr(<2 x s32>) = G_INSERT_VECTOR_ELT %1, %0(s32), %3(s64)
$d0 = COPY %2(<2 x s32>)
RET_ReallyLR implicit $d0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index b4ddff76f25b87..80ef5ce641504a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -1,8 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI: warning: Instruction selection used fallback path for test_bit_sink_operand
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; BIT Bitwise Insert if True
;
@@ -195,34 +193,62 @@ define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
}
define <4 x i32> @test_bit_sink_operand(<4 x i32> %src, <4 x i32> %dst, <4 x i32> %mask, i32 %scratch) {
-; CHECK-LABEL: test_bit_sink_operand:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: mov w9, wzr
-; CHECK-NEXT: cinc w8, w0, lt
-; CHECK-NEXT: asr w8, w8, #1
-; CHECK-NEXT: .LBB11_1: // %do.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: add x10, sp, #16
-; CHECK-NEXT: mov x11, sp
-; CHECK-NEXT: bfi x10, x9, #2, #2
-; CHECK-NEXT: bfi x11, x9, #2, #2
-; CHECK-NEXT: add w9, w9, #1
-; CHECK-NEXT: cmp w9, #5
-; CHECK-NEXT: str q1, [sp, #16]
-; CHECK-NEXT: str w0, [x10]
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: str w8, [x11]
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK-NEXT: b.ne .LBB11_1
-; CHECK-NEXT: // %bb.2: // %do.end
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bit_sink_operand:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #32
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: cmp w0, #0
+; CHECK-SD-NEXT: mov w9, wzr
+; CHECK-SD-NEXT: cinc w8, w0, lt
+; CHECK-SD-NEXT: asr w8, w8, #1
+; CHECK-SD-NEXT: .LBB11_1: // %do.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: bit v1.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: add x10, sp, #16
+; CHECK-SD-NEXT: mov x11, sp
+; CHECK-SD-NEXT: bfi x10, x9, #2, #2
+; CHECK-SD-NEXT: bfi x11, x9, #2, #2
+; CHECK-SD-NEXT: add w9, w9, #1
+; CHECK-SD-NEXT: cmp w9, #5
+; CHECK-SD-NEXT: str q1, [sp, #16]
+; CHECK-SD-NEXT: str w0, [x10]
+; CHECK-SD-NEXT: ldr q1, [sp, #16]
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: str w8, [x11]
+; CHECK-SD-NEXT: ldr q0, [sp]
+; CHECK-SD-NEXT: b.ne .LBB11_1
+; CHECK-SD-NEXT: // %bb.2: // %do.end
+; CHECK-SD-NEXT: mov v0.16b, v1.16b
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bit_sink_operand:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #32
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: mov w9, #2 // =0x2
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: add x10, sp, #16
+; CHECK-GI-NEXT: sdiv w9, w0, w9
+; CHECK-GI-NEXT: mov x11, sp
+; CHECK-GI-NEXT: .LBB11_1: // %do.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: bit v1.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: mov w12, w8
+; CHECK-GI-NEXT: add w8, w8, #1
+; CHECK-GI-NEXT: and x12, x12, #0x3
+; CHECK-GI-NEXT: cmp w8, #5
+; CHECK-GI-NEXT: str q1, [sp, #16]
+; CHECK-GI-NEXT: str w0, [x10, x12, lsl #2]
+; CHECK-GI-NEXT: ldr q1, [sp, #16]
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: str w9, [x11, x12, lsl #2]
+; CHECK-GI-NEXT: ldr q0, [sp]
+; CHECK-GI-NEXT: b.ne .LBB11_1
+; CHECK-GI-NEXT: // %bb.2: // %do.end
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: add sp, sp, #32
+; CHECK-GI-NEXT: ret
entry:
%0 = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 328b782c14956c..053f4185c00ca3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -4,9 +4,6 @@
; CHECK-GI: warning: Instruction selection used fallback path for test_bitcastv2f32tov1f64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_bitcastv1f64tov2f32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_extracts_inserts_varidx_insert
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_concat_v1i32_undef
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_concat_diff_v1i32_v1i32
define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
; CHECK-LABEL: ins16bw:
@@ -96,36 +93,22 @@ define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
}
define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
-; CHECK-SD-LABEL: ins8h8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v1.h[7], v0.h[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins8h8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.h[7], v2.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins8h8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.h[7], v0.h[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
-; CHECK-SD-LABEL: ins4s4:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins4s4:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins4s4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.s[1], v0.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
@@ -143,18 +126,11 @@ define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
}
define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
-; CHECK-SD-LABEL: ins4f4:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins4f4:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins4f4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v1.s[1], v0.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
@@ -192,40 +168,24 @@ define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
}
define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
-; CHECK-SD-LABEL: ins4h8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.h[7], v0.h[2]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins4h8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.h[7], v2.h[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins4h8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v1.h[7], v0.h[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
-; CHECK-SD-LABEL: ins2s4:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[1]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins2s4:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins2s4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
@@ -244,20 +204,12 @@ define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
}
define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
-; CHECK-SD-LABEL: ins2f4:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[1]
-; CHECK-SD-NEXT: mov v0.16b, v1.16b
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins2f4:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s2, v0.s[1]
-; CHECK-GI-NEXT: mov v0.16b, v1.16b
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins2f4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v1.s[1], v0.s[1]
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
%tmp3 = extractelement <2 x float> %tmp1, i32 1
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
@@ -307,40 +259,24 @@ define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
}
define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
-; CHECK-SD-LABEL: ins8h4:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov v1.h[3], v0.h[2]
-; CHECK-SD-NEXT: fmov d0, d1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins8h4:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov d0, d1
-; CHECK-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins8h4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v1.h[3], v0.h[2]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
-; CHECK-SD-LABEL: ins4s2:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[2]
-; CHECK-SD-NEXT: fmov d0, d1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins4s2:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov d0, d1
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins4s2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v1.s[1], v0.s[2]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
@@ -357,20 +293,12 @@ define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
}
define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
-; CHECK-SD-LABEL: ins4f2:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov v1.s[1], v0.s[2]
-; CHECK-SD-NEXT: fmov d0, d1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins4f2:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov d0, d1
-; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins4f2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v1.s[1], v0.s[2]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
@@ -415,22 +343,13 @@ define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
}
define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
-; CHECK-SD-LABEL: ins4h4:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.h[3], v0.h[2]
-; CHECK-SD-NEXT: fmov d0, d1
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: ins4h4:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov d0, d1
-; CHECK-GI-NEXT: mov v0.h[3], v2.h[0]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: ins4h4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v1.h[3], v0.h[2]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
@@ -1543,21 +1462,38 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
}
define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) {
-; CHECK-LABEL: test_extracts_inserts_varidx_insert:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: bfi x8, x0, #1, #2
-; CHECK-NEXT: str h0, [x8]
-; CHECK-NEXT: ldr d1, [sp, #8]
-; CHECK-NEXT: mov v1.h[1], v0.h[1]
-; CHECK-NEXT: mov v1.h[2], v0.h[2]
-; CHECK-NEXT: mov v1.h[3], v0.h[3]
-; CHECK-NEXT: fmov d0, d1
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_extracts_inserts_varidx_insert:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: bfi x8, x0, #1, #2
+; CHECK-SD-NEXT: str h0, [x8]
+; CHECK-SD-NEXT: ldr d1, [sp, #8]
+; CHECK-SD-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-SD-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-SD-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-SD-NEXT: fmov d0, d1
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_extracts_inserts_varidx_insert:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: add x9, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: str h0, [x9, x8, lsl #1]
+; CHECK-GI-NEXT: ldr d1, [sp, #8]
+; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT: fmov d0, d1
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
%tmp = extractelement <8 x i16> %x, i32 0
%tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 %idx
%tmp3 = extractelement <8 x i16> %x, i32 1
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 794abca1ae421e..83ac870dabb6d0 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -2,44 +2,7 @@
; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for insert_v2f64_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f64_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f64_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2f32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3f32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4f16_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8f16_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16f16_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i8_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i8_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v32i8_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i16_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i16_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v16i16_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v8i32_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i64_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v3i64_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_0
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_2
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v4i64_c
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_0
+; CHECK-GI: warning: Instruction selection used fallback path for extract_v32i8_0
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_2
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v32i8_c
@@ -66,17 +29,29 @@ entry:
}
define <2 x double> @insert_v2f64_c(<2 x double> %a, double %b, i32 %c) {
-; CHECK-LABEL: insert_v2f64_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x0, #3, #1
-; CHECK-NEXT: str d1, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v2f64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #3, #1
+; CHECK-SD-NEXT: str d1, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v2f64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: str d1, [x8, x9, lsl #3]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <2 x double> %a, double %b, i32 %c
ret <2 x double> %d
@@ -114,25 +89,51 @@ entry:
}
define <3 x double> @insert_v3f64_c(<3 x double> %a, double %b, i32 %c) {
-; CHECK-LABEL: insert_v3f64_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: stp q0, q2, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: and x9, x0, #0x3
-; CHECK-NEXT: str d3, [x8, x9, lsl #3]
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK-NEXT: ldr d2, [sp, #16]
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v3f64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: and x9, x0, #0x3
+; CHECK-SD-NEXT: str d3, [x8, x9, lsl #3]
+; CHECK-SD-NEXT: ldr q0, [sp]
+; CHECK-SD-NEXT: ldr d2, [sp, #16]
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3f64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: stp q0, q2, [sp]
+; CHECK-GI-NEXT: str d3, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: ldp q0, q2, [sp]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x double> %a, double %b, i32 %c
ret <3 x double> %d
@@ -161,16 +162,35 @@ entry:
}
define <4 x double> @insert_v4f64_c(<4 x double> %a, double %b, i32 %c) {
-; CHECK-LABEL: insert_v4f64_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x0, #0x3
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str d2, [x9, x8, lsl #3]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v4f64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0x3
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: str d2, [x9, x8, lsl #3]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v4f64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: str d2, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <4 x double> %a, double %b, i32 %c
ret <4 x double> %d
@@ -203,18 +223,31 @@ entry:
}
define <2 x float> @insert_v2f32_c(<2 x float> %a, float %b, i32 %c) {
-; CHECK-LABEL: insert_v2f32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: str d0, [sp, #8]
-; CHECK-NEXT: bfi x8, x0, #2, #1
-; CHECK-NEXT: str s1, [x8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v2f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #1
+; CHECK-SD-NEXT: str s1, [x8]
+; CHECK-SD-NEXT: ldr d0, [sp, #8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v2f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: str s1, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: ldr d0, [sp, #8]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <2 x float> %a, float %b, i32 %c
ret <2 x float> %d
@@ -265,17 +298,29 @@ entry:
}
define <3 x float> @insert_v3f32_c(<3 x float> %a, float %b, i32 %c) {
-; CHECK-LABEL: insert_v3f32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x0, #2, #2
-; CHECK-NEXT: str s1, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v3f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #2
+; CHECK-SD-NEXT: str s1, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: str s1, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x float> %a, float %b, i32 %c
ret <3 x float> %d
@@ -304,17 +349,29 @@ entry:
}
define <4 x float> @insert_v4f32_c(<4 x float> %a, float %b, i32 %c) {
-; CHECK-LABEL: insert_v4f32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x0, #2, #2
-; CHECK-NEXT: str s1, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v4f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #2, #2
+; CHECK-SD-NEXT: str s1, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v4f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: str s1, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <4 x float> %a, float %b, i32 %c
ret <4 x float> %d
@@ -343,16 +400,35 @@ entry:
}
define <8 x float> @insert_v8f32_c(<8 x float> %a, float %b, i32 %c) {
-; CHECK-LABEL: insert_v8f32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x0, #0x7
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str s2, [x9, x8, lsl #2]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v8f32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0x7
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: str s2, [x9, x8, lsl #2]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v8f32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0x7
+; CHECK-GI-NEXT: str s2, [x9, x8, lsl #2]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <8 x float> %a, float %b, i32 %c
ret <8 x float> %d
@@ -385,18 +461,31 @@ entry:
}
define <4 x half> @insert_v4f16_c(<4 x half> %a, half %b, i32 %c) {
-; CHECK-LABEL: insert_v4f16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: str d0, [sp, #8]
-; CHECK-NEXT: bfi x8, x0, #1, #2
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v4f16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x0, #1, #2
+; CHECK-SD-NEXT: str h1, [x8]
+; CHECK-SD-NEXT: ldr d0, [sp, #8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v4f16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: str h1, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: ldr d0, [sp, #8]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <4 x half> %a, half %b, i32 %c
ret <4 x half> %d
@@ -425,17 +514,29 @@ entry:
}
define <8 x half> @insert_v8f16_c(<8 x half> %a, half %b, i32 %c) {
-; CHECK-LABEL: insert_v8f16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x0, #1, #3
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v8f16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x0, #1, #3
+; CHECK-SD-NEXT: str h1, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v8f16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w0
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x7
+; CHECK-GI-NEXT: str h1, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <8 x half> %a, half %b, i32 %c
ret <8 x half> %d
@@ -464,16 +565,35 @@ entry:
}
define <16 x half> @insert_v16f16_c(<16 x half> %a, half %b, i32 %c) {
-; CHECK-LABEL: insert_v16f16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x0, #0xf
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str h2, [x9, x8, lsl #1]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v16f16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x0, #0xf
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: str h2, [x9, x8, lsl #1]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v16f16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w0
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0xf
+; CHECK-GI-NEXT: str h2, [x9, x8, lsl #1]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <16 x half> %a, half %b, i32 %c
ret <16 x half> %d
@@ -504,18 +624,33 @@ entry:
}
define <8 x i8> @insert_v8i8_c(<8 x i8> %a, i8 %b, i32 %c) {
-; CHECK-LABEL: insert_v8i8_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str d0, [sp, #8]
-; CHECK-NEXT: bfxil x8, x1, #0, #3
-; CHECK-NEXT: strb w0, [x8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v8i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfxil x8, x1, #0, #3
+; CHECK-SD-NEXT: strb w0, [x8]
+; CHECK-SD-NEXT: ldr d0, [sp, #8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v8i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: mov w8, #1 // =0x1
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x7
+; CHECK-GI-NEXT: mul x8, x9, x8
+; CHECK-GI-NEXT: add x9, sp, #8
+; CHECK-GI-NEXT: strb w0, [x9, x8]
+; CHECK-GI-NEXT: ldr d0, [sp, #8]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <8 x i8> %a, i8 %b, i32 %c
ret <8 x i8> %d
@@ -542,17 +677,31 @@ entry:
}
define <16 x i8> @insert_v16i8_c(<16 x i8> %a, i8 %b, i32 %c) {
-; CHECK-LABEL: insert_v16i8_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfxil x8, x1, #0, #4
-; CHECK-NEXT: strb w0, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v16i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfxil x8, x1, #0, #4
+; CHECK-SD-NEXT: strb w0, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v16i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: mov w8, #1 // =0x1
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0xf
+; CHECK-GI-NEXT: mul x8, x9, x8
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: strb w0, [x9, x8]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <16 x i8> %a, i8 %b, i32 %c
ret <16 x i8> %d
@@ -579,16 +728,37 @@ entry:
}
define <32 x i8> @insert_v32i8_c(<32 x i8> %a, i8 %b, i32 %c) {
-; CHECK-LABEL: insert_v32i8_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x1, #0x1f
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: strb w0, [x9, x8]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v32i8_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x1, #0x1f
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: strb w0, [x9, x8]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v32i8_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w1
+; CHECK-GI-NEXT: mov x10, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0x1f
+; CHECK-GI-NEXT: lsl x9, x8, #1
+; CHECK-GI-NEXT: sub x8, x9, x8
+; CHECK-GI-NEXT: strb w0, [x10, x8]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <32 x i8> %a, i8 %b, i32 %c
ret <32 x i8> %d
@@ -619,18 +789,31 @@ entry:
}
define <4 x i16> @insert_v4i16_c(<4 x i16> %a, i16 %b, i32 %c) {
-; CHECK-LABEL: insert_v4i16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str d0, [sp, #8]
-; CHECK-NEXT: bfi x8, x1, #1, #2
-; CHECK-NEXT: strh w0, [x8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v4i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x1, #1, #2
+; CHECK-SD-NEXT: strh w0, [x8]
+; CHECK-SD-NEXT: ldr d0, [sp, #8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v4i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: strh w0, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: ldr d0, [sp, #8]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <4 x i16> %a, i16 %b, i32 %c
ret <4 x i16> %d
@@ -657,17 +840,29 @@ entry:
}
define <8 x i16> @insert_v8i16_c(<8 x i16> %a, i16 %b, i32 %c) {
-; CHECK-LABEL: insert_v8i16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x1, #1, #3
-; CHECK-NEXT: strh w0, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v8i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x1, #1, #3
+; CHECK-SD-NEXT: strh w0, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v8i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x7
+; CHECK-GI-NEXT: strh w0, [x8, x9, lsl #1]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <8 x i16> %a, i16 %b, i32 %c
ret <8 x i16> %d
@@ -694,16 +889,35 @@ entry:
}
define <16 x i16> @insert_v16i16_c(<16 x i16> %a, i16 %b, i32 %c) {
-; CHECK-LABEL: insert_v16i16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x1, #0xf
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: strh w0, [x9, x8, lsl #1]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v16i16_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x1, #0xf
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: strh w0, [x9, x8, lsl #1]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v16i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w1
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0xf
+; CHECK-GI-NEXT: strh w0, [x9, x8, lsl #1]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <16 x i16> %a, i16 %b, i32 %c
ret <16 x i16> %d
@@ -734,18 +948,31 @@ entry:
}
define <2 x i32> @insert_v2i32_c(<2 x i32> %a, i32 %b, i32 %c) {
-; CHECK-LABEL: insert_v2i32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: add x8, sp, #8
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str d0, [sp, #8]
-; CHECK-NEXT: bfi x8, x1, #2, #1
-; CHECK-NEXT: str w0, [x8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v2i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: add x8, sp, #8
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str d0, [sp, #8]
+; CHECK-SD-NEXT: bfi x8, x1, #2, #1
+; CHECK-SD-NEXT: str w0, [x8]
+; CHECK-SD-NEXT: ldr d0, [sp, #8]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v2i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: add x8, sp, #8
+; CHECK-GI-NEXT: str d0, [sp, #8]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: str w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: ldr d0, [sp, #8]
+; CHECK-GI-NEXT: add sp, sp, #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <2 x i32> %a, i32 %b, i32 %c
ret <2 x i32> %d
@@ -796,17 +1023,29 @@ entry:
}
define <3 x i32> @insert_v3i32_c(<3 x i32> %a, i32 %b, i32 %c) {
-; CHECK-LABEL: insert_v3i32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x1, #2, #2
-; CHECK-NEXT: str w0, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v3i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x1, #2, #2
+; CHECK-SD-NEXT: str w0, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: str w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 %c
ret <3 x i32> %d
@@ -833,17 +1072,29 @@ entry:
}
define <4 x i32> @insert_v4i32_c(<4 x i32> %a, i32 %b, i32 %c) {
-; CHECK-LABEL: insert_v4i32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x1, #2, #2
-; CHECK-NEXT: str w0, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v4i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x1, #2, #2
+; CHECK-SD-NEXT: str w0, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v4i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x3
+; CHECK-GI-NEXT: str w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <4 x i32> %a, i32 %b, i32 %c
ret <4 x i32> %d
@@ -870,16 +1121,35 @@ entry:
}
define <8 x i32> @insert_v8i32_c(<8 x i32> %a, i32 %b, i32 %c) {
-; CHECK-LABEL: insert_v8i32_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x1, #0x7
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str w0, [x9, x8, lsl #2]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v8i32_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x1, #0x7
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: str w0, [x9, x8, lsl #2]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v8i32_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w1
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0x7
+; CHECK-GI-NEXT: str w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <8 x i32> %a, i32 %b, i32 %c
ret <8 x i32> %d
@@ -906,17 +1176,29 @@ entry:
}
define <2 x i64> @insert_v2i64_c(<2 x i64> %a, i64 %b, i32 %c) {
-; CHECK-LABEL: insert_v2i64_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x8, x1, #3, #1
-; CHECK-NEXT: str x0, [x8]
-; CHECK-NEXT: ldr q0, [sp], #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v2i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: str q0, [sp]
+; CHECK-SD-NEXT: bfi x8, x1, #3, #1
+; CHECK-SD-NEXT: str x0, [x8]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v2i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #16
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: mov w9, w1
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str q0, [sp]
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: str x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT: ldr q0, [sp], #16
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <2 x i64> %a, i64 %b, i32 %c
ret <2 x i64> %d
@@ -953,25 +1235,51 @@ entry:
}
define <3 x i64> @insert_v3i64_c(<3 x i64> %a, i64 %b, i32 %c) {
-; CHECK-LABEL: insert_v3i64_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: stp q0, q2, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: and x9, x1, #0x3
-; CHECK-NEXT: str x0, [x8, x9, lsl #3]
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK-NEXT: ldr d2, [sp, #16]
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v3i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT: stp q0, q2, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-NEXT: and x9, x1, #0x3
+; CHECK-SD-NEXT: str x0, [x8, x9, lsl #3]
+; CHECK-SD-NEXT: ldr q0, [sp]
+; CHECK-SD-NEXT: ldr d2, [sp, #16]
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: add sp, sp, #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v3i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov w8, w1
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: stp q0, q2, [sp]
+; CHECK-GI-NEXT: str x0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: ldp q0, q2, [sp]
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i64> %a, i64 %b, i32 %c
ret <3 x i64> %d
@@ -998,16 +1306,35 @@ entry:
}
define <4 x i64> @insert_v4i64_c(<4 x i64> %a, i64 %b, i32 %c) {
-; CHECK-LABEL: insert_v4i64_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: stp q0, q1, [sp, #-32]!
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: and x8, x1, #0x3
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str x0, [x9, x8, lsl #3]
-; CHECK-NEXT: ldp q0, q1, [sp], #32
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: insert_v4i64_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: and x8, x1, #0x3
+; CHECK-SD-NEXT: mov x9, sp
+; CHECK-SD-NEXT: str x0, [x9, x8, lsl #3]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: insert_v4i64_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: sub x9, sp, #48
+; CHECK-GI-NEXT: mov x29, sp
+; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-GI-NEXT: .cfi_def_cfa w29, 16
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: mov w8, w1
+; CHECK-GI-NEXT: mov x9, sp
+; CHECK-GI-NEXT: stp q0, q1, [sp]
+; CHECK-GI-NEXT: and x8, x8, #0x3
+; CHECK-GI-NEXT: str x0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT: ldp q0, q1, [sp]
+; CHECK-GI-NEXT: mov sp, x29
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%d = insertelement <4 x i64> %a, i64 %b, i32 %c
ret <4 x i64> %d
More information about the llvm-commits
mailing list