[llvm] 1588aab - [AArch64] Generalize integer FPR lane stores for all types (#134117)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 17 01:26:03 PDT 2025
Author: Benjamin Maxwell
Date: 2025-04-17T09:25:57+01:00
New Revision: 1588aab6ed2d02db2ffb23ca7f339d38a4d8c5e9
URL: https://github.com/llvm/llvm-project/commit/1588aab6ed2d02db2ffb23ca7f339d38a4d8c5e9
DIFF: https://github.com/llvm/llvm-project/commit/1588aab6ed2d02db2ffb23ca7f339d38a4d8c5e9.diff
LOG: [AArch64] Generalize integer FPR lane stores for all types (#134117)
This rewrites the fold from #129756 to apply to all types, including
stores of i8s. This required adding a new `aarch64mfp8` MVT to represent
FPR8 types on AArch64, which can be used to extract and store 8-bit
values using b sub-registers.
Follow on from: #129756
Closes: #131793
Added:
Modified:
llvm/include/llvm/CodeGen/ValueTypes.td
llvm/lib/CodeGen/ValueTypes.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64RegisterInfo.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
llvm/test/CodeGen/AArch64/add.ll
llvm/test/CodeGen/AArch64/andorxor.ll
llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
llvm/test/CodeGen/AArch64/arm64-rev.ll
llvm/test/CodeGen/AArch64/arm64-st1.ll
llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
llvm/test/CodeGen/AArch64/concat-vector.ll
llvm/test/CodeGen/AArch64/ctlz.ll
llvm/test/CodeGen/AArch64/ctpop.ll
llvm/test/CodeGen/AArch64/cttz.ll
llvm/test/CodeGen/AArch64/dp1.ll
llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
llvm/test/CodeGen/AArch64/insertextract.ll
llvm/test/CodeGen/AArch64/mul.ll
llvm/test/CodeGen/AArch64/neon-truncstore.ll
llvm/test/CodeGen/AArch64/nontemporal-load.ll
llvm/test/CodeGen/AArch64/pr-cf624b2.ll
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
llvm/test/CodeGen/AArch64/store.ll
llvm/test/CodeGen/AArch64/sub.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
llvm/test/CodeGen/AArch64/tbl-loops.ll
llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
llvm/test/CodeGen/AArch64/usub_sat_vec.ll
llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
llvm/test/CodeGen/AArch64/vec_uaddo.ll
llvm/test/CodeGen/AArch64/vec_umulo.ll
llvm/test/CodeGen/AArch64/vector-compress.ll
llvm/test/CodeGen/AArch64/zext-to-tbl.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index fc1a95e33380b..28216a7a55398 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>;
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
def amdgpuBufferStridedPointer : ValueType<192, 235>;
+def aarch64mfp8 : ValueType<8, 236>; // 8-bit value in FPR (AArch64)
+
let isNormalValueType = false in {
def token : ValueType<0, 504>; // TokenTy
def MetadataVT : ValueType<0, 505> { // Metadata
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 0554b6387c5e6..10970b719fcae 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -198,6 +198,8 @@ std::string EVT::getEVTString() const {
return "amdgpuBufferFatPointer";
case MVT::amdgpuBufferStridedPointer:
return "amdgpuBufferStridedPointer";
+ case MVT::aarch64mfp8:
+ return "aarch64mfp8";
}
}
@@ -221,6 +223,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
case MVT::aarch64svcount:
return TargetExtType::get(Context, "aarch64.svcount");
+ case MVT::aarch64mfp8:
+ return FixedVectorType::get(IntegerType::get(Context, 8), 1);
case MVT::x86amx: return Type::getX86_AMXTy(Context);
case MVT::i64x8: return IntegerType::get(Context, 512);
case MVT::amdgpuBufferFatPointer: return IntegerType::get(Context, 160);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a95d8d343adf2..771eee1b3fecf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -400,6 +400,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
@@ -23930,6 +23931,8 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
static unsigned getFPSubregForVT(EVT VT) {
assert(VT.isSimple() && "Expected simple VT");
switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::aarch64mfp8:
+ return AArch64::bsub;
case MVT::f16:
return AArch64::hsub;
case MVT::f32:
@@ -24019,39 +24022,65 @@ static SDValue performSTORECombine(SDNode *N,
SDValue ExtIdx = Value.getOperand(1);
EVT VectorVT = Vector.getValueType();
EVT ElemVT = VectorVT.getVectorElementType();
- if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
+
+ if (!ValueVT.isInteger())
+ return SDValue();
+
+ // Propagate zero constants (applying this fold may miss optimizations).
+ if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) {
+ SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
+ DAG.ReplaceAllUsesWith(Value, ZeroElt);
return SDValue();
+ }
+
if (ValueVT != MemVT && !ST->isTruncatingStore())
return SDValue();
- // Heuristic: If there are other users of integer scalars extracted from
- // this vector that won't fold into the store -- abandon folding. Applying
- // this fold may extend the vector lifetime and disrupt paired stores.
- for (const auto &Use : Vector->uses()) {
- if (Use.getResNo() != Vector.getResNo())
- continue;
- const SDNode *User = Use.getUser();
- if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- (!User->hasOneUse() ||
- (*User->user_begin())->getOpcode() != ISD::STORE))
- return SDValue();
- }
+ // This could generate an additional extract if the index is non-zero and
+ // the extracted value has multiple uses.
+ auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
+ if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
+ return SDValue();
- EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
- EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
- SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
- SDValue Ext =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
+ // These can lower to st1, which is preferable if we're unlikely to fold the
+ // addressing into the store.
+ if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
+ (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
+ !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
+ return SDValue();
- EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
- if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
- SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
- FPMemVT, Ext);
- return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
- ST->getMemOperand());
+ if (MemVT == MVT::i64 || MemVT == MVT::i32) {
+ // Heuristic: If there are other users of w/x integer scalars extracted
+ // from this vector that won't fold into the store -- abandon folding.
+ // Applying this fold may disrupt paired stores.
+ for (const auto &Use : Vector->uses()) {
+ if (Use.getResNo() != Vector.getResNo())
+ continue;
+ const SDNode *User = Use.getUser();
+ if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ (!User->hasOneUse() ||
+ (*User->user_begin())->getOpcode() != ISD::STORE))
+ return SDValue();
+ }
}
- return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
+ SDValue ExtVector = Vector;
+ if (!ExtCst || !ExtCst->isZero()) {
+ // Handle extracting from lanes != 0.
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Value.getValueType(), Vector, ExtIdx);
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
+ DAG.getUNDEF(VectorVT), Ext, Zero);
+ }
+
+ EVT FPMemVT = MemVT == MVT::i8
+ ? MVT::aarch64mfp8
+ : EVT::getFloatingPointVT(MemVT.getSizeInBits());
+ SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
+ FPMemVT, ExtVector);
+
+ return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
ST->getMemOperand());
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 083b9410f7c11..a060a2f597ccd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3590,7 +3590,7 @@ defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
[(set (f16 FPR16Op:$Rt),
@@ -3778,7 +3778,7 @@ defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
[(set (f16 FPR16Op:$Rt),
@@ -4348,7 +4348,7 @@ defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
[(store (f16 FPR16Op:$Rt),
@@ -4484,7 +4484,7 @@ defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
[(store (f16 FPR16Op:$Rt),
@@ -4604,6 +4604,12 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+// aarch64mfp8 (bsub) stores
+def : Pat<(store aarch64mfp8:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store aarch64mfp8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+ (STRBui FPR8:$Rt, GPR64sp:$Rn, uimm12s1:$offset)>;
+
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
@@ -7245,8 +7251,15 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
// Move elements between vectors
multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE,
- ValueType VTScal, Operand SVEIdxTy, Instruction INS> {
+ ValueType VTScal, Operand SVEIdxTy, Instruction INS, Instruction DUP, SubRegIndex DUPSub> {
// Extracting from the lowest 128-bits of an SVE vector
+ def : Pat<(VT128 (vector_insert undef,
+ (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))),
+ (i64 0))),
+ (INSERT_SUBREG (VT128 (IMPLICIT_DEF)),
+ (DUP (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn),
+ DUPSub)>;
+
def : Pat<(VT128 (vector_insert VT128:$Rn,
(VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))),
(i64 imm:$Immd))),
@@ -7265,6 +7278,11 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE
(i64 imm:$Immd))),
(INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+ def : Pat<(VT128 (vector_insert undef,
+ (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
+ (i64 0))),
+ (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), (DUP V128:$Rn, imm:$Immn), DUPSub)>;
+
def : Pat<(VT128 (vector_insert V128:$src,
(VTScal (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
(i64 imm:$Immd))),
@@ -7287,15 +7305,15 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE
dsub)>;
}
-defm : Neon_INS_elt_pattern<v8f16, v4f16, nxv8f16, f16, VectorIndexH, INSvi16lane>;
-defm : Neon_INS_elt_pattern<v8bf16, v4bf16, nxv8bf16, bf16, VectorIndexH, INSvi16lane>;
-defm : Neon_INS_elt_pattern<v4f32, v2f32, nxv4f32, f32, VectorIndexS, INSvi32lane>;
-defm : Neon_INS_elt_pattern<v2f64, v1f64, nxv2f64, f64, VectorIndexD, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v8f16, v4f16, nxv8f16, f16, VectorIndexH, INSvi16lane, DUPi16, hsub>;
+defm : Neon_INS_elt_pattern<v8bf16, v4bf16, nxv8bf16, bf16, VectorIndexH, INSvi16lane, DUPi16, hsub>;
+defm : Neon_INS_elt_pattern<v4f32, v2f32, nxv4f32, f32, VectorIndexS, INSvi32lane, DUPi32, ssub>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, nxv2f64, f64, VectorIndexD, INSvi64lane, DUPi64, dsub>;
-defm : Neon_INS_elt_pattern<v16i8, v8i8, nxv16i8, i32, VectorIndexB, INSvi8lane>;
-defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi16lane>;
-defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>;
-defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, nxv16i8, i32, VectorIndexB, INSvi8lane, DUPi8, bsub>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi16lane, DUPi16, hsub>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane, DUPi32, ssub>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane, DUPi64, dsub>;
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 40553aff04919..d3252ea54321e 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -497,7 +497,7 @@ def Q30 : AArch64Reg<30, "q30", [D30, D30_HI], ["v30", ""]>, DwarfRegAlias<B30
def Q31 : AArch64Reg<31, "q31", [D31, D31_HI], ["v31", ""]>, DwarfRegAlias<B31>;
}
-def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
+def FPR8 : RegisterClass<"AArch64", [i8, aarch64mfp8], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::FPR8RegClassID, 0, 32>";
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4830aebc2739d..e5d99037b6c63 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3498,6 +3498,22 @@ let Predicates = [HasSVE_or_SME] in {
(EXTRACT_SUBREG ZPR:$Zs, dsub)>;
}
+ multiclass sve_insert_extract_elt<ValueType VT, ValueType VTScalar, Instruction DUP, Operand IdxTy> {
+ // NOP pattern (needed to avoid pointless DUPs being added by the second pattern).
+ def : Pat<(VT (vector_insert undef,
+ (VTScalar (vector_extract VT:$vec, (i64 0))), (i64 0))),
+ (VT $vec)>;
+
+ def : Pat<(VT (vector_insert undef,
+ (VTScalar (vector_extract VT:$vec, (i64 IdxTy:$Idx))), (i64 0))),
+ (DUP ZPR:$vec, IdxTy:$Idx)>;
+ }
+
+ defm : sve_insert_extract_elt<nxv16i8, i32, DUP_ZZI_B, sve_elm_idx_extdup_b>;
+ defm : sve_insert_extract_elt<nxv8i16, i32, DUP_ZZI_H, sve_elm_idx_extdup_h>;
+ defm : sve_insert_extract_elt<nxv4i32, i32, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : sve_insert_extract_elt<nxv2i64, i64, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+
multiclass sve_predicated_add<SDNode extend, int value> {
def : Pat<(nxv16i8 (add ZPR:$op, (extend nxv16i1:$pred))),
(ADD_ZPmZ_B PPR:$pred, ZPR:$op, (DUP_ZI_B value, 0))>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index d39c9bf760621..eb215898a7ad5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
-; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
@@ -106,19 +106,11 @@ entry:
}
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: strb w8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s8:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: strb w8, [x0]
-; STREAMING-COMPAT-NEXT: ret
-
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, z0.b[7]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
store i8 %0, ptr %a, align 1
@@ -128,10 +120,8 @@ entry:
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
-
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
store i8 %0, ptr %a, align 1
@@ -201,6 +191,19 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
ret void
}
+define void @test_str_reduction_i32_to_i8(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ store i8 %trunc, ptr %ptr, align 1
+ ret void
+}
+
define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
; CHECK: // %bb.0:
@@ -242,6 +245,20 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
ret void
}
+define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ %out_ptr = getelementptr inbounds i8, ptr %ptr, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_lane_s32_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -297,19 +314,11 @@ entry:
}
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
-; STREAMING-COMPAT-NEXT: ret
-
+; CHECK-LABEL: test_str_lane_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, z0.b[7]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
@@ -320,10 +329,8 @@ entry:
define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8_negative_offset:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sturb w8, [x0, #-8]
+; CHECK-NEXT: stur b0, [x0, #-8]
; CHECK-NEXT: ret
-
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
@@ -385,6 +392,47 @@ entry:
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -413,3 +461,46 @@ entry:
store i16 %trunc, ptr %out_ptr, align 2
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index fc0ba336b21cc..d5bd1b712a2a6 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -63,10 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -101,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -263,10 +262,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr d0, [x0]
; CHECK-SD-NEXT: ldr d1, [x1]
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i16:
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 0384848082caa..f7df1092287bd 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -183,10 +183,9 @@ define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_v2i8:
@@ -220,10 +219,9 @@ define void @or_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_v2i8:
@@ -257,10 +255,9 @@ define void @xor_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: xor_v2i8:
@@ -295,11 +292,11 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -343,11 +340,11 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -391,11 +388,11 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -802,8 +799,8 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: and x8, x8, x9
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: str w8, [x0]
-; CHECK-SD-NEXT: add x8, x0, #4
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
+; CHECK-SD-NEXT: str h0, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_v3i16:
@@ -839,8 +836,8 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: orr x8, x8, x9
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: str w8, [x0]
-; CHECK-SD-NEXT: add x8, x0, #4
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
+; CHECK-SD-NEXT: str h0, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_v3i16:
@@ -876,8 +873,8 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: eor x8, x8, x9
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: str w8, [x0]
-; CHECK-SD-NEXT: add x8, x0, #4
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
+; CHECK-SD-NEXT: str h0, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: xor_v3i16:
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 2c065e0051cd7..7f2bebf584d8f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -615,11 +615,10 @@ define <1 x i8> @getL() {
; CHECK-NEXT: ; kill
; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF]
-; Ultimately we should generate str b0, but right now, we match the vector
-; variant which does not allow to fold the immediate into the store.
-; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str b0, [x[[LDRGOT_REG]]]
; CHECK-NEXT: ret
-; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
define void @setL(<1 x i8> %t) {
store <1 x i8> %t, ptr @L, align 4
ret void
@@ -678,6 +677,6 @@ if.end.i:
call void (ptr, ...) @callee(ptr @.str.89, ptr @.str.90, double %sub)
unreachable
}
-declare void @callee(ptr nocapture readonly, ...)
+declare void @callee(ptr nocapture readonly, ...)
attributes #0 = { "target-cpu"="cyclone" }
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index c0d91c1e0c836..2a085dc0e72bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -2062,7 +2062,7 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT: mov h2, v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
@@ -2189,7 +2189,7 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s2, v0.s[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
@@ -2252,7 +2252,7 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
;
; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-GI-NEXT: mov d0, v0.d[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index 7d87be0ce8e1c..7721616be436c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -674,16 +674,10 @@ entry:
}
define void @test_vst1_lane_s64(ptr %a, <1 x i64> %b) {
-; CHECK-GI-LABEL: test_vst1_lane_s64:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: str d0, [x0]
-; CHECK-GI-NEXT: ret
-;
-; CHECK-SD-LABEL: test_vst1_lane_s64:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str d0, [x0]
-; CHECK-SD-NEXT: ret
+; CHECK-LABEL: test_vst1_lane_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <1 x i64> %b, i32 0
store i64 %0, ptr %a, align 8
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index 14ab7b5108125..6bdd5f998a3b9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -462,9 +462,9 @@ define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp
; CHECK-SD-LABEL: test_vrev64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: add x8, x1, #2
-; CHECK-SD-NEXT: st1.h { v0 }[5], [x8]
+; CHECK-SD-NEXT: mov h1, v0[5]
; CHECK-SD-NEXT: st1.h { v0 }[6], [x1]
+; CHECK-SD-NEXT: str h1, [x1, #2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vrev64:
diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll
index 02797f3ed186c..d6abf829bc989 100644
--- a/llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -5,10 +5,17 @@
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m3 | FileCheck --check-prefixes=CHECK,EXYNOS %s
define void @st1lane_16b(<16 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane_16b:
-; CHECK: add x8, x0, #1
-; CHECK: st1.b { v0 }[1], [x8]
-
+; SD-CHECK-LABEL: st1lane_16b:
+; SD-CHECK: mov b0, v0[1]
+; SD-CHECK: stur b0, [x0, #1]
+;
+; GI-CHECK-LABEL: st1lane_16b:
+; GI-CHECK: add x8, x0, #1
+; GI-CHECK: st1.b { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_16b:
+; EXYNOS: mov b0, v0[1]
+; EXYNOS: stur b0, [x0, #1]
%ptr = getelementptr i8, ptr %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -16,9 +23,15 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) {
}
define void @st1lane0_16b(<16 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane0_16b:
-; CHECK: add x8, x0, #1
-; CHECK: st1.b { v0 }[0], [x8]
+; SD-CHECK-LABEL: st1lane0_16b:
+; SD-CHECK: stur b0, [x0, #1]
+;
+; GI-CHECK-LABEL: st1lane0_16b:
+; GI-CHECK: add x8, x0, #1
+; GI-CHECK: st1.b { v0 }[0], [x8]
+;
+; EXYNOS-LABEL: st1lane0_16b:
+; EXYNOS: stur b0, [x0, #1]
%ptr = getelementptr i8, ptr %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 0
@@ -27,10 +40,15 @@ define void @st1lane0_16b(<16 x i8> %A, ptr %D) {
}
define void @st1lane0u_16b(<16 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane0u_16b:
-; CHECK: sub x8, x0, #1
-; CHECK: st1.b { v0 }[0], [x8]
-
+; SD-CHECK-LABEL: st1lane0u_16b:
+; SD-CHECK: stur b0, [x0, #-1]
+;
+; GI-CHECK-LABEL: st1lane0u_16b:
+; GI-CHECK: sub x8, x0, #1
+; GI-CHECK: st1.b { v0 }[0], [x8]
+;
+; EXYNOS-LABEL: st1lane0u_16b:
+; EXYNOS: stur b0, [x0, #-1]
%ptr = getelementptr i8, ptr %D, i64 -1
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
@@ -38,10 +56,17 @@ define void @st1lane0u_16b(<16 x i8> %A, ptr %D) {
}
define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_16b:
-; CHECK: add x8, x0, x1
-; CHECK: st1.b { v0 }[1], [x8]
-
+; SD-CHECK-LABEL: st1lane_ro_16b:
+; SD-CHECK: mov b0, v0[1]
+; SD-CHECK: str b0, [x0, x1]
+;
+; GI-CHECK-LABEL: st1lane_ro_16b:
+; GI-CHECK: add x8, x0, x1
+; GI-CHECK: st1.b { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_16b:
+; EXYNOS: mov b0, v0[1]
+; EXYNOS: str b0, [x0, x1]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <16 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -49,10 +74,15 @@ define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
}
define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane0_ro_16b:
-; CHECK: add x8, x0, x1
-; CHECK: st1.b { v0 }[0], [x8]
-
+; SD-CHECK-LABEL: st1lane0_ro_16b:
+; SD-CHECK: str b0, [x0, x1]
+;
+; GI-CHECK-LABEL: st1lane0_ro_16b:
+; GI-CHECK: add x8, x0, x1
+; GI-CHECK: st1.b { v0 }[0], [x8]
+;
+; EXYNOS-LABEL: st1lane0_ro_16b:
+; EXYNOS: str b0, [x0, x1]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
@@ -60,9 +90,17 @@ define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
}
define void @st1lane_8h(<8 x i16> %A, ptr %D) {
-; CHECK-LABEL: st1lane_8h:
-; CHECK: add x8, x0, #2
-; CHECK: st1.h { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_8h:
+; SD-CHECK: mov h0, v0[1]
+; SD-CHECK: str h0, [x0, #2]
+;
+; GI-CHECK-LABEL: st1lane_8h:
+; GI-CHECK: add x8, x0, #2
+; GI-CHECK: st1.h { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_8h:
+; EXYNOS: mov h0, v0[1]
+; EXYNOS: str h0, [x0, #2]
%ptr = getelementptr i16, ptr %D, i64 1
%tmp = extractelement <8 x i16> %A, i32 1
store i16 %tmp, ptr %ptr
@@ -88,9 +126,17 @@ define void @st1lane0u_8h(<8 x i16> %A, ptr %D) {
}
define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_8h:
-; CHECK: add x8, x0, x1, lsl #1
-; CHECK: st1.h { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_ro_8h:
+; SD-CHECK: mov h0, v0[1]
+; SD-CHECK: str h0, [x0, x1, lsl #1]
+;
+; GI-CHECK-LABEL: st1lane_ro_8h:
+; GI-CHECK: add x8, x0, x1, lsl #1
+; GI-CHECK: st1.h { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_8h:
+; EXYNOS: mov h0, v0[1]
+; EXYNOS: str h0, [x0, x1, lsl #1]
%ptr = getelementptr i16, ptr %D, i64 %offset
%tmp = extractelement <8 x i16> %A, i32 1
store i16 %tmp, ptr %ptr
@@ -107,9 +153,17 @@ define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) {
}
define void @st1lane_4s(<4 x i32> %A, ptr %D) {
-; CHECK-LABEL: st1lane_4s:
-; CHECK: add x8, x0, #4
-; CHECK: st1.s { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_4s:
+; SD-CHECK: mov s0, v0[1]
+; SD-CHECK: str s0, [x0, #4]
+;
+; GI-CHECK-LABEL: st1lane_4s:
+; GI-CHECK: add x8, x0, #4
+; GI-CHECK: st1.s { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_4s:
+; EXYNOS: mov s0, v0[1]
+; EXYNOS: str s0, [x0, #4]
%ptr = getelementptr i32, ptr %D, i64 1
%tmp = extractelement <4 x i32> %A, i32 1
store i32 %tmp, ptr %ptr
@@ -135,9 +189,17 @@ define void @st1lane0u_4s(<4 x i32> %A, ptr %D) {
}
define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_4s:
-; CHECK: add x8, x0, x1, lsl #2
-; CHECK: st1.s { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_ro_4s:
+; SD-CHECK: mov s0, v0[1]
+; SD-CHECK: str s0, [x0, x1, lsl #2]
+;
+; GI-CHECK-LABEL: st1lane_ro_4s:
+; GI-CHECK: add x8, x0, x1, lsl #2
+; GI-CHECK: st1.s { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_4s:
+; EXYNOS: mov s0, v0[1]
+; EXYNOS: str s0, [x0, x1, lsl #2]
%ptr = getelementptr i32, ptr %D, i64 %offset
%tmp = extractelement <4 x i32> %A, i32 1
store i32 %tmp, ptr %ptr
@@ -201,9 +263,17 @@ define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) {
}
define void @st1lane_2d(<2 x i64> %A, ptr %D) {
-; CHECK-LABEL: st1lane_2d:
-; CHECK: add x8, x0, #8
-; CHECK: st1.d { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_2d:
+; SD-CHECK: mov d0, v0[1]
+; SD-CHECK: str d0, [x0, #8]
+;
+; GI-CHECK-LABEL: st1lane_2d:
+; GI-CHECK: add x8, x0, #8
+; GI-CHECK: st1.d { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_2d:
+; EXYNOS: mov d0, v0[1]
+; EXYNOS: str d0, [x0, #8]
%ptr = getelementptr i64, ptr %D, i64 1
%tmp = extractelement <2 x i64> %A, i32 1
store i64 %tmp, ptr %ptr
@@ -229,9 +299,17 @@ define void @st1lane0u_2d(<2 x i64> %A, ptr %D) {
}
define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_2d:
-; CHECK: add x8, x0, x1, lsl #3
-; CHECK: st1.d { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_ro_2d:
+; SD-CHECK: mov d0, v0[1]
+; SD-CHECK: str d0, [x0, x1, lsl #3]
+;
+; GI-CHECK-LABEL: st1lane_ro_2d:
+; GI-CHECK: add x8, x0, x1, lsl #3
+; GI-CHECK: st1.d { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_2d:
+; EXYNOS: mov d0, v0[1]
+; EXYNOS: str d0, [x0, x1, lsl #3]
%ptr = getelementptr i64, ptr %D, i64 %offset
%tmp = extractelement <2 x i64> %A, i32 1
store i64 %tmp, ptr %ptr
@@ -295,10 +373,17 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) {
}
define void @st1lane_8b(<8 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane_8b:
-; CHECK: add x8, x0, #1
-; CHECK: st1.b { v0 }[1], [x8]
-
+; SD-CHECK-LABEL: st1lane_8b:
+; SD-CHECK: mov b0, v0[1]
+; SD-CHECK: stur b0, [x0, #1]
+;
+; GI-CHECK-LABEL: st1lane_8b:
+; GI-CHECK: add x8, x0, #1
+; GI-CHECK: st1.b { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_8b:
+; EXYNOS: mov b0, v0[1]
+; EXYNOS: stur b0, [x0, #1]
%ptr = getelementptr i8, ptr %D, i64 1
%tmp = extractelement <8 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -306,10 +391,17 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) {
}
define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_8b:
-; CHECK: add x8, x0, x1
-; CHECK: st1.b { v0 }[1], [x8]
-
+; SD-CHECK-LABEL: st1lane_ro_8b:
+; SD-CHECK: mov b0, v0[1]
+; SD-CHECK: str b0, [x0, x1]
+;
+; GI-CHECK-LABEL: st1lane_ro_8b:
+; GI-CHECK: add x8, x0, x1
+; GI-CHECK: st1.b { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_8b:
+; EXYNOS: mov b0, v0[1]
+; EXYNOS: str b0, [x0, x1]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <8 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -317,10 +409,15 @@ define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
}
define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane0_ro_8b:
-; CHECK: add x8, x0, x1
-; CHECK: st1.b { v0 }[0], [x8]
-
+; SD-CHECK-LABEL: st1lane0_ro_8b:
+; SD-CHECK: str b0, [x0, x1]
+;
+; GI-CHECK-LABEL: st1lane0_ro_8b:
+; GI-CHECK: add x8, x0, x1
+; GI-CHECK: st1.b { v0 }[0], [x8]
+;
+; EXYNOS-LABEL: st1lane0_ro_8b:
+; EXYNOS: str b0, [x0, x1]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <8 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
@@ -328,9 +425,17 @@ define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
}
define void @st1lane_4h(<4 x i16> %A, ptr %D) {
-; CHECK-LABEL: st1lane_4h:
-; CHECK: add x8, x0, #2
-; CHECK: st1.h { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_4h:
+; SD-CHECK: mov h0, v0[1]
+; SD-CHECK: str h0, [x0, #2]
+;
+; GI-CHECK-LABEL: st1lane_4h:
+; GI-CHECK: add x8, x0, #2
+; GI-CHECK: st1.h { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_4h:
+; EXYNOS: mov h0, v0[1]
+; EXYNOS: str h0, [x0, #2]
%ptr = getelementptr i16, ptr %D, i64 1
%tmp = extractelement <4 x i16> %A, i32 1
store i16 %tmp, ptr %ptr
@@ -356,9 +461,17 @@ define void @st1lane0u_4h(<4 x i16> %A, ptr %D) {
}
define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_4h:
-; CHECK: add x8, x0, x1, lsl #1
-; CHECK: st1.h { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_ro_4h:
+; SD-CHECK: mov h0, v0[1]
+; SD-CHECK: str h0, [x0, x1, lsl #1]
+;
+; GI-CHECK-LABEL: st1lane_ro_4h:
+; GI-CHECK: add x8, x0, x1, lsl #1
+; GI-CHECK: st1.h { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_4h:
+; EXYNOS: mov h0, v0[1]
+; EXYNOS: str h0, [x0, x1, lsl #1]
%ptr = getelementptr i16, ptr %D, i64 %offset
%tmp = extractelement <4 x i16> %A, i32 1
store i16 %tmp, ptr %ptr
@@ -375,9 +488,17 @@ define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) {
}
define void @st1lane_2s(<2 x i32> %A, ptr %D) {
-; CHECK-LABEL: st1lane_2s:
-; CHECK: add x8, x0, #4
-; CHECK: st1.s { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_2s:
+; SD-CHECK: mov s0, v0[1]
+; SD-CHECK: str s0, [x0, #4]
+;
+; GI-CHECK-LABEL: st1lane_2s:
+; GI-CHECK: add x8, x0, #4
+; GI-CHECK: st1.s { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_2s:
+; EXYNOS: mov s0, v0[1]
+; EXYNOS: str s0, [x0, #4]
%ptr = getelementptr i32, ptr %D, i64 1
%tmp = extractelement <2 x i32> %A, i32 1
store i32 %tmp, ptr %ptr
@@ -403,9 +524,17 @@ define void @st1lane0u_2s(<2 x i32> %A, ptr %D) {
}
define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_2s:
-; CHECK: add x8, x0, x1, lsl #2
-; CHECK: st1.s { v0 }[1], [x8]
+; SD-CHECK-LABEL: st1lane_ro_2s:
+; SD-CHECK: mov s0, v0[1]
+; SD-CHECK: str s0, [x0, x1, lsl #2]
+;
+; GI-CHECK-LABEL: st1lane_ro_2s:
+; GI-CHECK: add x8, x0, x1, lsl #2
+; GI-CHECK: st1.s { v0 }[1], [x8]
+;
+; EXYNOS-LABEL: st1lane_ro_2s:
+; EXYNOS: mov s0, v0[1]
+; EXYNOS: str s0, [x0, x1, lsl #2]
%ptr = getelementptr i32, ptr %D, i64 %offset
%tmp = extractelement <2 x i32> %A, i32 1
store i32 %tmp, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
index aff3ffc70a711..2866214e1e473 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
@@ -1,14 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s
; Part of PR21549: going through the stack isn't ideal but is correct.
define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) {
-; CHECK-LABEL: test_bitcast_v2i8_to_i16
-; CHECK: mov.s [[WREG_HI:w[0-9]+]], v0[1]
-; CHECK-NEXT: fmov [[WREG_LO:w[0-9]+]], s0
-; CHECK-NEXT: strb [[WREG_HI]], [sp, #15]
-; CHECK-NEXT: strb [[WREG_LO]], [sp, #14]
-; CHECK-NEXT: ldrh w0, [sp, #14]
+; CHECK-LABEL: test_bitcast_v2i8_to_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov s1, v0[1]
+; CHECK-NEXT: str b0, [sp, #14]
+; CHECK-NEXT: stur b1, [sp, #15]
+; CHECK-NEXT: ldrh w0, [sp, #14]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
%aa = bitcast <2 x i8> %a to i16
ret i16 %aa
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 0daa6e7f16202..05a40453833ee 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -480,7 +480,7 @@ define <2 x i64> @concat_high_high_v2i64(<2 x i64> %a_vec, <2 x i64> %b_vec) {
;
; CHECK-GI-LABEL: concat_high_high_v2i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], v0.d[1]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[1]
; CHECK-GI-NEXT: ret
entry:
@@ -498,7 +498,7 @@ define <2 x double> @concat_high_high_v2f64(<2 x double> %a_vec, <2 x double> %b
;
; CHECK-GI-LABEL: concat_high_high_v2f64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.d[0], v0.d[1]
+; CHECK-GI-NEXT: mov d0, v0.d[1]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[1]
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 742433c50d390..fcd1fa2983420 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -13,10 +13,9 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: mov v1.s[1], w9
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -49,11 +48,11 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: clz v1.4h, v1.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -179,10 +178,10 @@ define void @v3i16(ptr %p1) {
; CHECK-SD-LABEL: v3i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: clz v0.4h, v0.4h
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i16:
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index c7c378d3e67cd..10ec1d0c1982a 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -13,10 +13,9 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -48,11 +47,11 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -178,11 +177,11 @@ define void @v3i16(ptr %p1) {
; CHECK-SD-LABEL: v3i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i16:
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index 41843e03cb81e..60125f8a19811 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -16,10 +16,9 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: movi v1.2s, #32
; CHECK-SD-NEXT: clz v0.2s, v0.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -59,11 +58,11 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: clz v0.4h, v0.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -263,14 +262,14 @@ define void @v3i16(ptr %p1) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v0.4h, #1
; CHECK-SD-NEXT: ldr d1, [x0]
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: bic v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: movi v1.4h, #16
; CHECK-SD-NEXT: clz v0.4h, v0.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i16:
diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll
index 4f48aac72ebc3..e904f4b6d247a 100644
--- a/llvm/test/CodeGen/AArch64/dp1.ll
+++ b/llvm/test/CodeGen/AArch64/dp1.ll
@@ -205,8 +205,7 @@ define void @ctpop_i32() {
; CHECK-SDAG-NEXT: fmov d0, x9
; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
; CHECK-SDAG-NEXT: addv b0, v0.8b
-; CHECK-SDAG-NEXT: fmov w9, s0
-; CHECK-SDAG-NEXT: str w9, [x8]
+; CHECK-SDAG-NEXT: str s0, [x8]
; CHECK-SDAG-NEXT: ret
;
; CHECK-GISEL-LABEL: ctpop_i32:
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
index 8345fdfa46b4c..f076ee12427d8 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -184,17 +184,16 @@ define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #235
-; CHECK-NEXT: adrp x9, .LCPI8_0
+; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0]
; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0]
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v1.4h, v0.4s
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: umov w9, v1.h[1]
-; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: str b0, [x8]
; CHECK-NEXT: and w0, w9, #0x1
-; CHECK-NEXT: strb w10, [x8]
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> %a, splat(i32 235)
%ext = extractelement <4 x i1> %icmp, i32 1
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 54ee693db1239..aa4f31fb5f53e 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -271,7 +271,7 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3f32_2:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s2, v0.s[0]
; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-GI-NEXT: mov v2.s[1], v0.s[1]
; CHECK-GI-NEXT: mov v2.s[2], v1.s[0]
@@ -992,7 +992,7 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3i32_2:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT: mov s1, v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: mov v1.s[2], w0
; CHECK-GI-NEXT: mov v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 500379d1cfdec..8d9a6e6b92914 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -75,10 +75,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -113,11 +112,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -275,10 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr d0, [x0]
; CHECK-SD-NEXT: ldr d1, [x1]
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i16:
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index 3d3362d314a99..f25c90af7968e 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -89,10 +89,9 @@ define void @v2i32_v2i8(<2 x i32> %a, ptr %result) {
; CHECK-LABEL: v2i32_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w9, [x0]
-; CHECK-NEXT: strb w8, [x0, #1]
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: stur b1, [x0, #1]
; CHECK-NEXT: ret
%b = trunc <2 x i32> %a to <2 x i8>
store <2 x i8> %b, ptr %result
@@ -156,10 +155,9 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) {
; CHECK-LABEL: v2i16_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w9, [x0]
-; CHECK-NEXT: strb w8, [x0, #1]
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: stur b1, [x0, #1]
; CHECK-NEXT: ret
%b = trunc <2 x i16> %a to <2 x i8>
store <2 x i8> %b, ptr %result
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index 959ac7f68e351..adb209c0c6348 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -449,10 +449,9 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) {
; CHECK-LABEL: test_ldnp_v33i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldnp q0, q1, [x0]
-; CHECK-NEXT: add x9, x8, #32
; CHECK-NEXT: ldr b2, [x0, #32]
; CHECK-NEXT: stp q0, q1, [x8]
-; CHECK-NEXT: st1.b { v2 }[0], [x9]
+; CHECK-NEXT: stur b2, [x8, #32]
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v33i8:
diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
index ea9588e9e3db7..02375b07b3482 100644
--- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
+++ b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
@@ -11,45 +11,31 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov b1, v0.b[7]
+; CHECK-NEXT: mov b2, v0.b[6]
+; CHECK-NEXT: stur b0, [sp, #15]
+; CHECK-NEXT: stur b0, [sp, #14]
+; CHECK-NEXT: stur b0, [sp, #13]
+; CHECK-NEXT: stur b0, [sp, #12]
+; CHECK-NEXT: stur b1, [sp, #7]
+; CHECK-NEXT: mov b1, v0.b[5]
+; CHECK-NEXT: stur b2, [sp, #6]
+; CHECK-NEXT: mov b2, v0.b[4]
+; CHECK-NEXT: stur b0, [sp, #11]
+; CHECK-NEXT: stur b0, [sp, #10]
+; CHECK-NEXT: stur b1, [sp, #5]
+; CHECK-NEXT: mov b1, v0.b[3]
+; CHECK-NEXT: stur b0, [sp, #9]
+; CHECK-NEXT: stur b2, [sp, #4]
+; CHECK-NEXT: mov b2, v0.b[2]
+; CHECK-NEXT: str b0, [sp]
+; CHECK-NEXT: mov b0, v0.b[1]
+; CHECK-NEXT: stur b1, [sp, #3]
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: orr x9, x8, #0xf
-; CHECK-NEXT: orr x10, x8, #0xe
-; CHECK-NEXT: st1 { v0.b }[0], [x8]
-; CHECK-NEXT: st1 { v0.b }[15], [x9]
-; CHECK-NEXT: orr x9, x8, #0xc
-; CHECK-NEXT: st1 { v0.b }[12], [x9]
-; CHECK-NEXT: orr x9, x8, #0x8
-; CHECK-NEXT: st1 { v0.b }[8], [x9]
-; CHECK-NEXT: orr x9, x8, #0x7
-; CHECK-NEXT: st1 { v0.b }[7], [x9]
-; CHECK-NEXT: orr x9, x8, #0x6
-; CHECK-NEXT: st1 { v0.b }[6], [x9]
-; CHECK-NEXT: orr x9, x8, #0x4
-; CHECK-NEXT: st1 { v0.b }[4], [x9]
-; CHECK-NEXT: orr x9, x8, #0x3
-; CHECK-NEXT: st1 { v0.b }[3], [x9]
-; CHECK-NEXT: orr x9, x8, #0x2
-; CHECK-NEXT: st1 { v0.b }[14], [x10]
-; CHECK-NEXT: mov w10, #13 // =0xd
-; CHECK-NEXT: st1 { v0.b }[2], [x9]
-; CHECK-NEXT: orr x9, x8, #0x1
-; CHECK-NEXT: st1 { v0.b }[1], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #11 // =0xb
-; CHECK-NEXT: st1 { v0.b }[13], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #10 // =0xa
-; CHECK-NEXT: st1 { v0.b }[11], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #9 // =0x9
-; CHECK-NEXT: st1 { v0.b }[10], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: st1 { v0.b }[9], [x9]
-; CHECK-NEXT: mov w9, #5 // =0x5
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: st1 { v0.b }[5], [x8]
+; CHECK-NEXT: stur b2, [sp, #2]
+; CHECK-NEXT: stur b0, [sp, #8]
+; CHECK-NEXT: stur b0, [sp, #1]
; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: add sp, sp, #16
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 4d76994be204f..d54dde3c86364 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -200,10 +200,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x2]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -325,7 +324,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: sqadd v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
index c0a728014e390..950ac92a8b12f 100644
--- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -12,8 +12,7 @@ define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
%tst = icmp eq <4 x i22> %l, %r
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index ae2a16929e254..dc39ad0571b14 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -201,10 +201,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x2]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -326,7 +325,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: sqsub v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 37a6ad08d4cb3..3a9f12b838702 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -110,10 +110,9 @@ define void @store_v2i8(<2 x i8> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v2i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v2i8:
@@ -231,12 +230,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){
define void @store_v7i8(<7 x i8> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v7i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: add x8, x0, #6
-; CHECK-SD-NEXT: add x9, x0, #4
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov b1, v0.b[6]
+; CHECK-SD-NEXT: mov h2, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
-; CHECK-SD-NEXT: st1 { v0.b }[6], [x8]
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-SD-NEXT: stur b1, [x0, #6]
+; CHECK-SD-NEXT: str h2, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v7i8:
@@ -263,10 +262,10 @@ define void @store_v7i8(<7 x i8> %a, ptr %ptr){
define void @store_v3i16(<3 x i16> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v3i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v3i16:
@@ -285,11 +284,11 @@ define void @store_v3i16(<3 x i16> %a, ptr %ptr){
define void @store_v7i16(<7 x i16> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v7i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: add x8, x0, #12
-; CHECK-SD-NEXT: add x9, x0, #8
+; CHECK-SD-NEXT: mov h1, v0.h[6]
+; CHECK-SD-NEXT: mov s2, v0.s[2]
; CHECK-SD-NEXT: str d0, [x0]
-; CHECK-SD-NEXT: st1 { v0.h }[6], [x8]
-; CHECK-SD-NEXT: st1 { v0.s }[2], [x9]
+; CHECK-SD-NEXT: str h1, [x0, #12]
+; CHECK-SD-NEXT: str s2, [x0, #8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v7i16:
@@ -315,9 +314,9 @@ define void @store_v7i16(<7 x i16> %a, ptr %ptr){
define void @store_v3i32(<3 x i32> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v3i32:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: add x8, x0, #8
+; CHECK-SD-NEXT: mov s1, v0.s[2]
; CHECK-SD-NEXT: str d0, [x0]
-; CHECK-SD-NEXT: st1 { v0.s }[2], [x8]
+; CHECK-SD-NEXT: str s1, [x0, #8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v3i32:
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 8183a82f21cb5..7a436eddb23a6 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -63,10 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x0]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -101,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov h0, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -263,10 +262,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr d0, [x0]
; CHECK-SD-NEXT: ldr d1, [x1]
-; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: mov h1, v0.h[2]
; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: str h1, [x0, #4]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
index 797f953591b11..0d0b5cbc776c4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -376,43 +376,39 @@ define void @test_revv8i16v8i16(ptr %a, ptr %b, ptr %c) #1 {
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: orr x9, x8, #0x1e
-; CHECK-NEXT: orr x10, x8, #0x1c
+; CHECK-NEXT: ldr q5, [x0]
+; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: st1 { v0.h }[4], [x9]
-; CHECK-NEXT: orr x9, x8, #0x18
-; CHECK-NEXT: st1 { v0.h }[7], [x9]
-; CHECK-NEXT: orr x9, x8, #0xe
-; CHECK-NEXT: st1 { v1.h }[4], [x9]
-; CHECK-NEXT: orr x9, x8, #0xc
-; CHECK-NEXT: st1 { v1.h }[5], [x9]
-; CHECK-NEXT: orr x9, x8, #0x8
-; CHECK-NEXT: st1 { v0.h }[5], [x10]
-; CHECK-NEXT: orr x10, x8, #0x10
-; CHECK-NEXT: st1 { v1.h }[7], [x9]
-; CHECK-NEXT: orr x9, x8, #0x4
-; CHECK-NEXT: st1 { v0.h }[3], [x10]
-; CHECK-NEXT: mov w10, #26 // =0x1a
-; CHECK-NEXT: st1 { v1.h }[1], [x9]
-; CHECK-NEXT: orr x9, x8, #0x2
-; CHECK-NEXT: st1 { v1.h }[2], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #20 // =0x14
-; CHECK-NEXT: st1 { v0.h }[6], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #18 // =0x12
-; CHECK-NEXT: st1 { v0.h }[1], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: st1 { v0.h }[2], [x9]
-; CHECK-NEXT: mov w9, #10 // =0xa
-; CHECK-NEXT: orr x9, x8, x9
-; CHECK-NEXT: st1 { v1.h }[3], [x8]
-; CHECK-NEXT: st1 { v1.h }[6], [x9]
+; CHECK-NEXT: mov h1, v0.h[4]
+; CHECK-NEXT: mov h2, v0.h[5]
+; CHECK-NEXT: mov h3, v0.h[6]
+; CHECK-NEXT: mov h4, v0.h[7]
; CHECK-NEXT: str h0, [sp, #22]
-; CHECK-NEXT: str h1, [sp, #6]
+; CHECK-NEXT: st1 { v5.h }[3], [x8]
+; CHECK-NEXT: str h5, [sp, #6]
+; CHECK-NEXT: str h1, [sp, #30]
+; CHECK-NEXT: mov h1, v0.h[1]
+; CHECK-NEXT: str h2, [sp, #28]
+; CHECK-NEXT: mov h2, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: str h3, [sp, #26]
+; CHECK-NEXT: mov h3, v5.h[2]
+; CHECK-NEXT: str h4, [sp, #24]
+; CHECK-NEXT: str h1, [sp, #20]
+; CHECK-NEXT: mov h1, v5.h[4]
+; CHECK-NEXT: str h2, [sp, #18]
+; CHECK-NEXT: mov h2, v5.h[5]
+; CHECK-NEXT: str h0, [sp, #16]
+; CHECK-NEXT: mov h0, v5.h[6]
+; CHECK-NEXT: str h3, [sp, #2]
+; CHECK-NEXT: str h1, [sp, #14]
+; CHECK-NEXT: mov h1, v5.h[7]
+; CHECK-NEXT: str h2, [sp, #12]
+; CHECK-NEXT: mov h2, v5.h[1]
+; CHECK-NEXT: str h0, [sp, #10]
+; CHECK-NEXT: str h1, [sp, #8]
+; CHECK-NEXT: str h2, [sp, #4]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: st1h { z0.h }, p0, [x2]
; CHECK-NEXT: mov sp, x29
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index aa1adfd306a4c..e8c9704940c70 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -119,11 +119,11 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: bl def
; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: ldp q0, q2, [sp]
+; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [x19, #8]
+; CHECK-NEXT: ldr q1, [sp, #16]
+; CHECK-NEXT: stur b1, [x19, #8]
; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index d9f8482a3c503..b1ac9469c0573 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -20,9 +20,8 @@ define <2 x i64> @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uaddv d0, p0, z0.d
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: str b0, [sp, #12]
+; CHECK-NEXT: ldrb w8, [sp, #12]
; CHECK-NEXT: tbz w8, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %cond.load
; CHECK-NEXT: fmov x9, d1
@@ -109,11 +108,10 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: and z1.d, z2.d, z1.d
-; CHECK-NEXT: uaddv d1, p0, z1.d
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: uaddv d2, p0, z1.d
; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: str b2, [sp, #12]
+; CHECK-NEXT: ldrb w8, [sp, #12]
; CHECK-NEXT: tbnz w8, #0, .LBB1_3
; CHECK-NEXT: // %bb.1: // %else
; CHECK-NEXT: tbnz w8, #1, .LBB1_4
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index a20a330b39bb4..3d9f407c3064c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -159,26 +159,26 @@ define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK-NEXT: mov z4.h, z3.h[7]
; CHECK-NEXT: mov z6.h, z3.h[6]
; CHECK-NEXT: mov z16.h, z3.h[5]
-; CHECK-NEXT: mov z20.h, z2.h[7]
-; CHECK-NEXT: mov z21.h, z1.h[7]
; CHECK-NEXT: mov z18.h, z3.h[4]
; CHECK-NEXT: mov z19.h, z0.h[4]
+; CHECK-NEXT: mov z20.h, z2.h[7]
+; CHECK-NEXT: mov z21.h, z1.h[7]
; CHECK-NEXT: mov z22.h, z2.h[6]
; CHECK-NEXT: mov z23.h, z1.h[6]
; CHECK-NEXT: zip1 z24.h, z5.h, z4.h
; CHECK-NEXT: zip1 z25.h, z7.h, z6.h
-; CHECK-NEXT: zip1 z17.h, z17.h, z16.h
+; CHECK-NEXT: zip1 z16.h, z17.h, z16.h
; CHECK-NEXT: ldp q4, q6, [x0, #32]
-; CHECK-NEXT: zip1 z16.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z17.h, z19.h, z18.h
; CHECK-NEXT: ldp q5, q7, [x1, #32]
-; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
-; CHECK-NEXT: zip1 z19.s, z25.s, z24.s
+; CHECK-NEXT: zip1 z18.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z21.s, z25.s, z24.s
; CHECK-NEXT: zip1 z22.h, z23.h, z22.h
; CHECK-NEXT: mov z23.h, z2.h[5]
-; CHECK-NEXT: mov z21.h, z6.h[7]
+; CHECK-NEXT: mov z20.h, z6.h[7]
; CHECK-NEXT: mov z24.h, z1.h[5]
; CHECK-NEXT: mov z25.h, z2.h[4]
-; CHECK-NEXT: mov z20.h, z7.h[7]
+; CHECK-NEXT: mov z19.h, z7.h[7]
; CHECK-NEXT: mov z26.h, z1.h[4]
; CHECK-NEXT: mov z27.h, z6.h[6]
; CHECK-NEXT: mov z28.h, z7.h[5]
@@ -187,8 +187,8 @@ define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK-NEXT: mov z31.h, z6.h[4]
; CHECK-NEXT: mov z8.h, z5.h[7]
; CHECK-NEXT: mov z9.h, z4.h[7]
-; CHECK-NEXT: zip1 z20.h, z21.h, z20.h
-; CHECK-NEXT: mov z21.h, z7.h[6]
+; CHECK-NEXT: zip1 z19.h, z20.h, z19.h
+; CHECK-NEXT: mov z20.h, z7.h[6]
; CHECK-NEXT: mov z10.h, z5.h[6]
; CHECK-NEXT: mov z11.h, z4.h[6]
; CHECK-NEXT: mov z12.h, z5.h[5]
@@ -196,7 +196,7 @@ define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK-NEXT: mov z14.h, z5.h[4]
; CHECK-NEXT: mov z15.h, z4.h[4]
; CHECK-NEXT: zip1 z23.h, z24.h, z23.h
-; CHECK-NEXT: zip1 z21.h, z27.h, z21.h
+; CHECK-NEXT: zip1 z20.h, z27.h, z20.h
; CHECK-NEXT: zip1 z27.h, z29.h, z28.h
; CHECK-NEXT: zip1 z28.h, z31.h, z30.h
; CHECK-NEXT: zip1 z24.h, z26.h, z25.h
@@ -207,23 +207,23 @@ define void @zip_v32i16(ptr %a, ptr %b) {
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: zip1 z30.h, z15.h, z14.h
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: zip1 z17.s, z18.s, z17.s
-; CHECK-NEXT: zip1 z18.s, z21.s, z20.s
-; CHECK-NEXT: zip1 z20.s, z28.s, z27.s
-; CHECK-NEXT: zip1 z16.s, z22.s, z16.s
-; CHECK-NEXT: zip1 z21.s, z24.s, z23.s
+; CHECK-NEXT: zip1 z16.s, z17.s, z16.s
+; CHECK-NEXT: zip1 z17.s, z20.s, z19.s
+; CHECK-NEXT: zip1 z19.s, z28.s, z27.s
+; CHECK-NEXT: zip1 z18.s, z22.s, z18.s
+; CHECK-NEXT: zip1 z20.s, z24.s, z23.s
; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
; CHECK-NEXT: zip1 z3.s, z26.s, z25.s
; CHECK-NEXT: zip1 z22.s, z30.s, z29.s
; CHECK-NEXT: zip1 z6.h, z6.h, z7.h
-; CHECK-NEXT: zip1 z7.d, z17.d, z19.d
-; CHECK-NEXT: zip1 z17.d, z20.d, z18.d
+; CHECK-NEXT: zip1 z7.d, z16.d, z21.d
+; CHECK-NEXT: zip1 z16.d, z19.d, z17.d
; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
; CHECK-NEXT: zip1 z2.h, z4.h, z5.h
-; CHECK-NEXT: zip1 z4.d, z21.d, z16.d
+; CHECK-NEXT: zip1 z4.d, z20.d, z18.d
; CHECK-NEXT: zip1 z3.d, z22.d, z3.d
; CHECK-NEXT: add z0.h, z0.h, z6.h
-; CHECK-NEXT: add z5.h, z7.h, z17.h
+; CHECK-NEXT: add z5.h, z7.h, z16.h
; CHECK-NEXT: add z1.h, z1.h, z2.h
; CHECK-NEXT: add z2.h, z4.h, z3.h
; CHECK-NEXT: stp q0, q5, [x0, #32]
@@ -1476,44 +1476,44 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
; CHECK-NEXT: zip1 z20.b, z24.b, z23.b
; CHECK-NEXT: zip1 z21.b, z26.b, z25.b
; CHECK-NEXT: zip1 z22.b, z28.b, z27.b
+; CHECK-NEXT: zip1 z23.b, z17.b, z29.b
; CHECK-NEXT: mov z24.b, z2.b[14]
; CHECK-NEXT: mov z25.b, z2.b[12]
; CHECK-NEXT: mov z26.b, z2.b[10]
; CHECK-NEXT: mov z27.b, z2.b[8]
-; CHECK-NEXT: zip1 z23.b, z17.b, z29.b
; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
; CHECK-NEXT: zip1 z4.h, z6.h, z5.h
; CHECK-NEXT: zip1 z5.h, z7.h, z18.h
; CHECK-NEXT: zip1 z6.h, z19.h, z16.h
; CHECK-NEXT: zip1 z7.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z16.h, z23.h, z22.h
; CHECK-NEXT: zip1 z18.b, z25.b, z24.b
; CHECK-NEXT: zip1 z19.b, z27.b, z26.b
; CHECK-NEXT: mov z20.b, z2.b[6]
; CHECK-NEXT: mov z21.b, z2.b[4]
+; CHECK-NEXT: mov z23.b, z17.b[15]
+; CHECK-NEXT: mov z24.b, z17.b[13]
; CHECK-NEXT: mov z29.b, z17.b[3]
; CHECK-NEXT: mov z30.b, z17.b[1]
; CHECK-NEXT: mov z31.b, z2.b[15]
; CHECK-NEXT: mov z8.b, z2.b[13]
-; CHECK-NEXT: zip1 z16.h, z23.h, z22.h
; CHECK-NEXT: mov z22.b, z2.b[2]
-; CHECK-NEXT: mov z23.b, z17.b[15]
-; CHECK-NEXT: mov z24.b, z17.b[13]
; CHECK-NEXT: mov z25.b, z17.b[11]
; CHECK-NEXT: mov z26.b, z17.b[9]
; CHECK-NEXT: mov z27.b, z17.b[7]
; CHECK-NEXT: mov z28.b, z17.b[5]
; CHECK-NEXT: zip1 z17.h, z19.h, z18.h
-; CHECK-NEXT: zip1 z21.b, z21.b, z20.b
-; CHECK-NEXT: zip1 z19.b, z30.b, z29.b
-; CHECK-NEXT: zip1 z20.b, z8.b, z31.b
+; CHECK-NEXT: zip1 z18.b, z21.b, z20.b
+; CHECK-NEXT: zip1 z20.b, z24.b, z23.b
+; CHECK-NEXT: zip1 z23.b, z30.b, z29.b
+; CHECK-NEXT: zip1 z24.b, z8.b, z31.b
; CHECK-NEXT: mov z29.b, z1.b[15]
; CHECK-NEXT: mov z30.b, z1.b[13]
; CHECK-NEXT: mov z31.b, z1.b[11]
; CHECK-NEXT: mov z8.b, z1.b[9]
-; CHECK-NEXT: zip1 z22.b, z2.b, z22.b
-; CHECK-NEXT: zip1 z23.b, z24.b, z23.b
-; CHECK-NEXT: zip1 z24.b, z26.b, z25.b
-; CHECK-NEXT: zip1 z18.b, z28.b, z27.b
+; CHECK-NEXT: zip1 z19.b, z2.b, z22.b
+; CHECK-NEXT: zip1 z21.b, z26.b, z25.b
+; CHECK-NEXT: zip1 z22.b, z28.b, z27.b
; CHECK-NEXT: mov z25.b, z2.b[11]
; CHECK-NEXT: mov z26.b, z2.b[9]
; CHECK-NEXT: mov z27.b, z2.b[7]
@@ -1538,25 +1538,25 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
; CHECK-NEXT: zip1 z25.b, z26.b, z25.b
; CHECK-NEXT: zip1 z26.b, z28.b, z27.b
; CHECK-NEXT: zip1 z2.b, z2.b, z8.b
-; CHECK-NEXT: zip1 z21.h, z22.h, z21.h
-; CHECK-NEXT: zip1 z22.h, z24.h, z23.h
-; CHECK-NEXT: zip1 z23.h, z31.h, z29.h
+; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
+; CHECK-NEXT: zip1 z19.h, z21.h, z20.h
+; CHECK-NEXT: zip1 z20.h, z31.h, z29.h
; CHECK-NEXT: zip1 z1.h, z1.h, z9.h
; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: zip1 z24.h, z10.h, z11.h
+; CHECK-NEXT: zip1 z21.h, z10.h, z11.h
; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: zip1 z0.h, z30.h, z0.h
-; CHECK-NEXT: zip1 z18.h, z19.h, z18.h
-; CHECK-NEXT: zip1 z19.h, z25.h, z20.h
+; CHECK-NEXT: zip1 z22.h, z23.h, z22.h
+; CHECK-NEXT: zip1 z23.h, z25.h, z24.h
; CHECK-NEXT: zip1 z2.h, z2.h, z26.h
; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
; CHECK-NEXT: zip1 z4.s, z6.s, z5.s
; CHECK-NEXT: zip1 z5.s, z16.s, z7.s
-; CHECK-NEXT: zip1 z1.s, z1.s, z23.s
-; CHECK-NEXT: zip1 z6.s, z21.s, z17.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z24.s
-; CHECK-NEXT: zip1 z7.s, z18.s, z22.s
-; CHECK-NEXT: zip1 z2.s, z2.s, z19.s
+; CHECK-NEXT: zip1 z1.s, z1.s, z20.s
+; CHECK-NEXT: zip1 z6.s, z18.s, z17.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z21.s
+; CHECK-NEXT: zip1 z7.s, z22.s, z19.s
+; CHECK-NEXT: zip1 z2.s, z2.s, z23.s
; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: zip1 z1.d, z6.d, z5.d
@@ -1752,67 +1752,67 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset b8, -16
-; CHECK-NEXT: ldp q1, q6, [x0]
-; CHECK-NEXT: ldp q0, q2, [x1]
-; CHECK-NEXT: mov z3.h, z6.h[6]
-; CHECK-NEXT: mov z4.h, z6.h[4]
-; CHECK-NEXT: mov z5.h, z6.h[2]
+; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q5, q6, [x1]
+; CHECK-NEXT: mov z2.h, z0.h[6]
+; CHECK-NEXT: mov z3.h, z0.h[4]
+; CHECK-NEXT: mov z4.h, z0.h[2]
; CHECK-NEXT: mov z7.h, z1.h[6]
; CHECK-NEXT: mov z16.h, z1.h[4]
; CHECK-NEXT: mov z17.h, z1.h[2]
-; CHECK-NEXT: mov z18.h, z2.h[6]
-; CHECK-NEXT: mov z19.h, z2.h[4]
-; CHECK-NEXT: mov z20.h, z2.h[2]
-; CHECK-NEXT: mov z21.h, z0.h[6]
-; CHECK-NEXT: mov z22.h, z0.h[4]
-; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
-; CHECK-NEXT: zip1 z4.h, z6.h, z5.h
-; CHECK-NEXT: zip1 z5.h, z16.h, z7.h
+; CHECK-NEXT: mov z18.h, z6.h[6]
+; CHECK-NEXT: mov z19.h, z6.h[4]
+; CHECK-NEXT: mov z20.h, z6.h[2]
+; CHECK-NEXT: mov z21.h, z5.h[6]
+; CHECK-NEXT: mov z22.h, z5.h[4]
+; CHECK-NEXT: zip1 z2.h, z3.h, z2.h
+; CHECK-NEXT: zip1 z3.h, z0.h, z4.h
+; CHECK-NEXT: zip1 z4.h, z16.h, z7.h
; CHECK-NEXT: zip1 z7.h, z1.h, z17.h
; CHECK-NEXT: zip1 z16.h, z19.h, z18.h
-; CHECK-NEXT: zip1 z18.h, z2.h, z20.h
-; CHECK-NEXT: mov z19.h, z0.h[2]
-; CHECK-NEXT: zip1 z17.h, z22.h, z21.h
-; CHECK-NEXT: mov z20.h, z6.h[7]
-; CHECK-NEXT: mov z21.h, z6.h[5]
-; CHECK-NEXT: mov z22.h, z6.h[3]
-; CHECK-NEXT: mov z6.h, z6.h[1]
+; CHECK-NEXT: zip1 z17.h, z6.h, z20.h
+; CHECK-NEXT: mov z19.h, z5.h[2]
+; CHECK-NEXT: zip1 z18.h, z22.h, z21.h
+; CHECK-NEXT: mov z20.h, z0.h[7]
+; CHECK-NEXT: mov z21.h, z0.h[5]
+; CHECK-NEXT: mov z22.h, z0.h[3]
+; CHECK-NEXT: mov z0.h, z0.h[1]
; CHECK-NEXT: mov z23.h, z1.h[7]
; CHECK-NEXT: mov z24.h, z1.h[5]
; CHECK-NEXT: mov z25.h, z1.h[3]
; CHECK-NEXT: mov z1.h, z1.h[1]
-; CHECK-NEXT: mov z26.h, z2.h[7]
-; CHECK-NEXT: mov z27.h, z2.h[5]
-; CHECK-NEXT: mov z28.h, z2.h[3]
-; CHECK-NEXT: mov z2.h, z2.h[1]
-; CHECK-NEXT: mov z29.h, z0.h[7]
-; CHECK-NEXT: mov z30.h, z0.h[5]
-; CHECK-NEXT: mov z31.h, z0.h[3]
-; CHECK-NEXT: mov z8.h, z0.h[1]
-; CHECK-NEXT: zip1 z0.h, z0.h, z19.h
+; CHECK-NEXT: mov z26.h, z6.h[7]
+; CHECK-NEXT: mov z27.h, z6.h[5]
+; CHECK-NEXT: mov z28.h, z6.h[3]
+; CHECK-NEXT: mov z6.h, z6.h[1]
+; CHECK-NEXT: mov z29.h, z5.h[7]
+; CHECK-NEXT: mov z30.h, z5.h[5]
+; CHECK-NEXT: mov z31.h, z5.h[3]
+; CHECK-NEXT: mov z8.h, z5.h[1]
+; CHECK-NEXT: zip1 z5.h, z5.h, z19.h
; CHECK-NEXT: zip1 z19.h, z21.h, z20.h
-; CHECK-NEXT: zip1 z6.h, z6.h, z22.h
+; CHECK-NEXT: zip1 z0.h, z0.h, z22.h
; CHECK-NEXT: zip1 z20.h, z24.h, z23.h
; CHECK-NEXT: zip1 z1.h, z1.h, z25.h
; CHECK-NEXT: zip1 z21.h, z27.h, z26.h
-; CHECK-NEXT: zip1 z2.h, z2.h, z28.h
+; CHECK-NEXT: zip1 z6.h, z6.h, z28.h
; CHECK-NEXT: zip1 z22.h, z30.h, z29.h
; CHECK-NEXT: zip1 z23.h, z8.h, z31.h
-; CHECK-NEXT: zip1 z3.s, z4.s, z3.s
-; CHECK-NEXT: zip1 z4.s, z7.s, z5.s
-; CHECK-NEXT: zip1 z5.s, z18.s, z16.s
-; CHECK-NEXT: zip1 z6.s, z6.s, z19.s
+; CHECK-NEXT: zip1 z2.s, z3.s, z2.s
+; CHECK-NEXT: zip1 z3.s, z7.s, z4.s
+; CHECK-NEXT: zip1 z4.s, z17.s, z16.s
+; CHECK-NEXT: zip1 z0.s, z0.s, z19.s
; CHECK-NEXT: zip1 z1.s, z1.s, z20.s
-; CHECK-NEXT: zip1 z0.s, z0.s, z17.s
-; CHECK-NEXT: zip1 z2.s, z2.s, z21.s
+; CHECK-NEXT: zip1 z5.s, z5.s, z18.s
+; CHECK-NEXT: zip1 z6.s, z6.s, z21.s
; CHECK-NEXT: zip1 z7.s, z23.s, z22.s
-; CHECK-NEXT: zip1 z3.d, z4.d, z3.d
-; CHECK-NEXT: zip1 z1.d, z1.d, z6.d
-; CHECK-NEXT: zip1 z0.d, z0.d, z5.d
-; CHECK-NEXT: zip1 z2.d, z7.d, z2.d
-; CHECK-NEXT: add z1.h, z3.h, z1.h
-; CHECK-NEXT: add z0.h, z0.h, z2.h
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT: zip1 z0.d, z1.d, z0.d
+; CHECK-NEXT: zip1 z1.d, z5.d, z4.d
+; CHECK-NEXT: zip1 z3.d, z7.d, z6.d
+; CHECK-NEXT: add z0.h, z2.h, z0.h
+; CHECK-NEXT: add z1.h, z1.h, z3.h
+; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 0ad9900865518..b5d64112db727 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -346,7 +346,6 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: .LBB2_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48
-; CHECK-NEXT: add x13, x0, #8
; CHECK-NEXT: subs x12, x12, #4
; CHECK-NEXT: fcmgt v5.4s, v2.4s, v0.4s
; CHECK-NEXT: fcmgt v6.4s, v3.4s, v0.4s
@@ -367,8 +366,10 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: xtn v6.4h, v4.4s
; CHECK-NEXT: xtn v7.4h, v2.4s
; CHECK-NEXT: tbl v2.16b, { v5.16b, v6.16b, v7.16b }, v1.16b
-; CHECK-NEXT: st1 { v2.s }[2], [x13]
-; CHECK-NEXT: str d2, [x0], #12
+; CHECK-NEXT: mov s3, v2.s[2]
+; CHECK-NEXT: str d2, [x0]
+; CHECK-NEXT: str s3, [x0, #8]
+; CHECK-NEXT: add x0, x0, #12
; CHECK-NEXT: b.ne .LBB2_4
; CHECK-NEXT: // %bb.5: // %middle.block
; CHECK-NEXT: cmp x11, x10
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 184e8fff154b9..fd23f3da18cd7 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -706,11 +706,10 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q4, q0, [x0, #48]
-; CHECK-NEXT: add x9, x1, #10
-; CHECK-NEXT: ldr d1, [x0, #80]
+; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldr d1, [x0, #80]
; CHECK-NEXT: ldr q5, [x0, #32]
-; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: add x0, x0, #128
; CHECK-NEXT: uzp1.4s v0, v0, v1
; CHECK-NEXT: uzp1.4s v1, v5, v4
@@ -719,10 +718,12 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: uzp1.8h v1, v2, v1
; CHECK-NEXT: uzp1.8b v2, v0, v0
; CHECK-NEXT: uzp1.16b v0, v1, v0
-; CHECK-NEXT: st1.b { v2 }[2], [x9]
-; CHECK-NEXT: add x9, x1, #8
-; CHECK-NEXT: st1.h { v0 }[4], [x9]
-; CHECK-NEXT: str d0, [x1], #16
+; CHECK-NEXT: mov b1, v2[2]
+; CHECK-NEXT: mov h2, v0[4]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: stur b1, [x1, #10]
+; CHECK-NEXT: str h2, [x1, #8]
+; CHECK-NEXT: add x1, x1, #16
; CHECK-NEXT: b.eq LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -742,11 +743,10 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: ld1 { v2.2d }, [x9]
; CHECK-BE-NEXT: ldr d5, [x0, #80]
; CHECK-BE-NEXT: ld1 { v4.2d }, [x10]
-; CHECK-BE-NEXT: add x9, x1, #10
; CHECK-BE-NEXT: subs x8, x8, #1
+; CHECK-BE-NEXT: add x0, x0, #128
; CHECK-BE-NEXT: uzp1 v1.4s, v3.4s, v1.4s
; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v5.4s
-; CHECK-BE-NEXT: add x0, x0, #128
; CHECK-BE-NEXT: uzp1 v2.4s, v4.4s, v2.4s
; CHECK-BE-NEXT: xtn v0.4h, v0.4s
; CHECK-BE-NEXT: uzp1 v1.8h, v1.8h, v2.8h
@@ -754,10 +754,12 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-BE-NEXT: rev16 v2.16b, v1.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
-; CHECK-BE-NEXT: st1 { v0.b }[2], [x9]
-; CHECK-BE-NEXT: add x9, x1, #8
-; CHECK-BE-NEXT: st1 { v2.h }[4], [x9]
-; CHECK-BE-NEXT: str d1, [x1], #16
+; CHECK-BE-NEXT: mov b0, v0.b[2]
+; CHECK-BE-NEXT: mov h2, v2.h[4]
+; CHECK-BE-NEXT: str d1, [x1]
+; CHECK-BE-NEXT: stur b0, [x1, #10]
+; CHECK-BE-NEXT: str h2, [x1, #8]
+; CHECK-BE-NEXT: add x1, x1, #16
; CHECK-BE-NEXT: b.eq .LBB6_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
@@ -777,11 +779,10 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: ld1 { v2.2d }, [x9]
; CHECK-DISABLE-NEXT: ldr d5, [x0, #80]
; CHECK-DISABLE-NEXT: ld1 { v4.2d }, [x10]
-; CHECK-DISABLE-NEXT: add x9, x1, #10
; CHECK-DISABLE-NEXT: subs x8, x8, #1
+; CHECK-DISABLE-NEXT: add x0, x0, #128
; CHECK-DISABLE-NEXT: uzp1 v1.4s, v3.4s, v1.4s
; CHECK-DISABLE-NEXT: uzp1 v0.4s, v0.4s, v5.4s
-; CHECK-DISABLE-NEXT: add x0, x0, #128
; CHECK-DISABLE-NEXT: uzp1 v2.4s, v4.4s, v2.4s
; CHECK-DISABLE-NEXT: xtn v0.4h, v0.4s
; CHECK-DISABLE-NEXT: uzp1 v1.8h, v1.8h, v2.8h
@@ -789,10 +790,12 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b
; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b
-; CHECK-DISABLE-NEXT: st1 { v0.b }[2], [x9]
-; CHECK-DISABLE-NEXT: add x9, x1, #8
-; CHECK-DISABLE-NEXT: st1 { v2.h }[4], [x9]
-; CHECK-DISABLE-NEXT: str d1, [x1], #16
+; CHECK-DISABLE-NEXT: mov b0, v0.b[2]
+; CHECK-DISABLE-NEXT: mov h2, v2.h[4]
+; CHECK-DISABLE-NEXT: str d1, [x1]
+; CHECK-DISABLE-NEXT: stur b0, [x1, #10]
+; CHECK-DISABLE-NEXT: str h2, [x1, #8]
+; CHECK-DISABLE-NEXT: add x1, x1, #16
; CHECK-DISABLE-NEXT: b.eq .LBB6_1
; CHECK-DISABLE-NEXT: // %bb.2: // %exit
; CHECK-DISABLE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index d0173307bd830..14a578fa317d0 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -198,10 +198,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x2]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -324,7 +323,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: uqadd v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index dc3ebfb0682ca..ddb3332abf5d0 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -197,10 +197,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v0.s[1], w10
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: mov s1, v0.s[1]
+; CHECK-SD-NEXT: str b0, [x2]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -321,7 +320,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: uqsub v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index dd7a9c6d7768b..3c42079dc8d8a 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -35,8 +35,7 @@ define void @store_8_elements(<8 x i16> %vec, ptr %out) {
; CHECK-NEXT: ldr q1, [x8, lCPI1_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addv.8h h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
@@ -56,8 +55,7 @@ define void @store_4_elements(<4 x i32> %vec, ptr %out) {
; CHECK-NEXT: ldr q1, [x8, lCPI2_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
@@ -77,8 +75,7 @@ define void @store_2_elements(<2 x i64> %vec, ptr %out) {
; CHECK-NEXT: ldr q1, [x8, lCPI3_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addp.2d d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
@@ -99,8 +96,7 @@ define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
; CHECK-NEXT: cmlt.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9
@@ -121,8 +117,7 @@ define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) {
; CHECK-NEXT: cmlt.4h v0, v0, #0
; CHECK-NEXT: and.8b v0, v0, v1
; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh11
@@ -141,7 +136,7 @@ define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
; CHECK-NEXT: ldr d1, [x8, lCPI6_0 at PAGEOFF]
; CHECK-NEXT: bic.8b v0, v1, v0
; CHECK-NEXT: addv.8b b0, v0
-; CHECK-NEXT: st1.b { v0 }[0], [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13
@@ -161,8 +156,7 @@ define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
; CHECK-NEXT: ldr d1, [x8, lCPI7_0 at PAGEOFF]
; CHECK-NEXT: bic.8b v0, v1, v0
; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh15
@@ -182,8 +176,7 @@ define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
; CHECK-NEXT: ldr d1, [x8, lCPI8_0 at PAGEOFF]
; CHECK-NEXT: bic.8b v0, v1, v0
; CHECK-NEXT: addp.2s v0, v0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh17
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 45b7a2759b0b3..7d3f5bc270d6b 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -355,14 +355,14 @@ define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) {
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: store_trunc_from_64bits:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: add x8, x0, #4
+; CHECK-NEXT: ld1r.4h { v0 }, [x8]
; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: add x9, x0, #4
-; CHECK-NEXT: ld1r.4h { v0 }, [x9]
; CHECK-NEXT: lsr w9, w8, #16
; CHECK-NEXT: strb w8, [x1]
-; CHECK-NEXT: add x8, x1, #2
+; CHECK-NEXT: mov b0, v0[4]
; CHECK-NEXT: strb w9, [x1, #1]
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: stur b0, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: store_trunc_from_64bits:
@@ -397,13 +397,13 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI11_0 at PAGE
; CHECK-NEXT: Lloh1:
; CHECK-NEXT: ldr d1, [x8, lCPI11_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x1, #1
; CHECK-NEXT: ld1.h { v0 }[2], [x9]
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: add.4h v0, v0, v1
-; CHECK-NEXT: st1.b { v0 }[2], [x8]
-; CHECK-NEXT: st1.b { v0 }[4], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: mov b1, v0[2]
+; CHECK-NEXT: mov b2, v0[4]
+; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
;
@@ -420,12 +420,12 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v1.4h }, [x8]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -443,11 +443,11 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: mov h1, v0[2]
; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: str h1, [x1, #4]
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits:
@@ -461,11 +461,11 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: ld1 { v0.b }[4], [x8]
-; BE-NEXT: add x8, x1, #4
; BE-NEXT: bic v0.4h, #255, lsl #8
; BE-NEXT: rev32 v1.8h, v0.8h
-; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: str s1, [x1]
+; BE-NEXT: str h0, [x1, #4]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -479,23 +479,23 @@ define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits_default_align:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: mov h1, v0[2]
; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: str h1, [x1, #4]
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits_default_align:
; BE: // %bb.0: // %entry
; BE-NEXT: ldr s0, [x0]
-; BE-NEXT: add x8, x1, #4
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: bic v0.4h, #255, lsl #8
; BE-NEXT: rev32 v1.8h, v0.8h
-; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: str s1, [x1]
+; BE-NEXT: str h0, [x1, #4]
; BE-NEXT: ret
entry:
%l = load <3 x i8>, ptr %src
@@ -508,23 +508,23 @@ define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits_align_4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: mov h1, v0[2]
; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: str h1, [x1, #4]
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits_align_4:
; BE: // %bb.0: // %entry
; BE-NEXT: ldr s0, [x0]
-; BE-NEXT: add x8, x1, #4
; BE-NEXT: rev32 v0.8b, v0.8b
; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: bic v0.4h, #255, lsl #8
; BE-NEXT: rev32 v1.8h, v0.8h
-; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: str s1, [x1]
+; BE-NEXT: str h0, [x1, #4]
; BE-NEXT: ret
entry:
%l = load <3 x i8>, ptr %src, align 4
@@ -542,14 +542,14 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI15_0 at PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr d1, [x8, lCPI15_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: add.4h v0, v0, v1
-; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: mov h1, v0[2]
; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: str h1, [x1, #4]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
;
@@ -567,12 +567,12 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; BE-NEXT: adrp x8, .LCPI15_0
; BE-NEXT: add x8, x8, :lo12:.LCPI15_0
; BE-NEXT: ld1 { v1.4h }, [x8]
-; BE-NEXT: add x8, x1, #4
; BE-NEXT: bic v0.4h, #255, lsl #8
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: rev32 v1.8h, v0.8h
-; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: str s1, [x1]
+; BE-NEXT: str h0, [x1, #4]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -587,12 +587,12 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: mov b1, v0[4]
+; CHECK-NEXT: mov b2, v0[8]
+; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store:
@@ -602,12 +602,12 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -621,12 +621,12 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_default_align:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: mov b1, v0[4]
+; CHECK-NEXT: mov b2, v0[8]
+; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_default_align:
@@ -636,12 +636,12 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -655,12 +655,12 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_align_4:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: mov b1, v0[4]
+; CHECK-NEXT: mov b2, v0[8]
+; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_align_4:
@@ -670,12 +670,12 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -689,13 +689,12 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_const_offset_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #2
-; CHECK-NEXT: add x9, x1, #3
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: mov b1, v0[4]
+; CHECK-NEXT: mov b2, v0[8]
+; CHECK-NEXT: stur b0, [x1, #1]
+; CHECK-NEXT: stur b1, [x1, #2]
+; CHECK-NEXT: stur b2, [x1, #3]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_const_offset_1:
@@ -705,12 +704,12 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #3]
-; BE-NEXT: sturh w9, [x1, #1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #3]
+; BE-NEXT: sturh w8, [x1, #1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -725,13 +724,12 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_const_offset_3:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: add x9, x1, #5
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: add x8, x1, #3
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: mov b1, v0[4]
+; CHECK-NEXT: mov b2, v0[8]
+; CHECK-NEXT: stur b0, [x1, #3]
+; CHECK-NEXT: stur b1, [x1, #4]
+; CHECK-NEXT: stur b2, [x1, #5]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_const_offset_3:
@@ -741,12 +739,12 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #5]
-; BE-NEXT: sturh w9, [x1, #3]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #5]
+; BE-NEXT: sturh w8, [x1, #3]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -765,11 +763,11 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: shrn.4h v0, v0, #16
; CHECK-NEXT: uzp1.8b v1, v0, v0
-; CHECK-NEXT: umov.h w8, v0[2]
+; CHECK-NEXT: mov h0, v0[2]
; CHECK-NEXT: str s1, [sp, #12]
-; CHECK-NEXT: ldrh w9, [sp, #12]
-; CHECK-NEXT: strb w8, [x1, #2]
-; CHECK-NEXT: strh w9, [x1]
+; CHECK-NEXT: ldrh w8, [sp, #12]
+; CHECK-NEXT: stur b0, [x1, #2]
+; CHECK-NEXT: strh w8, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
@@ -780,12 +778,12 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -804,15 +802,15 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-NEXT: adrp x8, lCPI22_0 at PAGE
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q1, [x8, lCPI22_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
-; CHECK-NEXT: st1.b { v0 }[8], [x8]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: st1.b { v0 }[0], [x0]
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: mov b1, v0[8]
+; CHECK-NEXT: mov b2, v0[4]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: stur b1, [x0, #2]
+; CHECK-NEXT: stur b2, [x0, #1]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
;
@@ -832,12 +830,12 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w9, [sp, #8]
-; BE-NEXT: strb w8, [x0, #2]
-; BE-NEXT: strh w9, [x0]
+; BE-NEXT: ldrh w8, [sp, #8]
+; BE-NEXT: stur b0, [x0, #2]
+; BE-NEXT: strh w8, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
@@ -857,15 +855,15 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-NEXT: adrp x8, lCPI23_0 at PAGE
; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q1, [x8, lCPI23_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
-; CHECK-NEXT: st1.b { v0 }[8], [x8]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: st1.b { v0 }[0], [x0]
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: mov b1, v0[8]
+; CHECK-NEXT: mov b2, v0[4]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: stur b1, [x0, #2]
+; CHECK-NEXT: stur b2, [x0, #1]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
;
@@ -885,12 +883,12 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w9, [sp, #8]
-; BE-NEXT: strb w8, [x0, #2]
-; BE-NEXT: strh w9, [x0]
+; BE-NEXT: ldrh w8, [sp, #8]
+; BE-NEXT: stur b0, [x0, #2]
+; BE-NEXT: strh w8, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 37c6374215d81..b29195eed9149 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -50,10 +50,10 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; CHECK-LABEL: uaddo_v3i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: mov s2, v1.s[2]
; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: st1 { v1.s }[2], [x8]
; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: str s2, [x0, #8]
; CHECK-NEXT: ret
%t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
%val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -212,26 +212,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: bic v1.4s, #255, lsl #24
; CHECK-NEXT: bic v0.4s, #255, lsl #24
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: mov v1.16b, v0.16b
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[1]
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: bic v1.4s, #1, lsl #24
+; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: mov w10, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: str h1, [x0]
+; CHECK-NEXT: bic v0.4s, #1, lsl #24
; CHECK-NEXT: sturh w8, [x0, #9]
; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: strh w9, [x0, #6]
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: sturh w10, [x0, #3]
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT: strb w8, [x0, #11]
; CHECK-NEXT: lsr w8, w10, #16
-; CHECK-NEXT: strb w9, [x0, #8]
-; CHECK-NEXT: lsr w9, w11, #16
-; CHECK-NEXT: sturh w10, [x0, #3]
-; CHECK-NEXT: mvn v0.16b, v1.16b
-; CHECK-NEXT: strh w11, [x0]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: strh w9, [x0, #6]
+; CHECK-NEXT: lsr w9, w9, #16
; CHECK-NEXT: strb w8, [x0, #5]
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: strb w9, [x0, #8]
+; CHECK-NEXT: lsr w9, w10, #16
; CHECK-NEXT: strb w9, [x0, #2]
; CHECK-NEXT: ret
%t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
@@ -249,15 +249,14 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: shl v1.4h, v2.4h, #15
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: addv h1, v1.4h
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b1, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index 3a481efd9785a..12ea8862a03cd 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -55,13 +55,12 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s
-; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: st1 { v1.s }[2], [x8]
; CHECK-NEXT: str d1, [x0]
-; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: cmtst v0.4s, v2.4s, v2.4s
+; CHECK-NEXT: mov s2, v1.s[2]
+; CHECK-NEXT: str s2, [x0, #8]
; CHECK-NEXT: ret
%t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
%val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -260,27 +259,27 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; CHECK-NEXT: bic v0.4s, #255, lsl #24
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: uzp2 v1.4s, v3.4s, v2.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #24
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[1]
-; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov w8, v1.s[3]
+; CHECK-NEXT: uzp2 v0.4s, v3.4s, v2.4s
+; CHECK-NEXT: ushr v2.4s, v1.4s, #24
+; CHECK-NEXT: mov w10, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: str h1, [x0]
; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, #0
; CHECK-NEXT: sturh w8, [x0, #9]
; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: strh w9, [x0, #6]
-; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: sturh w10, [x0, #3]
; CHECK-NEXT: strb w8, [x0, #11]
; CHECK-NEXT: lsr w8, w10, #16
-; CHECK-NEXT: orn v0.16b, v2.16b, v1.16b
-; CHECK-NEXT: strb w9, [x0, #8]
-; CHECK-NEXT: lsr w9, w11, #16
-; CHECK-NEXT: sturh w10, [x0, #3]
-; CHECK-NEXT: strh w11, [x0]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: strh w9, [x0, #6]
+; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: orn v0.16b, v2.16b, v0.16b
; CHECK-NEXT: strb w8, [x0, #5]
+; CHECK-NEXT: strb w9, [x0, #8]
+; CHECK-NEXT: lsr w9, w10, #16
; CHECK-NEXT: strb w9, [x0, #2]
; CHECK-NEXT: ret
%t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
@@ -299,11 +298,10 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv h1, v0.4h
+; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: addv h1, v1.4h
+; CHECK-NEXT: str b1, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 710ea70d678c5..a580913d40d95 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -109,7 +109,7 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
; CHECK-NEXT: shl.16b v1, v1, #7
; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: str b0, [sp]
; CHECK-NEXT: mov x13, sp
; CHECK-NEXT: cmlt.16b v1, v1, #0
; CHECK-NEXT: umov.b w9, v1[0]
@@ -209,44 +209,44 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) {
; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: umov.b w9, v2[0]
; CHECK-NEXT: umov.b w10, v2[1]
-; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: umov.b w11, v2[2]
+; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: umov.b w12, v2[2]
; CHECK-NEXT: umov.b w13, v2[3]
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: umov.b w14, v2[4]
+; CHECK-NEXT: umov.b w15, v2[4]
; CHECK-NEXT: str s0, [sp]
+; CHECK-NEXT: mov s3, v0[3]
; CHECK-NEXT: and x10, x10, #0x1
-; CHECK-NEXT: and x15, x9, #0x1
-; CHECK-NEXT: bfi x12, x9, #2, #1
-; CHECK-NEXT: and x9, x11, #0x1
-; CHECK-NEXT: add x10, x15, x10
-; CHECK-NEXT: umov.b w11, v2[5]
-; CHECK-NEXT: add x9, x10, x9
-; CHECK-NEXT: orr x15, x8, x10, lsl #2
-; CHECK-NEXT: umov.b w10, v2[6]
-; CHECK-NEXT: st1.s { v0 }[1], [x12]
-; CHECK-NEXT: add x12, x8, x9, lsl #2
-; CHECK-NEXT: and x13, x13, #0x1
-; CHECK-NEXT: st1.s { v0 }[2], [x15]
-; CHECK-NEXT: add x9, x9, x13
-; CHECK-NEXT: st1.s { v0 }[3], [x12]
-; CHECK-NEXT: and x12, x14, #0x1
-; CHECK-NEXT: and x11, x11, #0x1
+; CHECK-NEXT: and x14, x9, #0x1
+; CHECK-NEXT: bfi x11, x9, #2, #1
+; CHECK-NEXT: add x9, x14, x10
+; CHECK-NEXT: umov.b w10, v2[5]
+; CHECK-NEXT: st1.s { v0 }[1], [x11]
+; CHECK-NEXT: and x11, x12, #0x1
+; CHECK-NEXT: orr x14, x8, x9, lsl #2
+; CHECK-NEXT: and x12, x13, #0x1
+; CHECK-NEXT: add x9, x9, x11
+; CHECK-NEXT: umov.b w11, v2[6]
+; CHECK-NEXT: and x13, x15, #0x1
; CHECK-NEXT: add x12, x9, x12
-; CHECK-NEXT: and w10, w10, #0x1
-; CHECK-NEXT: and x9, x9, #0x7
-; CHECK-NEXT: add x11, x12, x11
+; CHECK-NEXT: st1.s { v0 }[2], [x14]
+; CHECK-NEXT: str s3, [x8, x9, lsl #2]
+; CHECK-NEXT: and x9, x10, #0x1
+; CHECK-NEXT: add x10, x12, x13
; CHECK-NEXT: and x12, x12, #0x7
-; CHECK-NEXT: str s1, [x8, x9, lsl #2]
-; CHECK-NEXT: add w10, w11, w10
-; CHECK-NEXT: and x11, x11, #0x7
-; CHECK-NEXT: add x12, x8, x12, lsl #2
+; CHECK-NEXT: add x9, x10, x9
; CHECK-NEXT: and x10, x10, #0x7
-; CHECK-NEXT: add x9, x8, x11, lsl #2
-; CHECK-NEXT: add x8, x8, x10, lsl #2
-; CHECK-NEXT: st1.s { v1 }[1], [x12]
-; CHECK-NEXT: st1.s { v1 }[2], [x9]
-; CHECK-NEXT: st1.s { v1 }[3], [x8]
+; CHECK-NEXT: str s1, [x8, x12, lsl #2]
+; CHECK-NEXT: and x12, x9, #0x7
+; CHECK-NEXT: mov s0, v1[3]
+; CHECK-NEXT: and w11, w11, #0x1
+; CHECK-NEXT: add x10, x8, x10, lsl #2
+; CHECK-NEXT: add x12, x8, x12, lsl #2
+; CHECK-NEXT: add w9, w9, w11
+; CHECK-NEXT: and x9, x9, #0x7
+; CHECK-NEXT: st1.s { v1 }[1], [x10]
+; CHECK-NEXT: st1.s { v1 }[2], [x12]
+; CHECK-NEXT: str s0, [x8, x9, lsl #2]
; CHECK-NEXT: ldp q0, q1, [sp], #32
; CHECK-NEXT: ret
%out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef)
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 6536f0c355b47..eb83aa5a13e52 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1570,36 +1570,36 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) {
; CHECK-NEXT: LBB16_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d0, [x0, x8]
-; CHECK-NEXT: add x9, x1, #112
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: str xzr, [x1, #120]
+; CHECK-NEXT: str xzr, [x1, #104]
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: str xzr, [x1, #104]
; CHECK-NEXT: str xzr, [x1, #88]
; CHECK-NEXT: str xzr, [x1, #72]
+; CHECK-NEXT: str xzr, [x1, #56]
; CHECK-NEXT: ushll2.4s v1, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: str xzr, [x1, #56]
; CHECK-NEXT: str xzr, [x1, #40]
; CHECK-NEXT: str xzr, [x1, #24]
+; CHECK-NEXT: str xzr, [x1, #8]
; CHECK-NEXT: ushll2.2d v2, v1, #0
; CHECK-NEXT: ushll.2d v1, v1, #0
; CHECK-NEXT: ushll2.2d v3, v0, #0
; CHECK-NEXT: ushll.2d v0, v0, #0
-; CHECK-NEXT: str xzr, [x1, #8]
-; CHECK-NEXT: st1.d { v2 }[1], [x9]
-; CHECK-NEXT: add x9, x1, #80
-; CHECK-NEXT: st1.d { v1 }[1], [x9]
-; CHECK-NEXT: add x9, x1, #48
; CHECK-NEXT: str d2, [x1, #96]
-; CHECK-NEXT: st1.d { v3 }[1], [x9]
-; CHECK-NEXT: add x9, x1, #16
+; CHECK-NEXT: mov d2, v2[1]
; CHECK-NEXT: str d1, [x1, #64]
+; CHECK-NEXT: mov d1, v1[1]
; CHECK-NEXT: str d3, [x1, #32]
+; CHECK-NEXT: mov d3, v3[1]
; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: mov d0, v0[1]
+; CHECK-NEXT: str d2, [x1, #112]
+; CHECK-NEXT: str d1, [x1, #80]
+; CHECK-NEXT: str d3, [x1, #48]
+; CHECK-NEXT: str d0, [x1, #16]
; CHECK-NEXT: add x1, x1, #256
-; CHECK-NEXT: st1.d { v0 }[1], [x9]
; CHECK-NEXT: b.ne LBB16_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -1612,10 +1612,9 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) {
; CHECK-BE-NEXT: add x9, x0, x8
; CHECK-BE-NEXT: add x8, x8, #16
; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
-; CHECK-BE-NEXT: add x9, x1, #120
; CHECK-BE-NEXT: str xzr, [x1, #112]
-; CHECK-BE-NEXT: str xzr, [x1, #96]
; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: str xzr, [x1, #96]
; CHECK-BE-NEXT: str xzr, [x1, #80]
; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT: str xzr, [x1, #64]
@@ -1629,18 +1628,19 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) {
; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-BE-NEXT: st1 { v2.d }[1], [x9]
-; CHECK-BE-NEXT: add x9, x1, #88
-; CHECK-BE-NEXT: st1 { v1.d }[1], [x9]
-; CHECK-BE-NEXT: add x9, x1, #56
; CHECK-BE-NEXT: str d2, [x1, #104]
-; CHECK-BE-NEXT: st1 { v3.d }[1], [x9]
-; CHECK-BE-NEXT: add x9, x1, #24
+; CHECK-BE-NEXT: mov d2, v2.d[1]
; CHECK-BE-NEXT: str d1, [x1, #72]
+; CHECK-BE-NEXT: mov d1, v1.d[1]
; CHECK-BE-NEXT: str d3, [x1, #40]
+; CHECK-BE-NEXT: mov d3, v3.d[1]
; CHECK-BE-NEXT: str d0, [x1, #8]
+; CHECK-BE-NEXT: mov d0, v0.d[1]
+; CHECK-BE-NEXT: str d2, [x1, #120]
+; CHECK-BE-NEXT: str d1, [x1, #88]
+; CHECK-BE-NEXT: str d3, [x1, #56]
+; CHECK-BE-NEXT: str d0, [x1, #24]
; CHECK-BE-NEXT: add x1, x1, #256
-; CHECK-BE-NEXT: st1 { v0.d }[1], [x9]
; CHECK-BE-NEXT: b.ne .LBB16_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
@@ -2031,58 +2031,7 @@ exit:
ret void
}
-; CHECK-LABEL: lCPI20_0:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 1 ; 0x1
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 2 ; 0x2
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT:lCPI20_1:
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 7 ; 0x7
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 8 ; 0x8
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 9 ; 0x9
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 10 ; 0xa
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT:lCPI20_2:
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 11 ; 0xb
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 12 ; 0xc
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 13 ; 0xd
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 14 ; 0xe
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 15 ; 0xf
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT:lCPI20_3:
+; CHECK-LABEL: lCPI20_0:
; CHECK-NEXT: .byte 0 ; 0x0
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
@@ -2099,6 +2048,57 @@ exit:
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: lCPI20_1:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 1 ; 0x1
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 2 ; 0x2
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 3 ; 0x3
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 4 ; 0x4
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 5 ; 0x5
+; CHECK-NEXT: lCPI20_2:
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 6 ; 0x6
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 7 ; 0x7
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 9 ; 0x9
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 10 ; 0xa
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: lCPI20_3:
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 11 ; 0xb
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 13 ; 0xd
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 14 ; 0xe
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 15 ; 0xf
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-BE-LABEL: .LCPI20_0:
; CHECK-BE-NEXT: .byte 255 // 0xff
@@ -2193,18 +2193,18 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8
; CHECK-NEXT: add x8, x8, #16
-; CHECK-NEXT: ldp q5, q4, [x9]
-; CHECK-NEXT: add x9, x1, #56
+; CHECK-NEXT: ldp q4, q5, [x9]
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: tbl.16b v4, { v4 }, v3
-; CHECK-NEXT: tbl.16b v6, { v5 }, v2
-; CHECK-NEXT: tbl.16b v7, { v5 }, v1
; CHECK-NEXT: tbl.16b v5, { v5 }, v0
+; CHECK-NEXT: tbl.16b v6, { v4 }, v3
+; CHECK-NEXT: tbl.16b v7, { v4 }, v2
+; CHECK-NEXT: tbl.16b v4, { v4 }, v1
; CHECK-NEXT: stp q7, q6, [x1, #16]
-; CHECK-NEXT: str q5, [x1]
-; CHECK-NEXT: str d4, [x1, #48]
+; CHECK-NEXT: mov s6, v5[2]
+; CHECK-NEXT: str q4, [x1]
+; CHECK-NEXT: str d5, [x1, #48]
+; CHECK-NEXT: str s6, [x1, #56]
; CHECK-NEXT: add x1, x1, #64
-; CHECK-NEXT: st1.s { v4 }[2], [x9]
; CHECK-NEXT: b.ne LBB20_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -2239,19 +2239,19 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
; CHECK-BE-NEXT: ld1 { v4.16b }, [x10]
; CHECK-BE-NEXT: cmp x8, #128
; CHECK-BE-NEXT: tbl v6.16b, { v5.16b }, v3.16b
-; CHECK-BE-NEXT: tbl v7.16b, { v5.16b }, v2.16b
+; CHECK-BE-NEXT: tbl v16.16b, { v5.16b }, v2.16b
; CHECK-BE-NEXT: tbl v5.16b, { v5.16b }, v1.16b
; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT: st1 { v6.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, #16
-; CHECK-BE-NEXT: rev32 v16.16b, v4.16b
+; CHECK-BE-NEXT: rev32 v7.16b, v4.16b
; CHECK-BE-NEXT: rev64 v4.16b, v4.16b
-; CHECK-BE-NEXT: st1 { v7.16b }, [x9]
-; CHECK-BE-NEXT: add x9, x1, #56
; CHECK-BE-NEXT: st1 { v5.16b }, [x1]
+; CHECK-BE-NEXT: st1 { v16.16b }, [x9]
+; CHECK-BE-NEXT: mov s6, v7.s[2]
; CHECK-BE-NEXT: str d4, [x1, #48]
+; CHECK-BE-NEXT: str s6, [x1, #56]
; CHECK-BE-NEXT: add x1, x1, #64
-; CHECK-BE-NEXT: st1 { v16.s }[2], [x9]
; CHECK-BE-NEXT: b.ne .LBB20_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
@@ -2592,36 +2592,36 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
; CHECK-BE-NEXT: ld1 { v7.16b }, [x9]
; CHECK-BE-NEXT: add x9, x9, #16
; CHECK-BE-NEXT: cmp x8, #128
-; CHECK-BE-NEXT: ld1 { v17.16b }, [x9]
+; CHECK-BE-NEXT: ld1 { v16.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, #80
-; CHECK-BE-NEXT: tbl v16.16b, { v7.16b }, v6.16b
+; CHECK-BE-NEXT: tbl v17.16b, { v7.16b }, v6.16b
; CHECK-BE-NEXT: tbl v18.16b, { v7.16b }, v5.16b
-; CHECK-BE-NEXT: tbl v19.16b, { v7.16b }, v4.16b
-; CHECK-BE-NEXT: tbl v20.16b, { v7.16b }, v3.16b
-; CHECK-BE-NEXT: tbl v21.16b, { v17.16b }, v0.16b
-; CHECK-BE-NEXT: st1 { v16.16b }, [x9]
+; CHECK-BE-NEXT: tbl v20.16b, { v7.16b }, v4.16b
+; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v0.16b
+; CHECK-BE-NEXT: tbl v21.16b, { v7.16b }, v3.16b
+; CHECK-BE-NEXT: st1 { v17.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, #64
-; CHECK-BE-NEXT: tbl v16.16b, { v7.16b }, v2.16b
+; CHECK-BE-NEXT: tbl v17.16b, { v7.16b }, v2.16b
; CHECK-BE-NEXT: st1 { v18.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, #48
-; CHECK-BE-NEXT: tbl v18.16b, { v17.16b }, v2.16b
-; CHECK-BE-NEXT: st1 { v19.16b }, [x9]
-; CHECK-BE-NEXT: add x9, x1, #32
-; CHECK-BE-NEXT: tbl v17.16b, { v17.16b }, v1.16b
+; CHECK-BE-NEXT: rev16 v18.16b, v19.16b
; CHECK-BE-NEXT: st1 { v20.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #32
+; CHECK-BE-NEXT: tbl v20.16b, { v16.16b }, v2.16b
+; CHECK-BE-NEXT: st1 { v21.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, #16
-; CHECK-BE-NEXT: rev64 v19.16b, v21.16b
-; CHECK-BE-NEXT: st1 { v16.16b }, [x9]
-; CHECK-BE-NEXT: rev16 v16.16b, v21.16b
+; CHECK-BE-NEXT: tbl v16.16b, { v16.16b }, v1.16b
+; CHECK-BE-NEXT: st1 { v17.16b }, [x9]
+; CHECK-BE-NEXT: rev64 v17.16b, v19.16b
; CHECK-BE-NEXT: add x9, x1, #112
-; CHECK-BE-NEXT: st1 { v18.16b }, [x9]
-; CHECK-BE-NEXT: add x9, x1, #96
; CHECK-BE-NEXT: tbl v7.16b, { v7.16b }, v1.16b
-; CHECK-BE-NEXT: st1 { v17.16b }, [x9]
-; CHECK-BE-NEXT: add x9, x1, #136
-; CHECK-BE-NEXT: st1 { v16.h }[4], [x9]
-; CHECK-BE-NEXT: fmov x9, d19
+; CHECK-BE-NEXT: mov h18, v18.h[4]
+; CHECK-BE-NEXT: st1 { v20.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #96
+; CHECK-BE-NEXT: st1 { v16.16b }, [x9]
+; CHECK-BE-NEXT: fmov x9, d17
; CHECK-BE-NEXT: st1 { v7.16b }, [x1]
+; CHECK-BE-NEXT: str h18, [x1, #136]
; CHECK-BE-NEXT: str x9, [x1, #128]!
; CHECK-BE-NEXT: b.ne .LBB21_1
; CHECK-BE-NEXT: // %bb.2: // %exit
More information about the llvm-commits
mailing list