[llvm] [AArch64] Optimize extending loads of small vectors (PR #163064)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 2 07:52:46 PST 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/163064
From 5eb00c06f66b7d1e5c088442d04a99f710dccd13 Mon Sep 17 00:00:00 2001
From: Guy David <guyda at apple.com>
Date: Sun, 2 Nov 2025 16:34:50 +0200
Subject: [PATCH] [AArch64] Optimize extending loads of small vectors
Reduces the total number of loads and the number of moves between SIMD
registers and general-purpose registers.
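As an illustration, the new zext_v2i16_v2i32 test in
llvm/test/CodeGen/AArch64/aarch64-load-ext.ll:

    define <2 x i32> @zext_v2i16_v2i32(ptr %a) {
      %x = load <2 x i16>, ptr %a
      %y = zext <2 x i16> %x to <2 x i32>
      ret <2 x i32> %y
    }

Patterns like this previously lowered to per-element ldrh loads followed by
fmov/mov transfers into the vector register (see the removed checks in e.g.
ctpop.ll and uadd_sat_vec.ll); with this change the little-endian codegen is
a single FP load plus a widening shift:

    ldr   s0, [x0]
    ushll v0.4s, v0.4h, #0
    ret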
---
.../Target/AArch64/AArch64ISelLowering.cpp | 156 +++++++++---
llvm/test/CodeGen/AArch64/aarch64-load-ext.ll | 226 +++++++++++++++---
llvm/test/CodeGen/AArch64/aarch64-smull.ll | 12 +-
llvm/test/CodeGen/AArch64/add.ll | 27 +--
llvm/test/CodeGen/AArch64/andorxor.ll | 81 +++----
llvm/test/CodeGen/AArch64/bitcast.ll | 6 +-
llvm/test/CodeGen/AArch64/ctlz.ll | 18 +-
llvm/test/CodeGen/AArch64/ctpop.ll | 18 +-
llvm/test/CodeGen/AArch64/cttz.ll | 16 +-
llvm/test/CodeGen/AArch64/extbinopload.ll | 26 +-
llvm/test/CodeGen/AArch64/load.ll | 11 +-
llvm/test/CodeGen/AArch64/mul.ll | 27 +--
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 26 +-
llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll | 8 +-
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 26 +-
llvm/test/CodeGen/AArch64/store.ll | 7 +-
llvm/test/CodeGen/AArch64/sub.ll | 27 +--
.../AArch64/sve-fixed-length-masked-gather.ll | 13 +-
.../sve-fixed-length-masked-scatter.ll | 13 +-
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 26 +-
llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 26 +-
llvm/test/CodeGen/AArch64/v3f-to-int.ll | 15 +-
.../AArch64/vec-combine-compare-to-bitmask.ll | 7 +-
.../AArch64/vec3-loads-ext-trunc-stores.ll | 73 +++---
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 12 +-
25 files changed, 539 insertions(+), 364 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..c3198df9bd168 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1438,12 +1438,22 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+ // Marked as Legal so the DAGCombiner will fold [zs]ext loads. These are
+ // later decomposed into scalar loads in `performSmallVectorLoadExtCombine`.
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
// ADDP custom lowering
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -6744,8 +6754,35 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
}
+/// Helper function to check if a small vector load can be optimized.
+static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
+ const AArch64Subtarget &Subtarget) {
+ if (!Subtarget.isNeonAvailable())
+ return false;
+ if (LD->isVolatile())
+ return false;
+
+ EVT MemVT = LD->getMemoryVT();
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16 &&
+ MemVT != MVT::v4i16)
+ return false;
+
+ Align Alignment = LD->getAlign();
+ Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
+ if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
+ return false;
+
+ return true;
+}
+
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT ExtVT = ExtVal.getValueType();
+ // Small, illegal vectors can be extended inreg.
+ if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
+ if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
+ isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
+ return true;
+ }
if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
return false;
@@ -7228,37 +7265,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
return DAG.getMergeValues({Loaded, Chain}, DL);
}
- // Custom lowering for extending v4i8 vector loads.
- EVT VT = Op->getValueType(0);
- assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
-
- if (LoadNode->getMemoryVT() != MVT::v4i8)
- return SDValue();
-
- // Avoid generating unaligned loads.
- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
- return SDValue();
-
- unsigned ExtType;
- if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
- ExtType = ISD::SIGN_EXTEND;
- else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
- LoadNode->getExtensionType() == ISD::EXTLOAD)
- ExtType = ISD::ZERO_EXTEND;
- else
- return SDValue();
-
- SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
- LoadNode->getBasePtr(), MachinePointerInfo());
- SDValue Chain = Load.getValue(1);
- SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
- SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
- SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
- Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
- DAG.getConstant(0, DL, MVT::i64));
- if (VT == MVT::v4i32)
- Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
- return DAG.getMergeValues({Ext, Chain}, DL);
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
@@ -23300,6 +23307,78 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
}
+/// Helper function to optimize loads of extended small vectors.
+/// These patterns would otherwise get scalarized into inefficient sequences.
+static SDValue performSmallVectorLoadExtCombine(LoadSDNode *Load,
+ SelectionDAG &DAG) {
+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
+ return SDValue();
+
+ EVT MemVT = Load->getMemoryVT();
+ EVT ResVT = Load->getValueType(0);
+ unsigned NumElts = ResVT.getVectorNumElements();
+ unsigned DstEltBits = ResVT.getScalarSizeInBits();
+ unsigned SrcEltBits = MemVT.getScalarSizeInBits();
+
+ unsigned ExtOpcode;
+ switch (Load->getExtensionType()) {
+ case ISD::EXTLOAD:
+ case ISD::ZEXTLOAD:
+ ExtOpcode = ISD::ZERO_EXTEND;
+ break;
+ case ISD::SEXTLOAD:
+ ExtOpcode = ISD::SIGN_EXTEND;
+ break;
+ case ISD::NON_EXTLOAD:
+ return SDValue();
+ }
+
+ SDLoc DL(Load);
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
+ Align Alignment = Load->getAlign();
+
+ // Load the data as an FP scalar to avoid issues with integer loads.
+ unsigned LoadBits = MemVT.getStoreSizeInBits();
+ MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
+ SDValue ScalarLoad =
+ DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
+
+ MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
+ SDValue ScalarToVec =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
+ MVT BitcastTy =
+ MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
+ SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
+
+ SDValue Res = Bitcast;
+ unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
+ unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
+ while (CurrentEltBits < DstEltBits) {
+ if (Res.getValueSizeInBits() >= 128) {
+ CurrentNumElts = CurrentNumElts / 2;
+ MVT ExtractVT =
+ MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ }
+ CurrentEltBits = CurrentEltBits * 2;
+ MVT ExtVT =
+ MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
+ Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
+ }
+
+ if (CurrentNumElts != NumElts) {
+ MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ }
+
+ return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -24426,6 +24505,9 @@ static SDValue performLOADCombine(SDNode *N,
}
}
+ if (SDValue Result = performSmallVectorLoadExtCombine(LD, DAG))
+ return Result;
+
if (LD->isVolatile() || !Subtarget->isLittleEndian())
return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index 317feb5ad9ad0..2ebc482e54f78 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) {
define <2 x i16> @test1(ptr %v2i16_ptr) {
; CHECK-LE-LABEL: test1:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #2
-; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test1:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #2
-; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i16 = load <2 x i16>, ptr %v2i16_ptr
@@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) {
define <2 x i8> @test3(ptr %v2i8_ptr) {
; CHECK-LE-LABEL: test3:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #1
-; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test3:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #1
-; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i8 = load <2 x i8>, ptr %v2i8_ptr
@@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) {
define <2 x i32> @fsext_v2i32(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i32:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i32:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) {
define <2 x i16> @fsext_v2i16(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i16:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i16:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -497,3 +495,175 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric
%v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1
ret <4 x i8> %v4i8
}
+
+define <2 x i16> @zext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @zext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @zext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i64> @sext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @sext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 0cd885e599817..2cd54d4113542 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,21 +222,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEON-NEXT: ldr h0, [x0]
+; CHECK-NEON-NEXT: ldr s0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: mov v0.d[1], x8
-; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
-; CHECK-SVE-NEXT: ldr h0, [x0]
+; CHECK-SVE-NEXT: ldr s0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: mov v0.d[1], x8
-; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
+; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index 96168cb80196f..7502db4c5aa93 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -56,13 +56,11 @@ entry:
define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: stur b1, [x0, #1]
@@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -228,13 +225,9 @@ entry:
define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #2]
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index a7875dbebd0e6..d8d003c85eed6 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -176,12 +176,12 @@ entry:
define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: and_v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
@@ -212,12 +212,12 @@ entry:
define void @or_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: or_v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
@@ -248,12 +248,12 @@ entry:
define void @xor_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: xor_v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
@@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -698,12 +695,10 @@ entry:
define void @and_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: and_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
@@ -734,12 +729,10 @@ entry:
define void @or_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: or_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
@@ -770,12 +763,10 @@ entry:
define void @xor_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: xor_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 20f19fddf790a..002e6cd509bec 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: add x8, sp, #12
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: str s0, [sp, #12]
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8]
-; CHECK-SD-NEXT: orr x8, x8, #0x2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 04124609eec74..b1b869ec9e1ff 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -6,11 +6,10 @@
define void @v2i8(ptr %p1) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x0, #1]
+; CHECK-SD-NEXT: ldr h1, [x0]
; CHECK-SD-NEXT: movi v0.2s, #24
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: mov v1.s[1], w9
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
@@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -145,11 +143,9 @@ entry:
define void @v2i16(ptr %p1) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x0, #2]
+; CHECK-SD-NEXT: ldr s1, [x0]
; CHECK-SD-NEXT: movi v0.2s, #16
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: mov v1.s[1], w9
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index d547b6bec5b83..9c59f1b233b5d 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -6,10 +6,9 @@
define void @v2i8(ptr %p1) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x0, #1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: mov v0.s[1], w9
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
@@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -143,10 +141,8 @@ entry:
define void @v2i16(ptr %p1) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x0, #2]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: mov v0.s[1], w9
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index fc9bf2c0aca65..c9181b4c312d1 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -6,10 +6,10 @@
define void @v2i8(ptr %p1) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #1
+; CHECK-SD-NEXT: ldr h0, [x0]
; CHECK-SD-NEXT: movi v1.2s, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8
; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b
@@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -219,10 +218,9 @@ entry:
define void @v2i16(ptr %p1) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: movi v1.2s, #1
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16
; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index cabb0e7278e40..d18cff51c6101 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) {
define <2 x i16> @std_v2i8_v2i16(ptr %p) {
; CHECK-LABEL: std_v2i8_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrb w9, [x0, #3]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: ldr h0, [x0, #2]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.2s, v0.2s, #3
-; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%l1 = load <2 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 2
@@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) {
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x0, #4]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ldr s0, [x0, #4]
+; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #3
-; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #3
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%l1b = load volatile float, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c4bb6e37d6eaf..b138fa4085427 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) {
define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) {
; CHECK-SD-LABEL: load_v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
@@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) {
define <2 x i16> @load_v2i16(ptr %ptr) {
; CHECK-SD-LABEL: load_v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 9c69a6f03b858..475bd22c6ebcb 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -68,13 +68,11 @@ entry:
define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: stur b1, [x0, #1]
@@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -240,13 +237,9 @@ entry:
define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #2]
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 1c4a504d0ab70..9e321bbecb80b 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -115,8 +115,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
; CHECK-SD-NEXT: sqadd v0.4h, v0.4h, v1.4h
@@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
@@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
index 3e708b0678fbc..297b25ed075e4 100644
--- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
@@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) {
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB3_1: // %loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #1
-; CHECK-NEXT: ldrsb w10, [x9]
-; CHECK-NEXT: ldrsb w9, [x9, #1]
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ldr h0, [x0, x8, lsl #1]
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: scvtf v0.2d, v0.2d
; CHECK-NEXT: str q0, [x1, x8, lsl #4]
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 3af858713525b..a30e9045c6a20 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -115,8 +115,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
; CHECK-SD-NEXT: sqsub v0.4h, v0.4h, v1.4h
@@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
@@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 3a9f12b838702..1dc55fccc3dac 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: strb w2, [x3, #2]
; CHECK-SD-NEXT: mov v0.h[1], w1
; CHECK-SD-NEXT: mov v0.h[2], w2
; CHECK-SD-NEXT: xtn v0.8b, v0.8h
-; CHECK-SD-NEXT: str s0, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
-; CHECK-SD-NEXT: strb w2, [x3, #2]
-; CHECK-SD-NEXT: strh w8, [x3]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: str h0, [x3]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 5e278d59b6591..dd920b98e18eb 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -56,13 +56,11 @@ entry:
define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
-; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: stur b1, [x0, #1]
@@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -228,13 +225,9 @@ entry:
define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #2]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 6fd5b820a2242..b457e0307fbe1 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu"
define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x0, #1]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
@@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index ed03f9b322432..4fb3bf7392d4e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu"
define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x0, #1]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
@@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 3cfb24aaccb11..cd02d18e61643 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x1]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff
-; CHECK-SD-NEXT: ldrb w10, [x0, #1]
-; CHECK-SD-NEXT: ldrb w11, [x1, #1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
@@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x1]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff
-; CHECK-SD-NEXT: ldrh w10, [x0, #2]
-; CHECK-SD-NEXT: ldrh w11, [x1, #2]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x2]
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index a71cf95a728db..ef70137e6deee 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x1]
-; CHECK-SD-NEXT: ldrb w10, [x0, #1]
-; CHECK-SD-NEXT: ldrb w11, [x1, #1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
@@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x1]
-; CHECK-SD-NEXT: ldrh w10, [x0, #2]
-; CHECK-SD-NEXT: ldrh w11, [x1, #2]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x2]
diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll
index f6553b6acec9d..6d4061fb02cff 100644
--- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll
@@ -1,9 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
-; CHECK-LABEL: convert_v3f32
-; CHECK: strb
-; CHECK: strh
define void @convert_v3f32() {
+; CHECK-LABEL: convert_v3f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: str wzr, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: strb wzr, [x8]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: str h0, [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
entry:
br label %bb
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 80029fb717575..ee74984125f77 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
; CHECK-SD-NEXT: shl.16b v0, v0, #7
; CHECK-SD-NEXT: adrp x8, lCPI20_0 at PAGE
; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0 at PAGEOFF]
-; CHECK-SD-NEXT: add x8, sp, #14
; CHECK-SD-NEXT: cmlt.16b v0, v0, #0
; CHECK-SD-NEXT: and.16b v0, v0, v1
; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8
; CHECK-SD-NEXT: zip1.16b v0, v0, v1
; CHECK-SD-NEXT: addv.8h h0, v0
-; CHECK-SD-NEXT: str h0, [sp, #14]
-; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8]
-; CHECK-SD-NEXT: orr x8, x8, #0x1
-; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-SD-NEXT: ushll.8h v0, v0, #0
+; CHECK-SD-NEXT: ushll.4s v0, v0, #0
; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 7d3f5bc270d6b..a5a26c185fdb5 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -338,7 +338,7 @@ define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: shl v0.4s, v0.4s, #24
@@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: ldr s0, [x0]
; BE-NEXT: ldrh w8, [x0, #4]
; BE-NEXT: rev32 v0.4h, v0.4h
+; BE-NEXT: strb w8, [x1, #2]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; BE-NEXT: rev32 v0.16b, v0.16b
-; BE-NEXT: str s0, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: rev32 v0.4h, v0.4h
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: str h0, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -459,7 +459,7 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: bic v0.4h, #255, lsl #8
; BE-NEXT: rev32 v1.8h, v0.8h
@@ -562,7 +562,7 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: adrp x8, .LCPI15_0
; BE-NEXT: add x8, x8, :lo12:.LCPI15_0
@@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -638,10 +638,10 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #3]
-; BE-NEXT: sturh w8, [x1, #1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: stur h1, [x1, #1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #5]
-; BE-NEXT: sturh w8, [x1, #3]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: stur h1, [x1, #3]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -764,10 +764,9 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; CHECK-NEXT: shrn.4h v0, v0, #16
; CHECK-NEXT: uzp1.8b v1, v0, v0
; CHECK-NEXT: mov h0, v0[2]
-; CHECK-NEXT: str s1, [sp, #12]
-; CHECK-NEXT: ldrh w8, [sp, #12]
+; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: stur b0, [x1, #2]
-; CHECK-NEXT: strh w8, [x1]
+; CHECK-NEXT: str h1, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
@@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w8, [sp, #8]
; BE-NEXT: stur b0, [x0, #2]
-; BE-NEXT: strh w8, [x0]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
@@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w8, [sp, #8]
; BE-NEXT: stur b0, [x0, #2]
-; BE-NEXT: strh w8, [x0]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 74a717f1635a3..7cba0d608cd4f 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1012,18 +1012,16 @@ define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) {
;
; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: adrp x8, .LCPI11_0
-; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI11_0
-; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT: mov x8, xzr
; CHECK-BE-NEXT: .LBB11_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldr s1, [x0, x8]
+; CHECK-BE-NEXT: ldr s0, [x0, x8]
; CHECK-BE-NEXT: add x8, x8, #16
; CHECK-BE-NEXT: cmp x8, #128
-; CHECK-BE-NEXT: rev32 v1.16b, v1.16b
-; CHECK-BE-NEXT: tbl v1.16b, { v1.16b }, v0.16b
-; CHECK-BE-NEXT: st1 { v1.16b }, [x1]
+; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v0.4s }, [x1]
; CHECK-BE-NEXT: add x1, x1, #64
; CHECK-BE-NEXT: b.ne .LBB11_1
; CHECK-BE-NEXT: // %bb.2: // %exit