[llvm] [AArch64] Scalarize extracted vector loads. (PR #159714)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 04:35:24 PDT 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/159714
From 6dd360723f5de673dbeeba305f0545871312f574 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 19 Sep 2025 08:36:39 +0100
Subject: [PATCH 1/2] [AArch64] Scalarize extracted vector loads.
Given a vector load that is only extracted from, it is more efficient to
perform the individual scalar loads than a single vector load followed by
many extracts. This adds a late optimization for scalarizing extracted
vector loads that do not have any other uses and will not be more
efficiently kept in vector registers.
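
For illustration, the scalarize_v4i32 test updated below exercises the
simplest case. A sketch of the IR (abbreviated from the test file; the
value names here are illustrative):

  define i32 @scalarize_v4i32(ptr %p) {
    %wide.load = load <4 x i32>, ptr %p, align 4
    %l0 = extractelement <4 x i32> %wide.load, i32 0
    %l1 = extractelement <4 x i32> %wide.load, i32 1
    %l2 = extractelement <4 x i32> %wide.load, i32 2
    %l3 = extractelement <4 x i32> %wide.load, i32 3
    %s0 = add i32 %l0, %l1
    %s1 = add i32 %l2, %l3
    %r = add i32 %s0, %s1
    ret i32 %r
  }

This previously compiled to a vector load followed by lane moves across
the register banks:

  ldr q0, [x0]
  mov w8, v0.s[1]
  mov w9, v0.s[2]
  mov w10, v0.s[3]
  fmov w11, s0
  add w8, w11, w8
  add w9, w9, w10
  add w0, w8, w9

and now becomes plain integer loads, which can also be paired:

  ldp w9, w8, [x0]
  ldp w10, w11, [x0, #8]
  add w8, w9, w8
  add w9, w10, w11
  add w0, w8, w9

For extract(extend(load)) the extension is folded into the scalar load
where possible, so the scalarize_v4i32_sext test below now uses ldpsw
instead of sshll + addp.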
---
.../Target/AArch64/AArch64ISelLowering.cpp | 76 +++
.../CodeGen/AArch64/arm64-convert-v4f64.ll | 17 +-
.../AArch64/arm64-i16-subreg-extract.ll | 6 +-
.../test/CodeGen/AArch64/arm64-ldp-cluster.ll | 14 +-
.../test/CodeGen/AArch64/complex-int-to-fp.ll | 8 +-
.../CodeGen/AArch64/extract-vector-elt.ll | 14 +-
llvm/test/CodeGen/AArch64/itofp-bf16.ll | 570 +++++++++---------
.../AArch64/ragreedy-local-interval-cost.ll | 343 +++++------
.../CodeGen/AArch64/scalarize-vector-load.ll | 394 +++++-------
.../AArch64/sve-fixed-length-ext-loads.ll | 4 +-
.../AArch64/sve-fixed-length-masked-gather.ll | 3 +-
llvm/test/CodeGen/AArch64/vector-compress.ll | 7 +-
12 files changed, 708 insertions(+), 748 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cd7f0e719ad0c..f3b01baadb141 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20467,6 +20467,82 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
}
}
+  // Given an extract(load) or extract(extend(load)), produce a scalar load
+  // instead to avoid cross-register-bank copies.
+ if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
+ VT.isInteger() && isa<ConstantSDNode>(N1) &&
+ !N0.getValueType().isScalableVector()) {
+ SDValue LoadN0 = N0;
+ // Look through sext/zext and extract_subvector / insert_subvector if
+ // required.
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND) &&
+ N0.getOperand(0).hasOneUse())
+ LoadN0 = N0.getOperand(0);
+ unsigned OffsetElts = 0;
+ if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ !LoadN0.getOperand(0).getValueType().isScalableVector()) {
+ OffsetElts = LoadN0.getConstantOperandVal(1);
+ LoadN0 = LoadN0.getOperand(0);
+ }
+ if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ LoadN0.getOperand(0).isUndef() &&
+ isNullConstant(LoadN0.getOperand(2)) &&
+ LoadN0.getOperand(1).hasOneUse())
+ LoadN0 = LoadN0.getOperand(1);
+
+    // Check that all the uses are valid and can be scalarized. We check that
+    // all the uses are extracts and that those extracts are not re-inserted
+    // into an operation best treated as a vector register.
+ auto Load = dyn_cast<LoadSDNode>(LoadN0);
+ if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
+ Load->getMemoryVT().isByteSized() &&
+ all_of(N0->uses(), [&](const SDUse &U) {
+ return U.getResNo() != N0.getResNo() ||
+ (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ !any_of(U.getUser()->uses(), [](const SDUse &U2) {
+ return U2.getUser()->getOpcode() ==
+ ISD::INSERT_VECTOR_ELT ||
+ U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
+ U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
+ }));
+ })) {
+
+ SDLoc DL(Load);
+ EVT ScalarVT = Load->getValueType(0).getScalarType();
+ if (ScalarVT.getSizeInBits() < 32)
+ ScalarVT = MVT::i32;
+
+ // Generate a new scalar load.
+ unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
+ Load->getValueType(0).getScalarSizeInBits() / 8;
+ SDValue BasePtr = DAG.getObjectPtrOffset(
+ DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
+ ISD::LoadExtType ExtType =
+ N0.getOpcode() == ISD::ZERO_EXTEND
+ ? ISD::ZEXTLOAD
+ : (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD
+ : ISD::EXTLOAD);
+ SDValue ScalarLoad =
+ DAG.getExtLoad(ExtType, DL, ScalarVT, Load->getChain(), BasePtr,
+ Load->getPointerInfo().getWithOffset(Offset),
+ Load->getValueType(0).getScalarType(),
+ commonAlignment(Load->getAlign(), Offset),
+ Load->getMemOperand()->getFlags(), Load->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
+
+ // Extend back to the original type if we looked through an extend above.
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND) &&
+ ScalarVT.getScalarSizeInBits() < VT.getScalarSizeInBits())
+ ScalarLoad = DAG.getNode(N0.getOpcode(), DL, VT, ScalarLoad);
+
+ return ScalarLoad;
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 2b9e334cc7812..2b313fa8ce55f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -53,18 +53,15 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q2, [x0]
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: ucvtf s1, x9
-; CHECK-NEXT: mov x9, v2.d[1]
-; CHECK-NEXT: ucvtf s0, x8
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: ucvtf s2, x8
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: ucvtf s0, x9
+; CHECK-NEXT: ucvtf s1, x8
+; CHECK-NEXT: ldp x8, x9, [x0, #16]
; CHECK-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEXT: ucvtf s0, x8
+; CHECK-NEXT: mov v1.s[2], v0.s[0]
; CHECK-NEXT: ucvtf s0, x9
-; CHECK-NEXT: mov v1.s[2], v2.s[0]
-; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: mov v1.s[3], v0.s[0]
; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ushr v3.4s, v1.4s, #16
diff --git a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
index 59f887a1143c0..a93203793307a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
@@ -4,10 +4,8 @@
define i32 @foo(ptr %__a) nounwind {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: umov.h w8, v0[0]
-; CHECK-NEXT: umov.h w9, v0[0]
-; CHECK-NEXT: add w0, w9, w8, uxth #1
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: add w0, w8, w8, lsl #1
; CHECK-NEXT: ret
%tmp18 = load <4 x i16>, ptr %__a, align 8
%vget_lane = extractelement <4 x i16> %tmp18, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 114203e46f196..13093cb2204ce 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -105,13 +105,13 @@ define i32 @ldr_int_volatile(ptr %a) nounwind {
; CHECK: Cluster ld/st SU(1) - SU(3)
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui
-define <2 x i64> @ldq_cluster(ptr %p) {
- %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8
+define <4 x i32> @ldq_cluster(ptr %p) {
+ %tmp1 = load <4 x i32>, ptr %p, align 8
%add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
- %tmp2 = add nsw <2 x i64> %tmp1, %tmp1
- %tmp3 = load <2 x i64>, ptr %add.ptr2, align 8
- %res = mul nsw <2 x i64> %tmp2, %tmp3
- ret <2 x i64> %res
+ %tmp2 = add nsw <4 x i32> %tmp1, %tmp1
+ %tmp3 = load <4 x i32>, ptr %add.ptr2, align 8
+ %res = mul nsw <4 x i32> %tmp2, %tmp3
+ ret <4 x i32> %res
}
; CHECK: ********** MI Scheduling **********
@@ -215,7 +215,7 @@ exit:
; CHECK: ********** MI Scheduling **********
; CHECK: LDURXi_LDRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
-; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
+; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
;
define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
index baca159f9dd55..02dfaa19acc9d 100644
--- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -4,11 +4,9 @@
define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
; CHECK-LABEL: autogen_SD19655:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov.d x8, v0[1]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: scvtf s1, x9
-; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: scvtf s1, x8
; CHECK-NEXT: mov.s v1[1], v0[0]
; CHECK-NEXT: str d1, [x1]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 6ab703c08b837..121cc30692124 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -1114,16 +1114,10 @@ entry:
}
define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) {
-; CHECK-SD-LABEL: v3ext:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr d0, [sp]
-; CHECK-SD-NEXT: fmov x0, d0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: v3ext:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr x0, [sp]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: v3ext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr x0, [sp]
+; CHECK-NEXT: ret
entry:
%c = extractelement <3 x ptr> %x, i32 2
ret ptr %c
diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
index 42641693c4081..0d3ae559449a4 100644
--- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll
+++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
@@ -740,162 +740,151 @@ entry:
define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
; CHECK-LABEL: stofp_v32i64_v32bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov x10, d2
-; CHECK-NEXT: mov x9, v3.d[1]
-; CHECK-NEXT: mov x8, v2.d[1]
-; CHECK-NEXT: fmov x11, d3
-; CHECK-NEXT: fmov x12, d0
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: scvtf s2, x10
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: scvtf s19, x9
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: scvtf s16, x11
-; CHECK-NEXT: mov x11, v6.d[1]
-; CHECK-NEXT: scvtf s0, x12
-; CHECK-NEXT: scvtf s18, x8
-; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: ldp x8, x9, [sp, #32]
+; CHECK-NEXT: mov x13, v2.d[1]
+; CHECK-NEXT: ldp x10, x12, [sp, #96]
+; CHECK-NEXT: fmov x14, d3
+; CHECK-NEXT: movi v17.4s, #1
+; CHECK-NEXT: scvtf s18, x9
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: ldp x8, x9, [sp, #48]
+; CHECK-NEXT: scvtf s23, x12
; CHECK-NEXT: scvtf s20, x10
-; CHECK-NEXT: scvtf s17, x9
-; CHECK-NEXT: mov x9, v7.d[1]
-; CHECK-NEXT: mov x10, v4.d[1]
-; CHECK-NEXT: scvtf s21, x11
-; CHECK-NEXT: fmov x11, d6
-; CHECK-NEXT: mov v2.s[1], v18.s[0]
-; CHECK-NEXT: scvtf s25, x8
-; CHECK-NEXT: movi v6.4s, #127, msl #8
-; CHECK-NEXT: mov v0.s[1], v20.s[0]
-; CHECK-NEXT: ldp q24, q20, [sp, #32]
-; CHECK-NEXT: scvtf s22, x9
-; CHECK-NEXT: fmov x9, d4
-; CHECK-NEXT: scvtf s1, x11
-; CHECK-NEXT: scvtf s26, x10
-; CHECK-NEXT: fmov x11, d7
-; CHECK-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-NEXT: ldp q18, q16, [sp]
-; CHECK-NEXT: mov x8, v24.d[1]
-; CHECK-NEXT: scvtf s4, x9
-; CHECK-NEXT: fmov x9, d5
-; CHECK-NEXT: mov v0.s[2], v17.s[0]
-; CHECK-NEXT: mov v1.s[1], v21.s[0]
-; CHECK-NEXT: scvtf s23, x11
-; CHECK-NEXT: mov x11, v5.d[1]
-; CHECK-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: scvtf s27, x13
; CHECK-NEXT: scvtf s21, x8
-; CHECK-NEXT: mov x8, v20.d[1]
-; CHECK-NEXT: scvtf s17, x9
-; CHECK-NEXT: fmov x9, d24
-; CHECK-NEXT: mov v4.s[1], v26.s[0]
-; CHECK-NEXT: mov v0.s[3], v25.s[0]
-; CHECK-NEXT: ldp q26, q24, [sp, #96]
-; CHECK-NEXT: mov v1.s[2], v23.s[0]
-; CHECK-NEXT: ldp q25, q23, [sp, #64]
-; CHECK-NEXT: scvtf s7, x11
-; CHECK-NEXT: scvtf s27, x8
-; CHECK-NEXT: fmov x8, d18
-; CHECK-NEXT: scvtf s5, x9
-; CHECK-NEXT: mov x10, v26.d[1]
-; CHECK-NEXT: mov x9, v18.d[1]
-; CHECK-NEXT: fmov x11, d20
-; CHECK-NEXT: mov v4.s[2], v17.s[0]
-; CHECK-NEXT: mov v1.s[3], v22.s[0]
-; CHECK-NEXT: ushr v19.4s, v2.4s, #16
-; CHECK-NEXT: scvtf s17, x8
-; CHECK-NEXT: fmov x8, d26
-; CHECK-NEXT: add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT: ldp x8, x11, [sp]
+; CHECK-NEXT: mov v16.s[1], v18.s[0]
+; CHECK-NEXT: scvtf s24, x9
+; CHECK-NEXT: movi v18.4s, #127, msl #8
+; CHECK-NEXT: mov v20.s[1], v23.s[0]
; CHECK-NEXT: scvtf s22, x11
-; CHECK-NEXT: mov x11, v25.d[1]
-; CHECK-NEXT: mov v5.s[1], v21.s[0]
-; CHECK-NEXT: scvtf s28, x10
-; CHECK-NEXT: fmov x10, d16
-; CHECK-NEXT: scvtf s21, x9
-; CHECK-NEXT: fmov x9, d25
-; CHECK-NEXT: scvtf s18, x8
-; CHECK-NEXT: mov x8, v16.d[1]
-; CHECK-NEXT: mov v4.s[3], v7.s[0]
-; CHECK-NEXT: and v19.16b, v19.16b, v3.16b
-; CHECK-NEXT: scvtf s16, x10
-; CHECK-NEXT: fmov x10, d24
+; CHECK-NEXT: ldp x11, x12, [sp, #16]
+; CHECK-NEXT: scvtf s19, x8
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov v16.s[2], v21.s[0]
; CHECK-NEXT: scvtf s25, x11
-; CHECK-NEXT: scvtf s20, x9
-; CHECK-NEXT: mov x9, v24.d[1]
-; CHECK-NEXT: mov v17.s[1], v21.s[0]
-; CHECK-NEXT: fmov x11, d23
-; CHECK-NEXT: mov v18.s[1], v28.s[0]
-; CHECK-NEXT: scvtf s24, x8
-; CHECK-NEXT: scvtf s21, x10
-; CHECK-NEXT: mov x10, v23.d[1]
-; CHECK-NEXT: mov v5.s[2], v22.s[0]
-; CHECK-NEXT: ushr v22.4s, v1.4s, #16
-; CHECK-NEXT: ushr v28.4s, v0.4s, #16
+; CHECK-NEXT: ldp x9, x11, [sp, #112]
+; CHECK-NEXT: mov v19.s[1], v22.s[0]
+; CHECK-NEXT: scvtf s22, x12
+; CHECK-NEXT: scvtf s26, x9
+; CHECK-NEXT: ldp x9, x12, [sp, #64]
; CHECK-NEXT: scvtf s23, x11
-; CHECK-NEXT: mov v20.s[1], v25.s[0]
-; CHECK-NEXT: scvtf s25, x9
-; CHECK-NEXT: mov v17.s[2], v16.s[0]
-; CHECK-NEXT: add v16.4s, v19.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v4.4s, #16
-; CHECK-NEXT: mov v18.s[2], v21.s[0]
-; CHECK-NEXT: scvtf s7, x10
-; CHECK-NEXT: and v22.16b, v22.16b, v3.16b
-; CHECK-NEXT: mov v5.s[3], v27.s[0]
-; CHECK-NEXT: and v21.16b, v28.16b, v3.16b
-; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s
-; CHECK-NEXT: mov v20.s[2], v23.s[0]
-; CHECK-NEXT: add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT: mov v16.s[3], v24.s[0]
+; CHECK-NEXT: fmov x11, d2
+; CHECK-NEXT: scvtf s24, x12
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: ldp x12, x13, [sp, #80]
+; CHECK-NEXT: scvtf s21, x11
+; CHECK-NEXT: mov x11, v4.d[1]
+; CHECK-NEXT: mov v19.s[2], v25.s[0]
+; CHECK-NEXT: mov v20.s[2], v26.s[0]
+; CHECK-NEXT: ushr v25.4s, v16.4s, #16
+; CHECK-NEXT: scvtf s26, x14
+; CHECK-NEXT: scvtf s3, x12
+; CHECK-NEXT: mov v2.s[1], v24.s[0]
+; CHECK-NEXT: scvtf s24, x10
+; CHECK-NEXT: fmov x10, d6
+; CHECK-NEXT: fmov x12, d0
+; CHECK-NEXT: scvtf s6, x9
+; CHECK-NEXT: mov v21.s[1], v27.s[0]
+; CHECK-NEXT: scvtf s27, x11
+; CHECK-NEXT: fmov x11, d7
+; CHECK-NEXT: mov v19.s[3], v22.s[0]
+; CHECK-NEXT: mov v20.s[3], v23.s[0]
+; CHECK-NEXT: add v22.4s, v16.4s, v18.4s
+; CHECK-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-NEXT: scvtf s3, x10
+; CHECK-NEXT: fmov x10, d4
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: and v23.16b, v25.16b, v17.16b
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: fmov x12, d5
+; CHECK-NEXT: mov v21.s[2], v26.s[0]
+; CHECK-NEXT: scvtf s25, x13
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: add v26.4s, v20.4s, v18.4s
+; CHECK-NEXT: mov v3.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s6, x11
+; CHECK-NEXT: mov x11, v5.d[1]
+; CHECK-NEXT: scvtf s5, x8
+; CHECK-NEXT: mov v0.s[1], v24.s[0]
+; CHECK-NEXT: add v22.4s, v23.4s, v22.4s
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: scvtf s7, x12
+; CHECK-NEXT: mov v4.s[1], v27.s[0]
+; CHECK-NEXT: ushr v23.4s, v19.4s, #16
+; CHECK-NEXT: mov v2.s[3], v25.s[0]
+; CHECK-NEXT: mov v3.s[2], v6.s[0]
+; CHECK-NEXT: add v25.4s, v19.4s, v18.4s
+; CHECK-NEXT: ushr v24.4s, v20.4s, #16
+; CHECK-NEXT: mov v21.s[3], v5.s[0]
+; CHECK-NEXT: scvtf s5, x11
+; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s
+; CHECK-NEXT: scvtf s6, x10
+; CHECK-NEXT: and v23.16b, v23.16b, v17.16b
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v4.s[2], v7.s[0]
+; CHECK-NEXT: and v24.16b, v24.16b, v17.16b
+; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s
+; CHECK-NEXT: orr v16.4s, #64, lsl #16
+; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s
+; CHECK-NEXT: add v27.4s, v21.4s, v18.4s
+; CHECK-NEXT: orr v20.4s, #64, lsl #16
+; CHECK-NEXT: mov v3.s[3], v6.s[0]
+; CHECK-NEXT: add v6.4s, v23.4s, v25.4s
+; CHECK-NEXT: ushr v23.4s, v21.4s, #16
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-NEXT: ushr v1.4s, v2.4s, #16
+; CHECK-NEXT: add v24.4s, v24.4s, v26.4s
+; CHECK-NEXT: add v25.4s, v2.4s, v18.4s
+; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s
+; CHECK-NEXT: and v23.16b, v23.16b, v17.16b
+; CHECK-NEXT: orr v19.4s, #64, lsl #16
; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: mov v17.s[3], v24.s[0]
-; CHECK-NEXT: add v24.4s, v1.4s, v6.4s
-; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s
-; CHECK-NEXT: mov v18.s[3], v25.s[0]
-; CHECK-NEXT: add v25.4s, v4.4s, v6.4s
-; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b
-; CHECK-NEXT: mov v20.s[3], v7.s[0]
-; CHECK-NEXT: add v22.4s, v22.4s, v24.4s
-; CHECK-NEXT: add v7.4s, v21.4s, v23.4s
-; CHECK-NEXT: ushr v24.4s, v17.4s, #16
-; CHECK-NEXT: and v23.16b, v26.16b, v3.16b
-; CHECK-NEXT: ushr v26.4s, v5.4s, #16
-; CHECK-NEXT: ushr v28.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v17.4s, v6.4s
-; CHECK-NEXT: add v31.4s, v18.4s, v6.4s
-; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b
-; CHECK-NEXT: ushr v29.4s, v20.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v3.16b
-; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v3.16b
-; CHECK-NEXT: and v25.16b, v26.16b, v3.16b
-; CHECK-NEXT: add v26.4s, v5.4s, v6.4s
-; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
-; CHECK-NEXT: and v3.16b, v29.16b, v3.16b
-; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s
-; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s
-; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT: ushr v28.4s, v3.4s, #16
+; CHECK-NEXT: and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b
+; CHECK-NEXT: ushr v26.4s, v0.4s, #16
+; CHECK-NEXT: ushr v30.4s, v4.4s, #16
+; CHECK-NEXT: add v23.4s, v23.4s, v27.4s
+; CHECK-NEXT: bsl v5.16b, v6.16b, v19.16b
+; CHECK-NEXT: mov v6.16b, v29.16b
+; CHECK-NEXT: and v27.16b, v28.16b, v17.16b
+; CHECK-NEXT: add v28.4s, v3.4s, v18.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v25.4s
+; CHECK-NEXT: and v25.16b, v26.16b, v17.16b
+; CHECK-NEXT: add v26.4s, v0.4s, v18.4s
+; CHECK-NEXT: and v17.16b, v30.16b, v17.16b
+; CHECK-NEXT: add v18.4s, v4.4s, v18.4s
+; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s
+; CHECK-NEXT: orr v21.4s, #64, lsl #16
+; CHECK-NEXT: add v27.4s, v27.4s, v28.4s
+; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s
+; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v17.4s, v17.4s, v18.4s
+; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s
; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s
-; CHECK-NEXT: orr v5.4s, #64, lsl #16
-; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v20.4s, #64, lsl #16
-; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b
-; CHECK-NEXT: mov v7.16b, v30.16b
-; CHECK-NEXT: mov v16.16b, v31.16b
-; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b
-; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b
-; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b
-; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h
-; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h
-; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h
+; CHECK-NEXT: mov v16.16b, v30.16b
+; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b
+; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b
+; CHECK-NEXT: mov v19.16b, v28.16b
+; CHECK-NEXT: uzp2 v2.8h, v5.8h, v7.8h
+; CHECK-NEXT: bit v0.16b, v25.16b, v26.16b
+; CHECK-NEXT: bsl v16.16b, v23.16b, v21.16b
+; CHECK-NEXT: bit v4.16b, v17.16b, v18.16b
+; CHECK-NEXT: bsl v19.16b, v27.16b, v3.16b
+; CHECK-NEXT: uzp2 v3.8h, v1.8h, v6.8h
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v16.8h
+; CHECK-NEXT: uzp2 v1.8h, v4.8h, v19.8h
; CHECK-NEXT: ret
entry:
%c = sitofp <32 x i64> %a to <32 x bfloat>
@@ -905,162 +894,151 @@ entry:
define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) {
; CHECK-LABEL: utofp_v32i64_v32bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov x10, d2
-; CHECK-NEXT: mov x9, v3.d[1]
-; CHECK-NEXT: mov x8, v2.d[1]
-; CHECK-NEXT: fmov x11, d3
-; CHECK-NEXT: fmov x12, d0
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: ucvtf s2, x10
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: ucvtf s19, x9
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: ucvtf s16, x11
-; CHECK-NEXT: mov x11, v6.d[1]
-; CHECK-NEXT: ucvtf s0, x12
-; CHECK-NEXT: ucvtf s18, x8
-; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: ldp x8, x9, [sp, #32]
+; CHECK-NEXT: mov x13, v2.d[1]
+; CHECK-NEXT: ldp x10, x12, [sp, #96]
+; CHECK-NEXT: fmov x14, d3
+; CHECK-NEXT: movi v17.4s, #1
+; CHECK-NEXT: ucvtf s18, x9
+; CHECK-NEXT: ucvtf s16, x8
+; CHECK-NEXT: ldp x8, x9, [sp, #48]
+; CHECK-NEXT: ucvtf s23, x12
; CHECK-NEXT: ucvtf s20, x10
-; CHECK-NEXT: ucvtf s17, x9
-; CHECK-NEXT: mov x9, v7.d[1]
-; CHECK-NEXT: mov x10, v4.d[1]
-; CHECK-NEXT: ucvtf s21, x11
-; CHECK-NEXT: fmov x11, d6
-; CHECK-NEXT: mov v2.s[1], v18.s[0]
-; CHECK-NEXT: ucvtf s25, x8
-; CHECK-NEXT: movi v6.4s, #127, msl #8
-; CHECK-NEXT: mov v0.s[1], v20.s[0]
-; CHECK-NEXT: ldp q24, q20, [sp, #32]
-; CHECK-NEXT: ucvtf s22, x9
-; CHECK-NEXT: fmov x9, d4
-; CHECK-NEXT: ucvtf s1, x11
-; CHECK-NEXT: ucvtf s26, x10
-; CHECK-NEXT: fmov x11, d7
-; CHECK-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-NEXT: ldp q18, q16, [sp]
-; CHECK-NEXT: mov x8, v24.d[1]
-; CHECK-NEXT: ucvtf s4, x9
-; CHECK-NEXT: fmov x9, d5
-; CHECK-NEXT: mov v0.s[2], v17.s[0]
-; CHECK-NEXT: mov v1.s[1], v21.s[0]
-; CHECK-NEXT: ucvtf s23, x11
-; CHECK-NEXT: mov x11, v5.d[1]
-; CHECK-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: ucvtf s27, x13
; CHECK-NEXT: ucvtf s21, x8
-; CHECK-NEXT: mov x8, v20.d[1]
-; CHECK-NEXT: ucvtf s17, x9
-; CHECK-NEXT: fmov x9, d24
-; CHECK-NEXT: mov v4.s[1], v26.s[0]
-; CHECK-NEXT: mov v0.s[3], v25.s[0]
-; CHECK-NEXT: ldp q26, q24, [sp, #96]
-; CHECK-NEXT: mov v1.s[2], v23.s[0]
-; CHECK-NEXT: ldp q25, q23, [sp, #64]
-; CHECK-NEXT: ucvtf s7, x11
-; CHECK-NEXT: ucvtf s27, x8
-; CHECK-NEXT: fmov x8, d18
-; CHECK-NEXT: ucvtf s5, x9
-; CHECK-NEXT: mov x10, v26.d[1]
-; CHECK-NEXT: mov x9, v18.d[1]
-; CHECK-NEXT: fmov x11, d20
-; CHECK-NEXT: mov v4.s[2], v17.s[0]
-; CHECK-NEXT: mov v1.s[3], v22.s[0]
-; CHECK-NEXT: ushr v19.4s, v2.4s, #16
-; CHECK-NEXT: ucvtf s17, x8
-; CHECK-NEXT: fmov x8, d26
-; CHECK-NEXT: add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT: ldp x8, x11, [sp]
+; CHECK-NEXT: mov v16.s[1], v18.s[0]
+; CHECK-NEXT: ucvtf s24, x9
+; CHECK-NEXT: movi v18.4s, #127, msl #8
+; CHECK-NEXT: mov v20.s[1], v23.s[0]
; CHECK-NEXT: ucvtf s22, x11
-; CHECK-NEXT: mov x11, v25.d[1]
-; CHECK-NEXT: mov v5.s[1], v21.s[0]
-; CHECK-NEXT: ucvtf s28, x10
-; CHECK-NEXT: fmov x10, d16
-; CHECK-NEXT: ucvtf s21, x9
-; CHECK-NEXT: fmov x9, d25
-; CHECK-NEXT: ucvtf s18, x8
-; CHECK-NEXT: mov x8, v16.d[1]
-; CHECK-NEXT: mov v4.s[3], v7.s[0]
-; CHECK-NEXT: and v19.16b, v19.16b, v3.16b
-; CHECK-NEXT: ucvtf s16, x10
-; CHECK-NEXT: fmov x10, d24
+; CHECK-NEXT: ldp x11, x12, [sp, #16]
+; CHECK-NEXT: ucvtf s19, x8
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov v16.s[2], v21.s[0]
; CHECK-NEXT: ucvtf s25, x11
-; CHECK-NEXT: ucvtf s20, x9
-; CHECK-NEXT: mov x9, v24.d[1]
-; CHECK-NEXT: mov v17.s[1], v21.s[0]
-; CHECK-NEXT: fmov x11, d23
-; CHECK-NEXT: mov v18.s[1], v28.s[0]
-; CHECK-NEXT: ucvtf s24, x8
-; CHECK-NEXT: ucvtf s21, x10
-; CHECK-NEXT: mov x10, v23.d[1]
-; CHECK-NEXT: mov v5.s[2], v22.s[0]
-; CHECK-NEXT: ushr v22.4s, v1.4s, #16
-; CHECK-NEXT: ushr v28.4s, v0.4s, #16
+; CHECK-NEXT: ldp x9, x11, [sp, #112]
+; CHECK-NEXT: mov v19.s[1], v22.s[0]
+; CHECK-NEXT: ucvtf s22, x12
+; CHECK-NEXT: ucvtf s26, x9
+; CHECK-NEXT: ldp x9, x12, [sp, #64]
; CHECK-NEXT: ucvtf s23, x11
-; CHECK-NEXT: mov v20.s[1], v25.s[0]
-; CHECK-NEXT: ucvtf s25, x9
-; CHECK-NEXT: mov v17.s[2], v16.s[0]
-; CHECK-NEXT: add v16.4s, v19.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v4.4s, #16
-; CHECK-NEXT: mov v18.s[2], v21.s[0]
-; CHECK-NEXT: ucvtf s7, x10
-; CHECK-NEXT: and v22.16b, v22.16b, v3.16b
-; CHECK-NEXT: mov v5.s[3], v27.s[0]
-; CHECK-NEXT: and v21.16b, v28.16b, v3.16b
-; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s
-; CHECK-NEXT: mov v20.s[2], v23.s[0]
-; CHECK-NEXT: add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT: mov v16.s[3], v24.s[0]
+; CHECK-NEXT: fmov x11, d2
+; CHECK-NEXT: ucvtf s24, x12
+; CHECK-NEXT: ucvtf s2, x9
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: ldp x12, x13, [sp, #80]
+; CHECK-NEXT: ucvtf s21, x11
+; CHECK-NEXT: mov x11, v4.d[1]
+; CHECK-NEXT: mov v19.s[2], v25.s[0]
+; CHECK-NEXT: mov v20.s[2], v26.s[0]
+; CHECK-NEXT: ushr v25.4s, v16.4s, #16
+; CHECK-NEXT: ucvtf s26, x14
+; CHECK-NEXT: ucvtf s3, x12
+; CHECK-NEXT: mov v2.s[1], v24.s[0]
+; CHECK-NEXT: ucvtf s24, x10
+; CHECK-NEXT: fmov x10, d6
+; CHECK-NEXT: fmov x12, d0
+; CHECK-NEXT: ucvtf s6, x9
+; CHECK-NEXT: mov v21.s[1], v27.s[0]
+; CHECK-NEXT: ucvtf s27, x11
+; CHECK-NEXT: fmov x11, d7
+; CHECK-NEXT: mov v19.s[3], v22.s[0]
+; CHECK-NEXT: mov v20.s[3], v23.s[0]
+; CHECK-NEXT: add v22.4s, v16.4s, v18.4s
+; CHECK-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-NEXT: ucvtf s3, x10
+; CHECK-NEXT: fmov x10, d4
+; CHECK-NEXT: ucvtf s0, x12
+; CHECK-NEXT: and v23.16b, v25.16b, v17.16b
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: fmov x12, d5
+; CHECK-NEXT: mov v21.s[2], v26.s[0]
+; CHECK-NEXT: ucvtf s25, x13
+; CHECK-NEXT: ucvtf s4, x10
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: add v26.4s, v20.4s, v18.4s
+; CHECK-NEXT: mov v3.s[1], v6.s[0]
+; CHECK-NEXT: ucvtf s6, x11
+; CHECK-NEXT: mov x11, v5.d[1]
+; CHECK-NEXT: ucvtf s5, x8
+; CHECK-NEXT: mov v0.s[1], v24.s[0]
+; CHECK-NEXT: add v22.4s, v23.4s, v22.4s
+; CHECK-NEXT: ucvtf s1, x10
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: ucvtf s7, x12
+; CHECK-NEXT: mov v4.s[1], v27.s[0]
+; CHECK-NEXT: ushr v23.4s, v19.4s, #16
+; CHECK-NEXT: mov v2.s[3], v25.s[0]
+; CHECK-NEXT: mov v3.s[2], v6.s[0]
+; CHECK-NEXT: add v25.4s, v19.4s, v18.4s
+; CHECK-NEXT: ushr v24.4s, v20.4s, #16
+; CHECK-NEXT: mov v21.s[3], v5.s[0]
+; CHECK-NEXT: ucvtf s5, x11
+; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s
+; CHECK-NEXT: ucvtf s6, x10
+; CHECK-NEXT: and v23.16b, v23.16b, v17.16b
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: ucvtf s1, x9
+; CHECK-NEXT: mov v4.s[2], v7.s[0]
+; CHECK-NEXT: and v24.16b, v24.16b, v17.16b
+; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s
+; CHECK-NEXT: orr v16.4s, #64, lsl #16
+; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s
+; CHECK-NEXT: add v27.4s, v21.4s, v18.4s
+; CHECK-NEXT: orr v20.4s, #64, lsl #16
+; CHECK-NEXT: mov v3.s[3], v6.s[0]
+; CHECK-NEXT: add v6.4s, v23.4s, v25.4s
+; CHECK-NEXT: ushr v23.4s, v21.4s, #16
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-NEXT: ushr v1.4s, v2.4s, #16
+; CHECK-NEXT: add v24.4s, v24.4s, v26.4s
+; CHECK-NEXT: add v25.4s, v2.4s, v18.4s
+; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s
+; CHECK-NEXT: and v23.16b, v23.16b, v17.16b
+; CHECK-NEXT: orr v19.4s, #64, lsl #16
; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: mov v17.s[3], v24.s[0]
-; CHECK-NEXT: add v24.4s, v1.4s, v6.4s
-; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s
-; CHECK-NEXT: mov v18.s[3], v25.s[0]
-; CHECK-NEXT: add v25.4s, v4.4s, v6.4s
-; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b
-; CHECK-NEXT: mov v20.s[3], v7.s[0]
-; CHECK-NEXT: add v22.4s, v22.4s, v24.4s
-; CHECK-NEXT: add v7.4s, v21.4s, v23.4s
-; CHECK-NEXT: ushr v24.4s, v17.4s, #16
-; CHECK-NEXT: and v23.16b, v26.16b, v3.16b
-; CHECK-NEXT: ushr v26.4s, v5.4s, #16
-; CHECK-NEXT: ushr v28.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v17.4s, v6.4s
-; CHECK-NEXT: add v31.4s, v18.4s, v6.4s
-; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b
-; CHECK-NEXT: ushr v29.4s, v20.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v3.16b
-; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v3.16b
-; CHECK-NEXT: and v25.16b, v26.16b, v3.16b
-; CHECK-NEXT: add v26.4s, v5.4s, v6.4s
-; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
-; CHECK-NEXT: and v3.16b, v29.16b, v3.16b
-; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s
-; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s
-; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT: ushr v28.4s, v3.4s, #16
+; CHECK-NEXT: and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b
+; CHECK-NEXT: ushr v26.4s, v0.4s, #16
+; CHECK-NEXT: ushr v30.4s, v4.4s, #16
+; CHECK-NEXT: add v23.4s, v23.4s, v27.4s
+; CHECK-NEXT: bsl v5.16b, v6.16b, v19.16b
+; CHECK-NEXT: mov v6.16b, v29.16b
+; CHECK-NEXT: and v27.16b, v28.16b, v17.16b
+; CHECK-NEXT: add v28.4s, v3.4s, v18.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v25.4s
+; CHECK-NEXT: and v25.16b, v26.16b, v17.16b
+; CHECK-NEXT: add v26.4s, v0.4s, v18.4s
+; CHECK-NEXT: and v17.16b, v30.16b, v17.16b
+; CHECK-NEXT: add v18.4s, v4.4s, v18.4s
+; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s
+; CHECK-NEXT: orr v21.4s, #64, lsl #16
+; CHECK-NEXT: add v27.4s, v27.4s, v28.4s
+; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s
+; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v17.4s, v17.4s, v18.4s
+; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s
; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s
-; CHECK-NEXT: orr v5.4s, #64, lsl #16
-; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v20.4s, #64, lsl #16
-; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b
-; CHECK-NEXT: mov v7.16b, v30.16b
-; CHECK-NEXT: mov v16.16b, v31.16b
-; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b
-; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b
-; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b
-; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h
-; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h
-; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h
+; CHECK-NEXT: mov v16.16b, v30.16b
+; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b
+; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b
+; CHECK-NEXT: mov v19.16b, v28.16b
+; CHECK-NEXT: uzp2 v2.8h, v5.8h, v7.8h
+; CHECK-NEXT: bit v0.16b, v25.16b, v26.16b
+; CHECK-NEXT: bsl v16.16b, v23.16b, v21.16b
+; CHECK-NEXT: bit v4.16b, v17.16b, v18.16b
+; CHECK-NEXT: bsl v19.16b, v27.16b, v3.16b
+; CHECK-NEXT: uzp2 v3.8h, v1.8h, v6.8h
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v16.8h
+; CHECK-NEXT: uzp2 v1.8h, v4.8h, v19.8h
; CHECK-NEXT: ret
entry:
%c = uitofp <32 x i64> %a to <32 x bfloat>
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index c91de8f3a0a47..e3c623371448b 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,224 +8,209 @@
define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-LABEL: run_test:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #208
-; CHECK-NEXT: .cfi_def_cfa_offset 208
-; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT: str x23, [sp, #160] // 8-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #192
+; CHECK-NEXT: .cfi_def_cfa_offset 192
+; CHECK-NEXT: stp d15, d14, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w23, -48
-; CHECK-NEXT: .cfi_offset b8, -56
-; CHECK-NEXT: .cfi_offset b9, -64
-; CHECK-NEXT: .cfi_offset b10, -72
-; CHECK-NEXT: .cfi_offset b11, -80
-; CHECK-NEXT: .cfi_offset b12, -88
-; CHECK-NEXT: .cfi_offset b13, -96
-; CHECK-NEXT: .cfi_offset b14, -104
-; CHECK-NEXT: .cfi_offset b15, -112
-; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: // implicit-def: $q1
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: adrp x14, B+48
+; CHECK-NEXT: add x14, x14, :lo12:B+48
+; CHECK-NEXT: // implicit-def: $q18
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: adrp x9, B+48
-; CHECK-NEXT: add x9, x9, :lo12:B+48
+; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: adrp x10, A
; CHECK-NEXT: add x10, x10, :lo12:A
; CHECK-NEXT: mov x11, xzr
-; CHECK-NEXT: // kill: killed $q1
-; CHECK-NEXT: // implicit-def: $q1
+; CHECK-NEXT: // kill: killed $q18
+; CHECK-NEXT: // implicit-def: $q18
; CHECK-NEXT: mov x12, xzr
+; CHECK-NEXT: mov x13, x14
; CHECK-NEXT: // implicit-def: $q0
+; CHECK-NEXT: // implicit-def: $q2
; CHECK-NEXT: // implicit-def: $q3
; CHECK-NEXT: // implicit-def: $q4
; CHECK-NEXT: // implicit-def: $q5
-; CHECK-NEXT: // implicit-def: $q7
-; CHECK-NEXT: // implicit-def: $q10
-; CHECK-NEXT: // implicit-def: $q17
; CHECK-NEXT: // implicit-def: $q6
-; CHECK-NEXT: // implicit-def: $q18
+; CHECK-NEXT: // implicit-def: $q16
+; CHECK-NEXT: // implicit-def: $q17
+; CHECK-NEXT: // implicit-def: $q7
; CHECK-NEXT: // implicit-def: $q19
; CHECK-NEXT: // implicit-def: $q20
; CHECK-NEXT: // implicit-def: $q21
; CHECK-NEXT: // implicit-def: $q22
-; CHECK-NEXT: // implicit-def: $q23
; CHECK-NEXT: // implicit-def: $q24
-; CHECK-NEXT: // implicit-def: $q9
+; CHECK-NEXT: // implicit-def: $q23
+; CHECK-NEXT: // implicit-def: $q25
+; CHECK-NEXT: // implicit-def: $q26
; CHECK-NEXT: // implicit-def: $q27
-; CHECK-NEXT: // implicit-def: $q12
-; CHECK-NEXT: // implicit-def: $q28
-; CHECK-NEXT: // implicit-def: $q14
-; CHECK-NEXT: // implicit-def: $q15
-; CHECK-NEXT: // implicit-def: $q29
; CHECK-NEXT: // implicit-def: $q30
+; CHECK-NEXT: // implicit-def: $q8
; CHECK-NEXT: // implicit-def: $q11
-; CHECK-NEXT: // implicit-def: $q31
+; CHECK-NEXT: // implicit-def: $q12
+; CHECK-NEXT: // implicit-def: $q29
; CHECK-NEXT: // implicit-def: $q13
-; CHECK-NEXT: // kill: killed $q1
-; CHECK-NEXT: // implicit-def: $q1
-; CHECK-NEXT: // kill: killed $q1
+; CHECK-NEXT: // implicit-def: $q10
+; CHECK-NEXT: // implicit-def: $q15
+; CHECK-NEXT: // kill: killed $q18
+; CHECK-NEXT: // implicit-def: $q18
+; CHECK-NEXT: // kill: killed $q18
; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill
-; CHECK-NEXT: ldr q15, [x8]
+; CHECK-NEXT: ldr x17, [x8]
; CHECK-NEXT: ldr x15, [x8]
-; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: add x20, x10, x11
-; CHECK-NEXT: mov v8.16b, v28.16b
-; CHECK-NEXT: fmov x2, d15
-; CHECK-NEXT: mov x17, v15.d[1]
-; CHECK-NEXT: ldr q14, [x8]
+; CHECK-NEXT: mov v18.16b, v0.16b
+; CHECK-NEXT: ldr x16, [x9]
+; CHECK-NEXT: stp q15, q4, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: add x5, x10, x11
+; CHECK-NEXT: mul x1, x15, x17
+; CHECK-NEXT: ldr x2, [x13], #64
+; CHECK-NEXT: ldr x5, [x5, #128]
+; CHECK-NEXT: stp q7, q23, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: ldr x14, [x14, #8]
+; CHECK-NEXT: mul x0, x17, x17
+; CHECK-NEXT: ldr q23, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: mov v9.16b, v30.16b
+; CHECK-NEXT: mov v30.16b, v25.16b
+; CHECK-NEXT: mov v25.16b, v20.16b
+; CHECK-NEXT: mov v20.16b, v6.16b
+; CHECK-NEXT: mul x18, x16, x17
+; CHECK-NEXT: mov v6.16b, v1.16b
; CHECK-NEXT: mov v28.16b, v24.16b
-; CHECK-NEXT: mov v24.16b, v20.16b
-; CHECK-NEXT: mov v20.16b, v17.16b
-; CHECK-NEXT: fmov x13, d14
-; CHECK-NEXT: mov x16, v14.d[1]
-; CHECK-NEXT: mov v17.16b, v5.16b
-; CHECK-NEXT: mul x3, x2, x15
-; CHECK-NEXT: ldr q14, [x9], #64
-; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x6, [x8]
-; CHECK-NEXT: ldr x20, [x20, #128]
-; CHECK-NEXT: mul x1, x17, x15
-; CHECK-NEXT: mov x14, v14.d[1]
-; CHECK-NEXT: fmov x5, d14
-; CHECK-NEXT: mov v29.16b, v21.16b
-; CHECK-NEXT: mov v21.16b, v0.16b
-; CHECK-NEXT: mov v25.16b, v6.16b
-; CHECK-NEXT: mul x18, x13, x15
-; CHECK-NEXT: mov v6.16b, v2.16b
-; CHECK-NEXT: mov v26.16b, v22.16b
-; CHECK-NEXT: fmov d15, x3
-; CHECK-NEXT: mov v22.16b, v18.16b
-; CHECK-NEXT: mov v18.16b, v7.16b
-; CHECK-NEXT: mul x0, x16, x15
-; CHECK-NEXT: mov v7.16b, v3.16b
-; CHECK-NEXT: mov v16.16b, v4.16b
+; CHECK-NEXT: fmov d14, x1
+; CHECK-NEXT: mov v24.16b, v19.16b
+; CHECK-NEXT: mov v19.16b, v5.16b
+; CHECK-NEXT: mul x4, x2, x17
+; CHECK-NEXT: mov v31.16b, v26.16b
+; CHECK-NEXT: mov v26.16b, v21.16b
+; CHECK-NEXT: fmov d15, x0
+; CHECK-NEXT: mov v21.16b, v16.16b
+; CHECK-NEXT: mov v16.16b, v2.16b
+; CHECK-NEXT: mov v0.16b, v14.16b
+; CHECK-NEXT: mul x20, x2, x5
+; CHECK-NEXT: mov v7.16b, v10.16b
+; CHECK-NEXT: mov v10.16b, v17.16b
+; CHECK-NEXT: mov v17.16b, v3.16b
; CHECK-NEXT: add x11, x11, #8
-; CHECK-NEXT: add x12, x12, #1
-; CHECK-NEXT: mov v15.d[1], x1
-; CHECK-NEXT: mul x4, x14, x15
+; CHECK-NEXT: mov v15.d[1], x18
+; CHECK-NEXT: mul x3, x14, x17
; CHECK-NEXT: cmp x11, #64
-; CHECK-NEXT: fmov d14, x18
-; CHECK-NEXT: mul x15, x5, x15
-; CHECK-NEXT: add v5.2d, v5.2d, v15.2d
-; CHECK-NEXT: mul x21, x2, x6
-; CHECK-NEXT: mov v14.d[1], x0
-; CHECK-NEXT: mul x2, x2, x20
-; CHECK-NEXT: fmov d0, x15
-; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: mul x22, x13, x20
-; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
-; CHECK-NEXT: fmov d3, x21
-; CHECK-NEXT: mul x19, x17, x6
-; CHECK-NEXT: mov v0.d[1], x4
-; CHECK-NEXT: fmov d1, x2
-; CHECK-NEXT: mul x17, x17, x20
-; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: add v5.2d, v13.2d, v14.2d
-; CHECK-NEXT: fmov d2, x22
-; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: mul x7, x16, x6
-; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload
-; CHECK-NEXT: mov v3.d[1], x19
-; CHECK-NEXT: add v13.2d, v13.2d, v0.2d
-; CHECK-NEXT: mul x16, x16, x20
-; CHECK-NEXT: mov v1.d[1], x17
-; CHECK-NEXT: mul x23, x5, x20
-; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: mov v13.16b, v5.16b
-; CHECK-NEXT: mov v5.16b, v17.16b
-; CHECK-NEXT: mov v17.16b, v20.16b
-; CHECK-NEXT: mov v20.16b, v24.16b
-; CHECK-NEXT: mul x13, x13, x6
-; CHECK-NEXT: mov v24.16b, v28.16b
-; CHECK-NEXT: add v11.2d, v11.2d, v3.2d
-; CHECK-NEXT: mov v2.d[1], x16
+; CHECK-NEXT: mov v0.d[1], x1
+; CHECK-NEXT: fmov d1, x4
+; CHECK-NEXT: add x12, x12, #1
+; CHECK-NEXT: mul x17, x17, x5
+; CHECK-NEXT: fmov d5, x20
+; CHECK-NEXT: mul x6, x15, x15
+; CHECK-NEXT: add v23.2d, v23.2d, v0.2d
+; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: mov v1.d[1], x3
+; CHECK-NEXT: mul x7, x15, x5
+; CHECK-NEXT: add v0.2d, v0.2d, v15.2d
+; CHECK-NEXT: fmov d2, x17
+; CHECK-NEXT: mul x0, x14, x5
+; CHECK-NEXT: fmov d4, x6
+; CHECK-NEXT: mul x19, x16, x5
+; CHECK-NEXT: stp q0, q23, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d3, x7
+; CHECK-NEXT: ldr q23, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: mul x17, x2, x15
+; CHECK-NEXT: add v0.2d, v0.2d, v15.2d
+; CHECK-NEXT: ldr q15, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: mov v5.d[1], x0
+; CHECK-NEXT: mov v4.d[1], x6
+; CHECK-NEXT: mul x16, x16, x15
+; CHECK-NEXT: mov v3.d[1], x7
; CHECK-NEXT: add v15.2d, v15.2d, v1.2d
-; CHECK-NEXT: add v27.2d, v27.2d, v3.2d
-; CHECK-NEXT: mul x18, x14, x20
-; CHECK-NEXT: add v23.2d, v23.2d, v3.2d
-; CHECK-NEXT: add v19.2d, v19.2d, v3.2d
-; CHECK-NEXT: fmov d4, x23
-; CHECK-NEXT: add v10.2d, v10.2d, v3.2d
-; CHECK-NEXT: mul x15, x5, x6
-; CHECK-NEXT: fmov d0, x13
-; CHECK-NEXT: add v14.2d, v14.2d, v2.2d
-; CHECK-NEXT: add v2.2d, v6.2d, v3.2d
-; CHECK-NEXT: mul x14, x14, x6
-; CHECK-NEXT: mov v3.16b, v7.16b
-; CHECK-NEXT: mov v7.16b, v18.16b
-; CHECK-NEXT: mov v4.d[1], x18
-; CHECK-NEXT: mov v18.16b, v22.16b
-; CHECK-NEXT: mov v0.d[1], x7
-; CHECK-NEXT: fmov d1, x15
-; CHECK-NEXT: add v28.2d, v8.2d, v4.2d
-; CHECK-NEXT: mov v1.d[1], x14
-; CHECK-NEXT: add v31.2d, v31.2d, v0.2d
-; CHECK-NEXT: add v30.2d, v30.2d, v0.2d
+; CHECK-NEXT: mov v2.d[1], x19
+; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: mov v1.16b, v6.16b
+; CHECK-NEXT: mul x14, x14, x15
+; CHECK-NEXT: mov v6.16b, v20.16b
+; CHECK-NEXT: mov v20.16b, v25.16b
+; CHECK-NEXT: fmov d0, x17
+; CHECK-NEXT: mov v25.16b, v30.16b
+; CHECK-NEXT: add v30.2d, v9.2d, v5.2d
+; CHECK-NEXT: mov v5.16b, v19.16b
+; CHECK-NEXT: mov v19.16b, v24.16b
+; CHECK-NEXT: add v11.2d, v11.2d, v3.2d
+; CHECK-NEXT: mov v14.d[1], x16
+; CHECK-NEXT: mov v3.16b, v17.16b
+; CHECK-NEXT: mov v17.16b, v10.16b
+; CHECK-NEXT: mov v10.16b, v7.16b
+; CHECK-NEXT: add v8.2d, v8.2d, v2.2d
+; CHECK-NEXT: mov v2.16b, v16.16b
+; CHECK-NEXT: mov v0.d[1], x14
+; CHECK-NEXT: mov v16.16b, v21.16b
+; CHECK-NEXT: mov v21.16b, v26.16b
+; CHECK-NEXT: add v13.2d, v13.2d, v4.2d
+; CHECK-NEXT: add v26.2d, v31.2d, v4.2d
+; CHECK-NEXT: add v24.2d, v28.2d, v4.2d
+; CHECK-NEXT: add v19.2d, v19.2d, v4.2d
+; CHECK-NEXT: add v6.2d, v6.2d, v4.2d
+; CHECK-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-NEXT: ldp q4, q7, [sp, #16] // 32-byte Folded Reload
+; CHECK-NEXT: add v10.2d, v10.2d, v14.2d
+; CHECK-NEXT: add v29.2d, v29.2d, v14.2d
+; CHECK-NEXT: add v27.2d, v27.2d, v14.2d
+; CHECK-NEXT: add v23.2d, v23.2d, v14.2d
+; CHECK-NEXT: add v22.2d, v22.2d, v14.2d
+; CHECK-NEXT: add v20.2d, v20.2d, v14.2d
+; CHECK-NEXT: add v16.2d, v16.2d, v14.2d
+; CHECK-NEXT: add v7.2d, v7.2d, v14.2d
+; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
+; CHECK-NEXT: add v3.2d, v3.2d, v14.2d
+; CHECK-NEXT: add v2.2d, v2.2d, v14.2d
; CHECK-NEXT: add v12.2d, v12.2d, v0.2d
-; CHECK-NEXT: add v24.2d, v24.2d, v0.2d
-; CHECK-NEXT: add v22.2d, v26.2d, v0.2d
-; CHECK-NEXT: add v20.2d, v20.2d, v0.2d
-; CHECK-NEXT: add v18.2d, v18.2d, v0.2d
+; CHECK-NEXT: add v25.2d, v25.2d, v0.2d
+; CHECK-NEXT: add v21.2d, v21.2d, v0.2d
; CHECK-NEXT: add v17.2d, v17.2d, v0.2d
-; CHECK-NEXT: add v7.2d, v7.2d, v0.2d
-; CHECK-NEXT: add v4.2d, v16.2d, v0.2d
-; CHECK-NEXT: add v3.2d, v3.2d, v0.2d
-; CHECK-NEXT: mov v0.16b, v21.16b
-; CHECK-NEXT: mov v21.16b, v29.16b
-; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: add v9.2d, v9.2d, v1.2d
-; CHECK-NEXT: add v6.2d, v25.2d, v1.2d
-; CHECK-NEXT: add v5.2d, v5.2d, v1.2d
-; CHECK-NEXT: add v29.2d, v29.2d, v1.2d
-; CHECK-NEXT: add v21.2d, v21.2d, v1.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: add v4.2d, v4.2d, v0.2d
+; CHECK-NEXT: add v0.2d, v18.2d, v0.2d
+; CHECK-NEXT: mov x14, x13
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp q28, q18, [sp, #64] // 32-byte Folded Reload
; CHECK-NEXT: adrp x8, C
; CHECK-NEXT: add x8, x8, :lo12:C
-; CHECK-NEXT: stp q11, q30, [x8, #80]
-; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT: str q1, [x8]
-; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x23, [sp, #160] // 8-byte Folded Reload
-; CHECK-NEXT: stp q15, q14, [x8, #144]
-; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: stp q1, q13, [x8, #16]
-; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: stp q28, q12, [x8, #176]
-; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT: stp q1, q31, [x8, #48]
-; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT: stp q9, q24, [x8, #240]
-; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT: stp q19, q18, [x8, #336]
-; CHECK-NEXT: stp q10, q7, [x8, #400]
-; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT: str q29, [x8, #112]
-; CHECK-NEXT: str q27, [x8, #208]
-; CHECK-NEXT: stp q23, q22, [x8, #272]
+; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT: stp q10, q13, [x8, #64]
+; CHECK-NEXT: stp q28, q18, [x8]
+; CHECK-NEXT: ldr q18, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: stp q29, q12, [x8, #96]
+; CHECK-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT: stp q18, q15, [x8, #32]
+; CHECK-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT: stp q11, q8, [x8, #144]
+; CHECK-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT: stp q30, q27, [x8, #176]
+; CHECK-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload
+; CHECK-NEXT: str q26, [x8, #208]
+; CHECK-NEXT: stp q25, q23, [x8, #240]
+; CHECK-NEXT: stp q24, q22, [x8, #272]
; CHECK-NEXT: stp q21, q20, [x8, #304]
-; CHECK-NEXT: stp q6, q17, [x8, #368]
-; CHECK-NEXT: stp q5, q4, [x8, #432]
-; CHECK-NEXT: stp q2, q3, [x8, #464]
+; CHECK-NEXT: stp q19, q7, [x8, #336]
+; CHECK-NEXT: stp q17, q16, [x8, #368]
+; CHECK-NEXT: stp q6, q5, [x8, #400]
+; CHECK-NEXT: stp q4, q3, [x8, #432]
+; CHECK-NEXT: stp q1, q2, [x8, #464]
; CHECK-NEXT: str q0, [x8, #496]
-; CHECK-NEXT: add sp, sp, #208
+; CHECK-NEXT: add sp, sp, #192
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w19
; CHECK-NEXT: .cfi_restore w20
-; CHECK-NEXT: .cfi_restore w21
-; CHECK-NEXT: .cfi_restore w22
-; CHECK-NEXT: .cfi_restore w23
; CHECK-NEXT: .cfi_restore b8
; CHECK-NEXT: .cfi_restore b9
; CHECK-NEXT: .cfi_restore b10
diff --git a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
index eb3a0391eb79e..0ed29b48cf2f8 100644
--- a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
+++ b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll
@@ -4,36 +4,35 @@
define i8 @scalarize_v16i8(ptr %p) {
; CHECK-LABEL: scalarize_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: umov w8, v0.b[0]
-; CHECK-NEXT: umov w9, v0.b[1]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: umov w15, v0.b[7]
-; CHECK-NEXT: umov w16, v0.b[8]
-; CHECK-NEXT: umov w17, v0.b[9]
-; CHECK-NEXT: umov w18, v0.b[10]
-; CHECK-NEXT: umov w0, v0.b[11]
-; CHECK-NEXT: umov w1, v0.b[12]
-; CHECK-NEXT: umov w2, v0.b[13]
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: umov w3, v0.b[14]
-; CHECK-NEXT: umov w4, v0.b[15]
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w10, w12, w13
-; CHECK-NEXT: add w11, w14, w15
+; CHECK-NEXT: ldrb w8, [x0, #3]
+; CHECK-NEXT: ldrb w9, [x0, #2]
+; CHECK-NEXT: ldrb w10, [x0, #1]
+; CHECK-NEXT: ldrb w11, [x0]
+; CHECK-NEXT: ldrb w13, [x0, #5]
+; CHECK-NEXT: ldrb w14, [x0, #4]
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: ldrb w12, [x0, #15]
+; CHECK-NEXT: ldrb w15, [x0, #11]
+; CHECK-NEXT: add w10, w11, w10
+; CHECK-NEXT: add w9, w14, w13
+; CHECK-NEXT: ldrb w11, [x0, #10]
+; CHECK-NEXT: ldrb w13, [x0, #9]
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: ldrb w14, [x0, #8]
+; CHECK-NEXT: ldrb w16, [x0, #7]
+; CHECK-NEXT: add w11, w11, w15
+; CHECK-NEXT: ldrb w17, [x0, #6]
+; CHECK-NEXT: ldrb w18, [x0, #14]
+; CHECK-NEXT: add w13, w14, w13
+; CHECK-NEXT: ldrb w1, [x0, #13]
+; CHECK-NEXT: ldrb w0, [x0, #12]
+; CHECK-NEXT: add w16, w17, w16
+; CHECK-NEXT: add w10, w13, w11
+; CHECK-NEXT: add w12, w18, w12
+; CHECK-NEXT: add w9, w9, w16
+; CHECK-NEXT: add w14, w0, w1
; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w12, w16, w17
-; CHECK-NEXT: add w13, w18, w0
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w14, w1, w2
-; CHECK-NEXT: add w10, w12, w13
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w15, w3, w4
-; CHECK-NEXT: add w11, w14, w15
+; CHECK-NEXT: add w11, w14, w12
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
@@ -75,22 +74,21 @@ define i8 @scalarize_v16i8(ptr %p) {
define i8 @scalarize_v8i8(ptr %p) {
; CHECK-LABEL: scalarize_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: umov w8, v0.b[0]
-; CHECK-NEXT: umov w9, v0.b[1]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: umov w15, v0.b[7]
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w10, w12, w13
-; CHECK-NEXT: add w11, w14, w15
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ldrb w8, [x0, #7]
+; CHECK-NEXT: ldrb w9, [x0, #6]
+; CHECK-NEXT: ldrb w10, [x0, #5]
+; CHECK-NEXT: ldrb w11, [x0, #1]
+; CHECK-NEXT: ldrb w12, [x0]
+; CHECK-NEXT: ldrb w13, [x0, #4]
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: ldrb w14, [x0, #3]
+; CHECK-NEXT: ldrb w15, [x0, #2]
+; CHECK-NEXT: add w11, w12, w11
+; CHECK-NEXT: add w10, w13, w10
+; CHECK-NEXT: add w12, w15, w14
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: add w9, w11, w12
+; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%wide.load = load <8 x i8>, ptr %p, align 4
%l0 = extractelement <8 x i8> %wide.load, i32 0
@@ -114,22 +112,21 @@ define i8 @scalarize_v8i8(ptr %p) {
define i16 @scalarize_v8i16(ptr %p) {
; CHECK-LABEL: scalarize_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: umov w12, v0.h[4]
-; CHECK-NEXT: umov w13, v0.h[5]
-; CHECK-NEXT: umov w14, v0.h[6]
-; CHECK-NEXT: umov w15, v0.h[7]
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w10, w12, w13
-; CHECK-NEXT: add w11, w14, w15
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ldrh w8, [x0, #14]
+; CHECK-NEXT: ldrh w9, [x0, #12]
+; CHECK-NEXT: ldrh w10, [x0, #10]
+; CHECK-NEXT: ldrh w11, [x0, #2]
+; CHECK-NEXT: ldrh w12, [x0]
+; CHECK-NEXT: ldrh w13, [x0, #8]
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: ldrh w14, [x0, #6]
+; CHECK-NEXT: ldrh w15, [x0, #4]
+; CHECK-NEXT: add w11, w12, w11
+; CHECK-NEXT: add w10, w13, w10
+; CHECK-NEXT: add w12, w15, w14
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: add w9, w11, w12
+; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%wide.load = load <8 x i16>, ptr %p, align 4
%l0 = extractelement <8 x i16> %wide.load, i32 0
@@ -153,14 +150,13 @@ define i16 @scalarize_v8i16(ptr %p) {
define i16 @scalarize_v4i16(ptr %p) {
; CHECK-LABEL: scalarize_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: add w9, w10, w11
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ldrh w8, [x0, #6]
+; CHECK-NEXT: ldrh w9, [x0, #4]
+; CHECK-NEXT: ldrh w10, [x0, #2]
+; CHECK-NEXT: ldrh w11, [x0]
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w10, w11, w10
+; CHECK-NEXT: add w0, w10, w8
; CHECK-NEXT: ret
%wide.load = load <4 x i16>, ptr %p, align 4
%l0 = extractelement <4 x i16> %wide.load, i32 0
@@ -176,13 +172,10 @@ define i16 @scalarize_v4i16(ptr %p) {
define i32 @scalarize_v4i32(ptr %p) {
; CHECK-LABEL: scalarize_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: add w8, w11, w8
-; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: ldp w9, w8, [x0]
+; CHECK-NEXT: ldp w10, w11, [x0, #8]
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%wide.load = load <4 x i32>, ptr %p, align 4
@@ -199,11 +192,10 @@ define i32 @scalarize_v4i32(ptr %p) {
define i64 @scalarize_v4i64(ptr %p) {
; CHECK-LABEL: scalarize_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
-; CHECK-NEXT: addp d1, v1.2d
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: ldp x10, x11, [x0, #16]
+; CHECK-NEXT: add x8, x8, x9
+; CHECK-NEXT: add x9, x10, x11
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
%wide.load = load <4 x i64>, ptr %p, align 4
@@ -220,14 +212,11 @@ define i64 @scalarize_v4i64(ptr %p) {
define i64 @scalarize_v4i32_sext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_sext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0
-; CHECK-NEXT: sshll v0.2d, v0.2s, #0
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: addp d1, v1.2d
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ldpsw x9, x8, [x0, #8]
+; CHECK-NEXT: ldpsw x11, x10, [x0]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: add x10, x11, x10
+; CHECK-NEXT: add x0, x10, x8
; CHECK-NEXT: ret
%wide.load = load <4 x i32>, ptr %p, align 4
%ext = sext <4 x i32> %wide.load to <4 x i64>
@@ -244,14 +233,11 @@ define i64 @scalarize_v4i32_sext(ptr %p) {
define i64 @scalarize_v4i32_zext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: addp d1, v1.2d
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ldp w9, w8, [x0, #8]
+; CHECK-NEXT: ldp w11, w10, [x0]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: add x10, x11, x10
+; CHECK-NEXT: add x0, x10, x8
; CHECK-NEXT: ret
%wide.load = load <4 x i32>, ptr %p, align 4
%ext = zext <4 x i32> %wide.load to <4 x i64>
@@ -340,55 +326,43 @@ define double @scalarize_v4f64(ptr %p) {
define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q1, q0, [x1]
-; CHECK-NEXT: ldp q3, q2, [x1, #96]
-; CHECK-NEXT: ldp q5, q4, [x1, #64]
-; CHECK-NEXT: ldp q7, q6, [x1, #32]
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: mov x1, v3.d[1]
-; CHECK-NEXT: mov x4, v2.d[1]
-; CHECK-NEXT: mov x16, v5.d[1]
-; CHECK-NEXT: mov x18, v4.d[1]
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov x12, v7.d[1]
-; CHECK-NEXT: mov x14, v6.d[1]
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: fmov x13, d7
-; CHECK-NEXT: fmov x15, d6
-; CHECK-NEXT: fmov x17, d5
-; CHECK-NEXT: fmov x0, d4
-; CHECK-NEXT: fmov x3, d3
-; CHECK-NEXT: fmov x5, d2
-; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
-; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
-; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
-; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
-; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
-; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
-; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
-; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
-; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
-; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
-; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
-; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
-; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
-; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
-; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
-; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
+; CHECK-NEXT: ldp x8, x9, [x1]
+; CHECK-NEXT: ldp x10, x11, [x1, #16]
+; CHECK-NEXT: ldp x12, x13, [x1, #64]
+; CHECK-NEXT: ldr s0, [x2, x8, lsl #2]
+; CHECK-NEXT: ldr s1, [x2, x9, lsl #2]
+; CHECK-NEXT: ldp x8, x9, [x1, #32]
+; CHECK-NEXT: ldr s2, [x2, x10, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x11, lsl #2]
; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ldr s6, [x2, x12, lsl #2]
+; CHECK-NEXT: ldp x10, x11, [x1, #48]
+; CHECK-NEXT: ldr s7, [x2, x13, lsl #2]
; CHECK-NEXT: fadd s1, s2, s3
-; CHECK-NEXT: fadd s2, s4, s5
-; CHECK-NEXT: fadd s3, s6, s7
-; CHECK-NEXT: fadd s4, s16, s17
-; CHECK-NEXT: fadd s5, s18, s19
-; CHECK-NEXT: fadd s6, s20, s21
-; CHECK-NEXT: fadd s7, s22, s23
+; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x9, lsl #2]
+; CHECK-NEXT: ldp x14, x15, [x1, #80]
+; CHECK-NEXT: fadd s2, s2, s3
+; CHECK-NEXT: ldr s4, [x2, x10, lsl #2]
+; CHECK-NEXT: ldr s5, [x2, x11, lsl #2]
+; CHECK-NEXT: ldp x16, x17, [x1, #96]
+; CHECK-NEXT: fadd s3, s4, s5
+; CHECK-NEXT: fadd s4, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ldp x18, x0, [x1, #112]
+; CHECK-NEXT: ldr s16, [x2, x14, lsl #2]
+; CHECK-NEXT: ldr s17, [x2, x15, lsl #2]
+; CHECK-NEXT: ldr s18, [x2, x16, lsl #2]
+; CHECK-NEXT: ldr s19, [x2, x17, lsl #2]
+; CHECK-NEXT: ldr s20, [x2, x18, lsl #2]
+; CHECK-NEXT: ldr s21, [x2, x0, lsl #2]
+; CHECK-NEXT: fadd s5, s16, s17
+; CHECK-NEXT: fadd s6, s18, s19
; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s7, s20, s21
; CHECK-NEXT: fadd s2, s4, s5
-; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
@@ -463,57 +437,39 @@ entry:
define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q0, q2, [x1]
-; CHECK-NEXT: ldp q4, q1, [x1, #32]
-; CHECK-NEXT: sshll v3.2d, v0.2s, #0
-; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0
-; CHECK-NEXT: sshll2 v6.2d, v2.4s, #0
-; CHECK-NEXT: sshll2 v5.2d, v1.4s, #0
-; CHECK-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-NEXT: sshll v2.2d, v2.2s, #0
-; CHECK-NEXT: sshll2 v7.2d, v4.4s, #0
-; CHECK-NEXT: sshll v4.2d, v4.2s, #0
-; CHECK-NEXT: mov x8, v3.d[1]
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: mov x14, v6.d[1]
-; CHECK-NEXT: mov x12, v2.d[1]
-; CHECK-NEXT: mov x1, v1.d[1]
-; CHECK-NEXT: mov x4, v5.d[1]
-; CHECK-NEXT: mov x16, v4.d[1]
-; CHECK-NEXT: mov x18, v7.d[1]
-; CHECK-NEXT: fmov x9, d3
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: fmov x13, d2
-; CHECK-NEXT: fmov x15, d6
-; CHECK-NEXT: fmov x17, d4
-; CHECK-NEXT: fmov x0, d7
-; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
-; CHECK-NEXT: fmov x3, d1
-; CHECK-NEXT: fmov x5, d5
+; CHECK-NEXT: ldpsw x9, x8, [x1]
+; CHECK-NEXT: ldpsw x11, x10, [x1, #8]
+; CHECK-NEXT: ldpsw x13, x12, [x1, #24]
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
-; CHECK-NEXT: ldr s1, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
+; CHECK-NEXT: ldpsw x9, x8, [x1, #56]
+; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ldpsw x11, x10, [x1, #48]
+; CHECK-NEXT: ldpsw x15, x14, [x1, #16]
+; CHECK-NEXT: ldpsw x17, x16, [x1, #40]
+; CHECK-NEXT: ldpsw x0, x18, [x1, #32]
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: ldr s2, [x2, x15, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
-; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
-; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT: ldr s6, [x2, x0, lsl #2]
+; CHECK-NEXT: fadd s2, s2, s3
+; CHECK-NEXT: ldr s7, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
-; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
-; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
-; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
-; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
-; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
-; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
-; CHECK-NEXT: fadd s0, s0, s2
-; CHECK-NEXT: fadd s1, s1, s3
-; CHECK-NEXT: fadd s2, s4, s5
-; CHECK-NEXT: fadd s3, s6, s7
-; CHECK-NEXT: fadd s4, s16, s17
-; CHECK-NEXT: fadd s5, s18, s19
-; CHECK-NEXT: fadd s6, s20, s21
-; CHECK-NEXT: fadd s7, s22, s23
+; CHECK-NEXT: fadd s3, s4, s5
+; CHECK-NEXT: ldr s18, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s19, [x2, x10, lsl #2]
+; CHECK-NEXT: fadd s4, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ldr s20, [x2, x9, lsl #2]
+; CHECK-NEXT: ldr s21, [x2, x8, lsl #2]
+; CHECK-NEXT: fadd s5, s16, s17
+; CHECK-NEXT: fadd s6, s18, s19
+; CHECK-NEXT: fadd s7, s20, s21
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
@@ -593,57 +549,39 @@ entry:
define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q0, q2, [x1]
-; CHECK-NEXT: ldp q4, q1, [x1, #32]
-; CHECK-NEXT: ushll v3.2d, v0.2s, #0
-; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT: ushll2 v6.2d, v2.4s, #0
-; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0
-; CHECK-NEXT: ushll v4.2d, v4.2s, #0
-; CHECK-NEXT: mov x8, v3.d[1]
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: mov x14, v6.d[1]
-; CHECK-NEXT: mov x12, v2.d[1]
-; CHECK-NEXT: mov x1, v1.d[1]
-; CHECK-NEXT: mov x4, v5.d[1]
-; CHECK-NEXT: mov x16, v4.d[1]
-; CHECK-NEXT: mov x18, v7.d[1]
-; CHECK-NEXT: fmov x9, d3
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: fmov x13, d2
-; CHECK-NEXT: fmov x15, d6
-; CHECK-NEXT: fmov x17, d4
-; CHECK-NEXT: fmov x0, d7
-; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
-; CHECK-NEXT: fmov x3, d1
-; CHECK-NEXT: fmov x5, d5
+; CHECK-NEXT: ldp w9, w8, [x1]
+; CHECK-NEXT: ldp w11, w10, [x1, #8]
+; CHECK-NEXT: ldp w13, w12, [x1, #24]
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
-; CHECK-NEXT: ldr s1, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
+; CHECK-NEXT: ldp w9, w8, [x1, #56]
+; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ldp w11, w10, [x1, #48]
+; CHECK-NEXT: ldp w15, w14, [x1, #16]
+; CHECK-NEXT: ldp w17, w16, [x1, #40]
+; CHECK-NEXT: ldp w0, w18, [x1, #32]
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: ldr s2, [x2, x15, lsl #2]
+; CHECK-NEXT: ldr s3, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
-; CHECK-NEXT: ldr s6, [x2, x15, lsl #2]
-; CHECK-NEXT: ldr s7, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT: ldr s6, [x2, x0, lsl #2]
+; CHECK-NEXT: fadd s2, s2, s3
+; CHECK-NEXT: ldr s7, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
-; CHECK-NEXT: ldr s18, [x2, x0, lsl #2]
-; CHECK-NEXT: ldr s19, [x2, x18, lsl #2]
-; CHECK-NEXT: ldr s20, [x2, x3, lsl #2]
-; CHECK-NEXT: ldr s21, [x2, x1, lsl #2]
-; CHECK-NEXT: ldr s22, [x2, x5, lsl #2]
-; CHECK-NEXT: ldr s23, [x2, x4, lsl #2]
-; CHECK-NEXT: fadd s0, s0, s2
-; CHECK-NEXT: fadd s1, s1, s3
-; CHECK-NEXT: fadd s2, s4, s5
-; CHECK-NEXT: fadd s3, s6, s7
-; CHECK-NEXT: fadd s4, s16, s17
-; CHECK-NEXT: fadd s5, s18, s19
-; CHECK-NEXT: fadd s6, s20, s21
-; CHECK-NEXT: fadd s7, s22, s23
+; CHECK-NEXT: fadd s3, s4, s5
+; CHECK-NEXT: ldr s18, [x2, x11, lsl #2]
+; CHECK-NEXT: ldr s19, [x2, x10, lsl #2]
+; CHECK-NEXT: fadd s4, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ldr s20, [x2, x9, lsl #2]
+; CHECK-NEXT: ldr s21, [x2, x8, lsl #2]
+; CHECK-NEXT: fadd s5, s16, s17
+; CHECK-NEXT: fadd s6, s18, s19
+; CHECK-NEXT: fadd s7, s20, s21
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index f6ed2e6a787f0..ba7bee9a94bac 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -19,14 +19,12 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 {
define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 {
; CHECK-LABEL: load_zext_v2i64i256:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldp x0, x4, [x0]
; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: mov x2, xzr
; CHECK-NEXT: mov x3, xzr
; CHECK-NEXT: mov x5, xzr
; CHECK-NEXT: mov x6, xzr
-; CHECK-NEXT: mov x4, v0.d[1]
-; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: mov x7, xzr
; CHECK-NEXT: ret
%a = load <2 x i64>, ptr %ap
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index ebd32c73ec65b..6fd5b820a2242 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -438,8 +438,7 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: // implicit-def: $d0
; CHECK-NEXT: cbnz x8, .LBB15_2
; CHECK-NEXT: // %bb.1: // %cond.load
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 67a0379d05244..7cec64d1ac2a7 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -462,10 +462,9 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i
; CHECK-NEXT: orr x8, x9, x8, lsl #1
; CHECK-NEXT: strh w1, [x10]
; CHECK-NEXT: strh w2, [x8]
-; CHECK-NEXT: ldr d0, [sp, #8]
-; CHECK-NEXT: umov.h w0, v0[0]
-; CHECK-NEXT: umov.h w1, v0[1]
-; CHECK-NEXT: umov.h w2, v0[2]
+; CHECK-NEXT: ldrh w0, [sp, #8]
+; CHECK-NEXT: ldrh w1, [sp, #10]
+; CHECK-NEXT: ldrh w2, [sp, #12]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
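
The scalarize-vector-load.ll changes above show the core transform: each
extracted lane of the vector load is reloaded directly, with the vector
extend folded into a sign- or zero-extending scalar load (ldrsw/ldpsw, or a
plain w-register ldr/ldp for the zext case). A minimal IR sketch of the
extract(extend(load)) pattern those tests exercise, for illustration only
(not a test from the patch):

  define i64 @extract_lane_sext(ptr %p) {
    %v = load <4 x i32>, ptr %p, align 4
    %e = sext <4 x i32> %v to <4 x i64>
    %x = extractelement <4 x i64> %e, i64 2
    ret i64 %x
  }

With the combine this should lower to a single ldrsw x0, [x0, #8] rather
than a q-register load, a lane extend and a cross-bank fmov.
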
>From 94c694fd3cceaef59d8072a793231929c709ac71 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 22 Sep 2025 12:35:03 +0100
Subject: [PATCH 2/2] Add support for scalable vectors
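
This drops the fixed-width-only bail-outs from the first patch so that
extracts of scalable-vector loads are scalarized as well, as the SVE test
updates below show. A minimal sketch of a case this now covers (illustrative
IR, not taken from the tests):

  define i64 @extract_scalable(ptr %p) {
    %v = load <vscale x 2 x i64>, ptr %p
    %e = extractelement <vscale x 2 x i64> %v, i64 0
    ret i64 %e
  }

Lane 0 should now come back as a plain scalar ldr x0, [x0] instead of an SVE
register load followed by a cross-register-bank move.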
---
.../Target/AArch64/AArch64ISelLowering.cpp | 6 +-
.../AArch64/sme-streaming-interface.ll | 4 +-
.../AArch64/sve-fixed-length-splat-vector.ll | 16 ++---
...e-streaming-mode-fixed-length-ext-loads.ll | 16 ++---
...-streaming-mode-fixed-length-ld2-alloca.ll | 4 +-
...eaming-mode-fixed-length-vector-shuffle.ll | 64 ++++++++-----------
6 files changed, 48 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f3b01baadb141..cdc2c90c93627 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20470,8 +20470,7 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// Given an extract(load) or extract(extend(load)), produce a scalar load
// instead to avoid the cross-register-bank copies.
if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
- VT.isInteger() && isa<ConstantSDNode>(N1) &&
- !N0.getValueType().isScalableVector()) {
+ VT.isInteger() && isa<ConstantSDNode>(N1)) {
SDValue LoadN0 = N0;
// Look through sext/zext and extract_subvector / insert_subvector if
// required.
@@ -20481,8 +20480,7 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
N0.getOperand(0).hasOneUse())
LoadN0 = N0.getOperand(0);
unsigned OffsetElts = 0;
- if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- !LoadN0.getOperand(0).getValueType().isScalableVector()) {
+ if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
OffsetElts = LoadN0.getConstantOperandVal(1);
LoadN0 = LoadN0.getOperand(0);
}
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 505a40c16653b..d00efa7d99d53 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -374,8 +374,8 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl foo
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp, #2, mul vl]
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: addvl x8, sp, #2
+; CHECK-NEXT: ldrb w0, [x8]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index a69808d32ed73..4f5a5a6dee257 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -727,8 +727,8 @@ define void @load_splat_v4f64(ptr %a, ptr %b) vscale_range(2,2) #0 {
define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: mov z0.b, b0
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
; CHECK-NEXT: str z0, [x1]
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %a
@@ -740,8 +740,8 @@ define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 {
define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: mov z0.h, h0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
; CHECK-NEXT: str z0, [x1]
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %a
@@ -753,8 +753,8 @@ define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 {
define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: mov z0.s, s0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
; CHECK-NEXT: str z0, [x1]
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %a
@@ -766,8 +766,8 @@ define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 {
define void @load_splat_v4i64(ptr %a, ptr %b) vscale_range(2,2) #0 {
; CHECK-LABEL: load_splat_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: mov z0.d, d0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
; CHECK-NEXT: str z0, [x1]
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 4d524bc848de6..e433786cfdd1f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -99,16 +99,14 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
; CHECK-LABEL: load_zext_v2i64i256:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldp x8, x4, [x0]
; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: mov x2, xzr
; CHECK-NEXT: mov x3, xzr
; CHECK-NEXT: mov x5, xzr
; CHECK-NEXT: mov x6, xzr
-; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: mov x7, xzr
-; CHECK-NEXT: fmov x4, d1
+; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
@@ -282,14 +280,12 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
; CHECK-LABEL: load_sext_v2i64i256:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov z1.d, z0.d[1]
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: fmov x4, d1
-; CHECK-NEXT: asr x1, x0, #63
+; CHECK-NEXT: ldp x8, x4, [x0]
+; CHECK-NEXT: asr x1, x8, #63
+; CHECK-NEXT: asr x5, x4, #63
+; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: mov x2, x1
; CHECK-NEXT: mov x3, x1
-; CHECK-NEXT: asr x5, x4, #63
; CHECK-NEXT: mov x6, x5
; CHECK-NEXT: mov x7, x5
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index e6c6003ee6c69..094eaad0cfe80 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -115,9 +115,9 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: ldrb w8, [sp, #16]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: stur b1, [x19, #8]
+; CHECK-NEXT: strb w8, [x19, #8]
; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index ea6123edc8b4c..7b9b69e0d9b4d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -101,15 +101,13 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
; CHECK-LABEL: shuffle_ext_byone_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0, #16]
-; CHECK-NEXT: ldp q1, q3, [x1]
-; CHECK-NEXT: mov z0.b, z0.b[15]
-; CHECK-NEXT: mov z2.b, z1.b[15]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: insr z1.b, w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: insr z3.b, w8
-; CHECK-NEXT: stp q1, q3, [x0]
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: ldrb w8, [x0, #31]
+; CHECK-NEXT: mov z1.b, z0.b[15]
+; CHECK-NEXT: insr z0.b, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: insr z2.b, w8
+; CHECK-NEXT: stp q0, q2, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
@@ -238,15 +236,13 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
; CHECK-LABEL: shuffle_ext_byone_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0, #16]
-; CHECK-NEXT: ldp q1, q3, [x1]
-; CHECK-NEXT: mov z0.h, z0.h[7]
-; CHECK-NEXT: mov z2.h, z1.h[7]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: insr z1.h, w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: insr z3.h, w8
-; CHECK-NEXT: stp q1, q3, [x0]
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: ldrh w8, [x0, #30]
+; CHECK-NEXT: mov z1.h, z0.h[7]
+; CHECK-NEXT: insr z0.h, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: insr z2.h, w8
+; CHECK-NEXT: stp q0, q2, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
@@ -341,15 +337,13 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
; CHECK-LABEL: shuffle_ext_byone_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0, #16]
-; CHECK-NEXT: ldp q1, q3, [x1]
-; CHECK-NEXT: mov z0.s, z0.s[3]
-; CHECK-NEXT: mov z2.s, z1.s[3]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: insr z1.s, w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: insr z3.s, w8
-; CHECK-NEXT: stp q1, q3, [x0]
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: ldr w8, [x0, #28]
+; CHECK-NEXT: mov z1.s, z0.s[3]
+; CHECK-NEXT: insr z0.s, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: insr z2.s, w8
+; CHECK-NEXT: stp q0, q2, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
@@ -409,15 +403,13 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
; CHECK-LABEL: shuffle_ext_byone_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0, #16]
-; CHECK-NEXT: ldp q1, q3, [x1]
-; CHECK-NEXT: mov z0.d, z0.d[1]
-; CHECK-NEXT: mov z2.d, z1.d[1]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: insr z1.d, x8
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: insr z3.d, x8
-; CHECK-NEXT: stp q1, q3, [x0]
+; CHECK-NEXT: ldp q0, q2, [x1]
+; CHECK-NEXT: ldr x8, [x0, #24]
+; CHECK-NEXT: mov z1.d, z0.d[1]
+; CHECK-NEXT: insr z0.d, x8
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: insr z2.d, x8
+; CHECK-NEXT: stp q0, q2, [x0]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64: