[llvm] [AArch64] Custom lower v4i8 subreg extract. (PR #133438)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 28 05:43:34 PDT 2025
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/133438
A v4i8 extract will usually be scalarized. This patch prevents that during lowering by converting it to an anyext followed by a larger v4i16 subvector extract. There are a few minor regressions that are fixed up in the last patch of this series.
The fold in DAGCombine was not handling the extended BUILDVECTORs that we see when i8/i16 are not legal types. Using isConstOrConstSplat(N1, /*AllowUndefs=*/false, /*AllowTruncation=*/true) allows it to match truncated constants. The other changes in visitAND are there to make sure that truncated values in N1C are treated correctly; the fold we are mostly interested in is:
```
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && N1C &&
ISD::isExtOpcode(N0.getOperand(0).getOpcode())) {
```
I couldn't find any other places where this triggered, as buildvectors are usually optimized before type legalization.
This also currently includes #133433.
From f61ef9c6ac5267a2cabcb0b8c68abfc17513bbff Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 28 Mar 2025 08:55:26 +0000
Subject: [PATCH 1/3] [AArch64] Prefer zip over ushll for anyext.
Many CPUs have a higher throughput of ZIP instructions vs USHLL. This adds some
tablegen patterns for preferring zip in anyext patterns.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 17 +++++++++++++++++
llvm/test/CodeGen/AArch64/andorxor.ll | 12 ++++++------
.../CodeGen/AArch64/bitcast-promote-widen.ll | 3 +--
llvm/test/CodeGen/AArch64/bitcast.ll | 3 +--
llvm/test/CodeGen/AArch64/extbinopload.ll | 4 ++--
.../CodeGen/AArch64/extract-subvec-combine.ll | 19 +++++++++++++------
llvm/test/CodeGen/AArch64/neon-bitcast.ll | 5 ++---
.../sve-fixed-length-extract-subvector.ll | 4 +---
.../AArch64/vec3-loads-ext-trunc-stores.ll | 2 +-
llvm/test/CodeGen/AArch64/zext.ll | 2 +-
10 files changed, 45 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c61e3a613f6f..f291589e04c6b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors
(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
(UZP2v4i32 V128:$Vn, V128:$Vm)>;
+// extract_subvec(anyext) can use zip. Check for one use on the anyext, otherwise
+// the extract_subvector can be free.
+let HasOneUse = 1 in
+def anyext_oneuse: PatFrag<(ops node:$src0), (anyext $src0)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 0))),
+ (ZIP1v8i8 V64:$Vn, V64:$Vn)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 0))),
+ (ZIP1v4i16 V64:$Vn, V64:$Vn)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 0))),
+ (ZIP1v2i32 V64:$Vn, V64:$Vn)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 4))),
+ (ZIP2v8i8 V64:$Vn, V64:$Vn)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 2))),
+ (ZIP2v4i16 V64:$Vn, V64:$Vn)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1))),
+ (ZIP2v2i32 V64:$Vn, V64:$Vn)>;
+
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 24f2549cce785..0384848082caa 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -433,8 +433,8 @@ define void @and_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
@@ -482,8 +482,8 @@ define void @or_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
@@ -531,8 +531,8 @@ define void @xor_v4i8(ptr %p1, ptr %p2) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: str s0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
index 864ddc2967c18..90fa294505c84 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -6,8 +6,7 @@
define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
; CHECK-LABEL: bitcast_v2i16_v2f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%y = bitcast <2 x half> %x to <2 x i16>
ret <2 x i16> %y
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index d9199ce2c79de..d54cc4adb81b3 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -125,8 +125,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bitcast_i32_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 72f4d58a425e7..82114d60c4a93 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -649,7 +649,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: str s1, [x4]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-NEXT: ldr s0, [x2]
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: umov w9, v2.h[0]
@@ -659,7 +659,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: mov v0.b[9], w10
; CHECK-NEXT: umov w10, v2.h[3]
; CHECK-NEXT: ldr s2, [x1]
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b
; CHECK-NEXT: mov v0.b[10], w9
; CHECK-NEXT: add x9, x1, #4
; CHECK-NEXT: mov v1.d[1], v2.d[0]
diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
index 75d55773b3681..368103bf2f2fe 100644
--- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
+++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll
@@ -104,12 +104,19 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind {
; Negative test, combine should not fire if sign extension is for a different width.
define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind {
-; CHECK-LABEL: sext_extract_zext_idx0_negtest:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: shl v0.2s, v0.2s, #17
-; CHECK-NEXT: sshr v0.2s, v0.2s, #17
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_extract_zext_idx0_negtest:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #17
+; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_extract_zext_idx0_negtest:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: shl v0.2s, v0.2s, #17
+; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-GI-NEXT: ret
%zext = zext <4 x i16> %vec to <4 x i32>
%extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0)
%sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17>
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
index d06612e2332e6..07772b716ec58 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -518,15 +518,14 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
; CHECK-LE-LABEL: bitcast_i32_to_v2i16:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: fmov s0, w0
-; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: bitcast_i32_to_v2i16:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: fmov s0, w0
; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
-; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%ret = bitcast i32 %word to <2 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 8fac0e1067684..bda7ff9115e09 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -88,9 +88,7 @@ define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: zip2 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index b52cbfe08156b..45b7a2759b0b3 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -303,7 +303,7 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: and v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index e40b9cb5c8482..962486afa3bb8 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -447,7 +447,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff
; CHECK-SD-NEXT: dup v2.2d, x8
; CHECK-SD-NEXT: mov v0.s[1], w1
-; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0
+; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b
From 5ebd1c538723201e66b25d55045fd75875b77abf Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 28 Mar 2025 12:38:11 +0000
Subject: [PATCH 2/3] [AArch64] Custom lower v4i8 subreg extract.
A v4i8 extract will usually be scalarized. This prevents that during lowering,
converting it to an anyext and larger v4i16 subvector extract. There are a few
minor regressions that are fixed up in a followup.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 14 +-
llvm/test/CodeGen/AArch64/aarch64-load-ext.ll | 9 +-
.../aarch64-neon-vector-insert-uaddlv.ll | 14 +-
llvm/test/CodeGen/AArch64/add.ll | 4 +-
llvm/test/CodeGen/AArch64/andorxor.ll | 6 +-
llvm/test/CodeGen/AArch64/ctlz.ll | 4 +-
llvm/test/CodeGen/AArch64/extbinopload.ll | 4 +-
llvm/test/CodeGen/AArch64/insert-subvector.ll | 2 +-
llvm/test/CodeGen/AArch64/itofp.ll | 814 +++++++-----------
llvm/test/CodeGen/AArch64/load.ll | 7 +-
llvm/test/CodeGen/AArch64/mul.ll | 6 +-
llvm/test/CodeGen/AArch64/sub.ll | 4 +-
.../AArch64/sve-fixed-length-masked-gather.ll | 18 +-
.../sve-fixed-length-masked-scatter.ll | 19 +-
llvm/test/CodeGen/AArch64/vector-fcvt.ll | 308 +++----
15 files changed, 477 insertions(+), 756 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1c8e3afdfd718..2ef1d9a6bff23 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1426,6 +1426,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i8, Custom);
+
setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
@@ -27309,12 +27311,22 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
+ SDLoc DL(N);
+
+ if (N->getValueType(0) == MVT::v4i8 &&
+ N->getOperand(0).getValueType() == MVT::v8i8 &&
+ (N->getConstantOperandVal(1) == 0 || N->getConstantOperandVal(1) == 4)) {
+ SDValue Ext =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::v8i16, N->getOperand(0));
+ Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
+ N->getOperand(1));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i8, Ext));
+ }
// Common code will handle these just fine.
if (!InVT.isScalableVector() || !InVT.isInteger())
return;
- SDLoc DL(N);
EVT VT = N->getValueType(0);
// The following checks bail if this is not a halving operation.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index 317feb5ad9ad0..1818307be2e10 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -273,18 +273,15 @@ define <3 x i16> @fsext_v3i16(ptr %a) {
; CHECK-LE-LABEL: fsext_v3i16:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: ldr s0, [x0]
-; CHECK-LE-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-LE-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-LE-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v3i16:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: ldr s0, [x0]
; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
-; CHECK-BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-BE-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-BE-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
; CHECK-BE-NEXT: ret
%x = load <3 x i8>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 91eda8d552397..412f39f8adc1b 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -281,7 +281,8 @@ define void @insert_vec_v16i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.8b h1, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
-; CHECK-NEXT: mov.h v2[0], v1[0]
+; CHECK-NEXT: mov.b v2[0], v1[0]
+; CHECK-NEXT: zip1.8b v2, v2, v2
; CHECK-NEXT: bic.4h v2, #255, lsl #8
; CHECK-NEXT: ushll.4s v2, v2, #0
; CHECK-NEXT: ucvtf.4s v2, v2
@@ -303,8 +304,9 @@ define void @insert_vec_v8i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: uaddlv.8b h1, v0
-; CHECK-NEXT: mov.h v0[0], v1[0]
-; CHECK-NEXT: bic.4h v0, #7, lsl #8
+; CHECK-NEXT: mov.b v0[0], v1[0]
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
@@ -433,7 +435,8 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d0, v0
-; CHECK-NEXT: mov.h v1[0], v0[0]
+; CHECK-NEXT: mov.b v1[0], v0[0]
+; CHECK-NEXT: zip1.8b v1, v1, v1
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
@@ -457,7 +460,8 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
-; CHECK-NEXT: mov.h v1[0], v0[0]
+; CHECK-NEXT: mov.b v1[0], v0[0]
+; CHECK-NEXT: zip1.8b v1, v1, v1
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index fc0ba336b21cc..4ecd17ae3b9af 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -97,9 +97,7 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: umov w8, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 0384848082caa..439351cf0d2ac 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -292,7 +292,7 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: umov w8, v0.h[2]
@@ -340,7 +340,7 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: umov w8, v0.h[2]
@@ -388,7 +388,7 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: umov w8, v0.h[2]
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 742433c50d390..afdeff06fdef6 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -42,9 +42,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: movi v0.4h, #8
; CHECK-SD-NEXT: ldr s1, [x0]
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: movi v0.4h, #8
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT: clz v1.4h, v1.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 82114d60c4a93..47de51c6cdc0d 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -1366,11 +1366,11 @@ define <4 x i32> @atomic(ptr %p) {
; CHECK-LABEL: atomic:
; CHECK: // %bb.0:
; CHECK-NEXT: ldar w8, [x0]
-; CHECK-NEXT: movi v0.2d, #0x0000ff000000ff
; CHECK-NEXT: ldr s1, [x0, #4]
+; CHECK-NEXT: movi v0.2d, #0x0000ff000000ff
; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: zip1 v2.8b, v2.8b, v0.8b
+; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b
; CHECK-NEXT: ushll v1.4s, v1.4h, #3
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index 6828fa9f1508c..a91ff58d9e99a 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -465,7 +465,7 @@ define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
+; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 07957c117868d..e4fb2b7c2a3c7 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -3442,39 +3442,27 @@ entry:
define <8 x double> @stofp_v8i8_v8f64(<8 x i8> %a) {
; CHECK-SD-LABEL: stofp_v8i8_v8f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: umov w8, v0.b[0]
-; CHECK-SD-NEXT: umov w9, v0.b[2]
-; CHECK-SD-NEXT: umov w11, v0.b[4]
-; CHECK-SD-NEXT: umov w12, v0.b[6]
-; CHECK-SD-NEXT: umov w10, v0.b[1]
-; CHECK-SD-NEXT: umov w13, v0.b[3]
-; CHECK-SD-NEXT: umov w14, v0.b[5]
-; CHECK-SD-NEXT: umov w15, v0.b[7]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: fmov s2, w11
-; CHECK-SD-NEXT: fmov s3, w12
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w13
-; CHECK-SD-NEXT: mov v2.s[1], w14
-; CHECK-SD-NEXT: mov v3.s[1], w15
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
+; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: shl v2.2s, v2.2s, #24
; CHECK-SD-NEXT: shl v3.2s, v3.2s, #24
-; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: sshr v2.2s, v2.2s, #24
; CHECK-SD-NEXT: sshr v3.2s, v3.2s, #24
-; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT: sshll v3.2d, v3.2s, #0
; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v3.2d, v3.2d
+; CHECK-SD-NEXT: sshll v4.2d, v2.2s, #0
+; CHECK-SD-NEXT: sshll v5.2d, v3.2s, #0
+; CHECK-SD-NEXT: scvtf v2.2d, v1.2d
+; CHECK-SD-NEXT: scvtf v3.2d, v4.2d
+; CHECK-SD-NEXT: scvtf v1.2d, v5.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v8i8_v8f64:
@@ -3499,36 +3487,24 @@ entry:
define <8 x double> @utofp_v8i8_v8f64(<8 x i8> %a) {
; CHECK-SD-LABEL: utofp_v8i8_v8f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: umov w8, v0.b[0]
-; CHECK-SD-NEXT: umov w9, v0.b[2]
-; CHECK-SD-NEXT: umov w11, v0.b[4]
-; CHECK-SD-NEXT: umov w12, v0.b[6]
-; CHECK-SD-NEXT: umov w10, v0.b[1]
-; CHECK-SD-NEXT: umov w13, v0.b[3]
-; CHECK-SD-NEXT: umov w14, v0.b[5]
-; CHECK-SD-NEXT: umov w15, v0.b[7]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s2, w9
-; CHECK-SD-NEXT: fmov s3, w11
-; CHECK-SD-NEXT: fmov s4, w12
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v2.s[1], w13
-; CHECK-SD-NEXT: mov v3.s[1], w14
-; CHECK-SD-NEXT: mov v4.s[1], w15
+; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-SD-NEXT: ext v4.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: and v2.8b, v2.8b, v1.8b
+; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: and v3.8b, v3.8b, v1.8b
; CHECK-SD-NEXT: and v1.8b, v4.8b, v1.8b
-; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT: ushll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT: ushll v4.2d, v1.2s, #0
; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ucvtf v1.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v3.2d
-; CHECK-SD-NEXT: ucvtf v3.2d, v4.2d
+; CHECK-SD-NEXT: ushll v3.2d, v3.2s, #0
+; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-SD-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v8i8_v8f64:
@@ -3553,71 +3529,48 @@ entry:
define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) {
; CHECK-SD-LABEL: stofp_v16i8_v16f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umov w8, v0.b[0]
-; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: umov w9, v0.b[1]
-; CHECK-SD-NEXT: umov w10, v0.b[2]
-; CHECK-SD-NEXT: umov w12, v0.b[4]
-; CHECK-SD-NEXT: umov w14, v0.b[6]
-; CHECK-SD-NEXT: umov w11, v0.b[3]
-; CHECK-SD-NEXT: umov w13, v0.b[5]
-; CHECK-SD-NEXT: fmov s2, w8
-; CHECK-SD-NEXT: umov w15, v1.b[0]
-; CHECK-SD-NEXT: umov w17, v1.b[2]
-; CHECK-SD-NEXT: umov w0, v1.b[4]
-; CHECK-SD-NEXT: umov w16, v1.b[1]
-; CHECK-SD-NEXT: umov w18, v1.b[3]
-; CHECK-SD-NEXT: umov w8, v0.b[7]
-; CHECK-SD-NEXT: fmov s0, w10
-; CHECK-SD-NEXT: umov w10, v1.b[5]
-; CHECK-SD-NEXT: mov v2.s[1], w9
-; CHECK-SD-NEXT: umov w9, v1.b[6]
-; CHECK-SD-NEXT: fmov s3, w12
-; CHECK-SD-NEXT: umov w12, v1.b[7]
-; CHECK-SD-NEXT: fmov s1, w14
-; CHECK-SD-NEXT: fmov s4, w15
-; CHECK-SD-NEXT: fmov s5, w17
-; CHECK-SD-NEXT: fmov s6, w0
-; CHECK-SD-NEXT: mov v0.s[1], w11
-; CHECK-SD-NEXT: mov v3.s[1], w13
-; CHECK-SD-NEXT: fmov s7, w9
-; CHECK-SD-NEXT: mov v1.s[1], w8
-; CHECK-SD-NEXT: mov v4.s[1], w16
-; CHECK-SD-NEXT: mov v5.s[1], w18
-; CHECK-SD-NEXT: mov v6.s[1], w10
+; CHECK-SD-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-SD-NEXT: ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT: ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-SD-NEXT: ext v7.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: shl v2.2s, v2.2s, #24
+; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
-; CHECK-SD-NEXT: mov v7.s[1], w12
; CHECK-SD-NEXT: shl v3.2s, v3.2s, #24
-; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v4.2s, v4.2s, #24
-; CHECK-SD-NEXT: sshr v2.2s, v2.2s, #24
; CHECK-SD-NEXT: shl v5.2s, v5.2s, #24
; CHECK-SD-NEXT: shl v6.2s, v6.2s, #24
+; CHECK-SD-NEXT: shl v7.2s, v7.2s, #24
+; CHECK-SD-NEXT: sshr v2.2s, v2.2s, #24
+; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24
; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sshr v3.2s, v3.2s, #24
-; CHECK-SD-NEXT: shl v7.2s, v7.2s, #24
; CHECK-SD-NEXT: sshr v4.2s, v4.2s, #24
-; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24
; CHECK-SD-NEXT: sshr v5.2s, v5.2s, #24
; CHECK-SD-NEXT: sshr v6.2s, v6.2s, #24
+; CHECK-SD-NEXT: sshr v7.2s, v7.2s, #24
; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-SD-NEXT: sshll v16.2d, v0.2s, #0
-; CHECK-SD-NEXT: sshll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT: sshr v7.2s, v7.2s, #24
-; CHECK-SD-NEXT: sshll v4.2d, v4.2s, #0
-; CHECK-SD-NEXT: sshll v17.2d, v1.2s, #0
+; CHECK-SD-NEXT: sshll v18.2d, v3.2s, #0
+; CHECK-SD-NEXT: sshll v17.2d, v4.2s, #0
; CHECK-SD-NEXT: sshll v5.2d, v5.2s, #0
-; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT: scvtf v0.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v16.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v3.2d
-; CHECK-SD-NEXT: sshll v7.2d, v7.2s, #0
-; CHECK-SD-NEXT: scvtf v4.2d, v4.2d
+; CHECK-SD-NEXT: sshll v19.2d, v6.2s, #0
+; CHECK-SD-NEXT: sshll v20.2d, v7.2s, #0
+; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT: scvtf v0.2d, v1.2d
+; CHECK-SD-NEXT: scvtf v4.2d, v16.2d
+; CHECK-SD-NEXT: scvtf v6.2d, v18.2d
; CHECK-SD-NEXT: scvtf v3.2d, v17.2d
-; CHECK-SD-NEXT: scvtf v5.2d, v5.2d
-; CHECK-SD-NEXT: scvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: scvtf v7.2d, v7.2d
+; CHECK-SD-NEXT: scvtf v1.2d, v5.2d
+; CHECK-SD-NEXT: scvtf v7.2d, v19.2d
+; CHECK-SD-NEXT: scvtf v5.2d, v20.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v16i8_v16f64:
@@ -3653,64 +3606,41 @@ entry:
define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) {
; CHECK-SD-LABEL: utofp_v16i8_v16f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: umov w8, v0.b[0]
-; CHECK-SD-NEXT: umov w10, v0.b[2]
-; CHECK-SD-NEXT: umov w9, v0.b[1]
-; CHECK-SD-NEXT: umov w12, v0.b[4]
-; CHECK-SD-NEXT: umov w11, v0.b[3]
-; CHECK-SD-NEXT: umov w13, v0.b[5]
-; CHECK-SD-NEXT: umov w18, v0.b[6]
+; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT: umov w14, v2.b[0]
-; CHECK-SD-NEXT: umov w16, v2.b[2]
-; CHECK-SD-NEXT: umov w0, v2.b[4]
-; CHECK-SD-NEXT: fmov s3, w8
-; CHECK-SD-NEXT: umov w8, v0.b[7]
-; CHECK-SD-NEXT: fmov s0, w10
-; CHECK-SD-NEXT: umov w10, v2.b[6]
-; CHECK-SD-NEXT: umov w15, v2.b[1]
-; CHECK-SD-NEXT: umov w17, v2.b[3]
-; CHECK-SD-NEXT: fmov s4, w12
-; CHECK-SD-NEXT: umov w12, v2.b[5]
-; CHECK-SD-NEXT: fmov s7, w18
-; CHECK-SD-NEXT: mov v3.s[1], w9
-; CHECK-SD-NEXT: umov w9, v2.b[7]
-; CHECK-SD-NEXT: fmov s2, w14
-; CHECK-SD-NEXT: fmov s5, w16
-; CHECK-SD-NEXT: fmov s6, w0
-; CHECK-SD-NEXT: mov v0.s[1], w11
-; CHECK-SD-NEXT: fmov s16, w10
-; CHECK-SD-NEXT: mov v4.s[1], w13
-; CHECK-SD-NEXT: mov v7.s[1], w8
-; CHECK-SD-NEXT: mov v2.s[1], w15
-; CHECK-SD-NEXT: mov v5.s[1], w17
-; CHECK-SD-NEXT: mov v6.s[1], w12
+; CHECK-SD-NEXT: ushll2 v3.4s, v2.8h, #0
+; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ext v5.16b, v3.16b, v3.16b, #8
+; CHECK-SD-NEXT: ext v6.16b, v2.16b, v2.16b, #8
+; CHECK-SD-NEXT: ext v7.16b, v4.16b, v4.16b, #8
+; CHECK-SD-NEXT: ext v16.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: and v3.8b, v3.8b, v1.8b
-; CHECK-SD-NEXT: mov v16.s[1], w9
+; CHECK-SD-NEXT: and v2.8b, v2.8b, v1.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: and v4.8b, v4.8b, v1.8b
-; CHECK-SD-NEXT: and v7.8b, v7.8b, v1.8b
-; CHECK-SD-NEXT: and v2.8b, v2.8b, v1.8b
-; CHECK-SD-NEXT: ushll v3.2d, v3.2s, #0
; CHECK-SD-NEXT: and v5.8b, v5.8b, v1.8b
; CHECK-SD-NEXT: and v6.8b, v6.8b, v1.8b
+; CHECK-SD-NEXT: and v7.8b, v7.8b, v1.8b
; CHECK-SD-NEXT: and v1.8b, v16.8b, v1.8b
-; CHECK-SD-NEXT: ushll v16.2d, v0.2s, #0
-; CHECK-SD-NEXT: ushll v17.2d, v4.2s, #0
-; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0
-; CHECK-SD-NEXT: ucvtf v0.2d, v3.2d
+; CHECK-SD-NEXT: ushll v3.2d, v3.2s, #0
+; CHECK-SD-NEXT: ushll v16.2d, v2.2s, #0
+; CHECK-SD-NEXT: ushll v17.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll v18.2d, v4.2s, #0
; CHECK-SD-NEXT: ushll v5.2d, v5.2s, #0
; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT: ushll v18.2d, v1.2s, #0
-; CHECK-SD-NEXT: ucvtf v1.2d, v16.2d
-; CHECK-SD-NEXT: ucvtf v4.2d, v2.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v17.2d
-; CHECK-SD-NEXT: ucvtf v3.2d, v7.2d
-; CHECK-SD-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT: ucvtf v7.2d, v18.2d
+; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT: ushll v19.2d, v1.2s, #0
+; CHECK-SD-NEXT: ucvtf v2.2d, v3.2d
+; CHECK-SD-NEXT: ucvtf v0.2d, v16.2d
+; CHECK-SD-NEXT: ucvtf v4.2d, v17.2d
+; CHECK-SD-NEXT: ucvtf v3.2d, v5.2d
+; CHECK-SD-NEXT: ucvtf v1.2d, v6.2d
+; CHECK-SD-NEXT: ucvtf v6.2d, v18.2d
+; CHECK-SD-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-SD-NEXT: ucvtf v5.2d, v19.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v16i8_v16f64:
@@ -3746,144 +3676,98 @@ entry:
define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-LABEL: stofp_v32i8_v32f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: umov w9, v3.b[0]
-; CHECK-SD-NEXT: umov w11, v3.b[4]
-; CHECK-SD-NEXT: umov w13, v3.b[6]
-; CHECK-SD-NEXT: umov w18, v2.b[2]
-; CHECK-SD-NEXT: umov w10, v3.b[2]
-; CHECK-SD-NEXT: umov w12, v3.b[1]
-; CHECK-SD-NEXT: umov w16, v2.b[0]
-; CHECK-SD-NEXT: umov w14, v3.b[3]
-; CHECK-SD-NEXT: umov w15, v3.b[5]
-; CHECK-SD-NEXT: umov w17, v3.b[7]
-; CHECK-SD-NEXT: fmov s6, w9
-; CHECK-SD-NEXT: fmov s5, w11
-; CHECK-SD-NEXT: fmov s7, w13
-; CHECK-SD-NEXT: umov w13, v2.b[4]
-; CHECK-SD-NEXT: umov w11, v2.b[3]
-; CHECK-SD-NEXT: umov w9, v2.b[6]
-; CHECK-SD-NEXT: fmov s17, w18
-; CHECK-SD-NEXT: fmov s4, w10
-; CHECK-SD-NEXT: umov w10, v2.b[1]
-; CHECK-SD-NEXT: mov v6.s[1], w12
-; CHECK-SD-NEXT: fmov s3, w16
-; CHECK-SD-NEXT: umov w12, v2.b[5]
-; CHECK-SD-NEXT: mov v5.s[1], w15
-; CHECK-SD-NEXT: umov w15, v1.b[0]
-; CHECK-SD-NEXT: umov w16, v0.b[6]
-; CHECK-SD-NEXT: fmov s16, w13
-; CHECK-SD-NEXT: umov w13, v1.b[2]
-; CHECK-SD-NEXT: mov v17.s[1], w11
-; CHECK-SD-NEXT: umov w11, v1.b[6]
-; CHECK-SD-NEXT: fmov s18, w9
-; CHECK-SD-NEXT: umov w9, v1.b[4]
-; CHECK-SD-NEXT: mov v3.s[1], w10
-; CHECK-SD-NEXT: umov w10, v0.b[0]
-; CHECK-SD-NEXT: mov v4.s[1], w14
-; CHECK-SD-NEXT: mov v16.s[1], w12
-; CHECK-SD-NEXT: umov w12, v1.b[7]
-; CHECK-SD-NEXT: umov w14, v1.b[5]
-; CHECK-SD-NEXT: fmov s20, w13
-; CHECK-SD-NEXT: umov w13, v1.b[3]
-; CHECK-SD-NEXT: fmov s22, w15
-; CHECK-SD-NEXT: fmov s21, w11
-; CHECK-SD-NEXT: umov w11, v1.b[1]
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: fmov s19, w10
-; CHECK-SD-NEXT: umov w10, v0.b[4]
-; CHECK-SD-NEXT: umov w9, v0.b[7]
-; CHECK-SD-NEXT: fmov s23, w16
-; CHECK-SD-NEXT: mov v7.s[1], w17
+; CHECK-SD-NEXT: ushll2 v2.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll2 v3.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v4.4s, v2.8h, #0
+; CHECK-SD-NEXT: ushll2 v5.4s, v3.8h, #0
+; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll v16.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT: ushll v7.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT: ext v6.16b, v4.16b, v4.16b, #8
+; CHECK-SD-NEXT: ext v17.16b, v5.16b, v5.16b, #8
+; CHECK-SD-NEXT: shl v4.2s, v4.2s, #24
+; CHECK-SD-NEXT: ext v19.16b, v2.16b, v2.16b, #8
+; CHECK-SD-NEXT: shl v5.2s, v5.2s, #24
+; CHECK-SD-NEXT: shl v2.2s, v2.2s, #24
+; CHECK-SD-NEXT: ext v23.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT: ext v21.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: ext v22.16b, v3.16b, v3.16b, #8
+; CHECK-SD-NEXT: sshr v4.2s, v4.2s, #24
+; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
+; CHECK-SD-NEXT: ext v18.16b, v7.16b, v7.16b, #8
+; CHECK-SD-NEXT: shl v6.2s, v6.2s, #24
; CHECK-SD-NEXT: shl v17.2s, v17.2s, #24
-; CHECK-SD-NEXT: mov v21.s[1], w12
-; CHECK-SD-NEXT: mov v1.s[1], w14
-; CHECK-SD-NEXT: umov w14, v0.b[5]
-; CHECK-SD-NEXT: umov w12, v0.b[1]
-; CHECK-SD-NEXT: mov v20.s[1], w13
-; CHECK-SD-NEXT: umov w13, v0.b[2]
-; CHECK-SD-NEXT: mov v22.s[1], w11
-; CHECK-SD-NEXT: umov w11, v0.b[3]
-; CHECK-SD-NEXT: fmov s0, w10
-; CHECK-SD-NEXT: mov v23.s[1], w9
-; CHECK-SD-NEXT: umov w9, v2.b[7]
-; CHECK-SD-NEXT: shl v16.2s, v16.2s, #24
+; CHECK-SD-NEXT: sshr v5.2s, v5.2s, #24
+; CHECK-SD-NEXT: shl v19.2s, v19.2s, #24
+; CHECK-SD-NEXT: sshr v2.2s, v2.2s, #24
+; CHECK-SD-NEXT: ext v20.16b, v16.16b, v16.16b, #8
+; CHECK-SD-NEXT: sshll v4.2d, v4.2s, #0
+; CHECK-SD-NEXT: shl v23.2s, v23.2s, #24
; CHECK-SD-NEXT: shl v21.2s, v21.2s, #24
-; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
+; CHECK-SD-NEXT: sshr v6.2s, v6.2s, #24
; CHECK-SD-NEXT: sshr v17.2s, v17.2s, #24
-; CHECK-SD-NEXT: mov v0.s[1], w14
-; CHECK-SD-NEXT: fmov s24, w13
-; CHECK-SD-NEXT: mov v19.s[1], w12
-; CHECK-SD-NEXT: sshr v16.2s, v16.2s, #24
-; CHECK-SD-NEXT: shl v6.2s, v6.2s, #24
-; CHECK-SD-NEXT: shl v20.2s, v20.2s, #24
-; CHECK-SD-NEXT: sshr v21.2s, v21.2s, #24
-; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24
-; CHECK-SD-NEXT: shl v2.2s, v23.2s, #24
-; CHECK-SD-NEXT: mov v18.s[1], w9
-; CHECK-SD-NEXT: mov v24.s[1], w11
+; CHECK-SD-NEXT: sshll v5.2d, v5.2s, #0
+; CHECK-SD-NEXT: sshr v19.2s, v19.2s, #24
+; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
; CHECK-SD-NEXT: shl v22.2s, v22.2s, #24
+; CHECK-SD-NEXT: scvtf v4.2d, v4.2d
+; CHECK-SD-NEXT: sshr v23.2s, v23.2s, #24
+; CHECK-SD-NEXT: shl v3.2s, v3.2s, #24
+; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT: sshll v17.2d, v17.2s, #0
+; CHECK-SD-NEXT: scvtf v5.2d, v5.2d
+; CHECK-SD-NEXT: sshll v19.2d, v19.2s, #0
+; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
-; CHECK-SD-NEXT: shl v4.2s, v4.2s, #24
-; CHECK-SD-NEXT: shl v5.2s, v5.2s, #24
-; CHECK-SD-NEXT: sshll v21.2d, v21.2s, #0
-; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT: sshr v2.2s, v2.2s, #24
+; CHECK-SD-NEXT: sshr v3.2s, v3.2s, #24
; CHECK-SD-NEXT: shl v7.2s, v7.2s, #24
-; CHECK-SD-NEXT: shl v19.2s, v19.2s, #24
-; CHECK-SD-NEXT: sshr v20.2s, v20.2s, #24
-; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT: scvtf v6.2d, v6.2d
+; CHECK-SD-NEXT: scvtf v17.2d, v17.2d
; CHECK-SD-NEXT: shl v18.2s, v18.2s, #24
-; CHECK-SD-NEXT: shl v23.2s, v24.2s, #24
-; CHECK-SD-NEXT: scvtf v21.2d, v21.2d
+; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT: shl v20.2s, v20.2s, #24
+; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT: sshll v3.2d, v3.2s, #0
+; CHECK-SD-NEXT: sshr v7.2s, v7.2s, #24
+; CHECK-SD-NEXT: stp q5, q17, [x8, #96]
+; CHECK-SD-NEXT: shl v5.2s, v16.2s, #24
+; CHECK-SD-NEXT: sshll v16.2d, v23.2s, #0
+; CHECK-SD-NEXT: stp q4, q6, [x8, #224]
+; CHECK-SD-NEXT: scvtf v4.2d, v19.2d
+; CHECK-SD-NEXT: sshr v6.2s, v22.2s, #24
; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT: sshr v22.2s, v22.2s, #24
-; CHECK-SD-NEXT: sshr v19.2s, v19.2s, #24
-; CHECK-SD-NEXT: sshr v5.2s, v5.2s, #24
; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT: sshr v23.2s, v23.2s, #24
-; CHECK-SD-NEXT: sshr v4.2s, v4.2s, #24
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: sshr v6.2s, v6.2s, #24
-; CHECK-SD-NEXT: sshll v20.2d, v20.2s, #0
-; CHECK-SD-NEXT: stp q1, q21, [x8, #160]
-; CHECK-SD-NEXT: shl v1.2s, v3.2s, #24
-; CHECK-SD-NEXT: sshr v3.2s, v18.2s, #24
+; CHECK-SD-NEXT: sshr v17.2s, v18.2s, #24
+; CHECK-SD-NEXT: scvtf v16.2d, v16.2d
+; CHECK-SD-NEXT: sshr v5.2s, v5.2s, #24
+; CHECK-SD-NEXT: scvtf v3.2d, v3.2d
+; CHECK-SD-NEXT: stp q2, q4, [x8, #192]
+; CHECK-SD-NEXT: sshr v2.2s, v21.2s, #24
+; CHECK-SD-NEXT: sshll v4.2d, v6.2s, #0
+; CHECK-SD-NEXT: sshr v6.2s, v20.2s, #24
; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: sshll v22.2d, v22.2s, #0
-; CHECK-SD-NEXT: sshll v18.2d, v23.2s, #0
-; CHECK-SD-NEXT: sshll v19.2d, v19.2s, #0
; CHECK-SD-NEXT: sshll v5.2d, v5.2s, #0
-; CHECK-SD-NEXT: sshll v4.2d, v4.2s, #0
-; CHECK-SD-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-SD-NEXT: stp q1, q16, [x8, #160]
+; CHECK-SD-NEXT: sshll v1.2d, v7.2s, #0
+; CHECK-SD-NEXT: sshll v7.2d, v17.2s, #0
+; CHECK-SD-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: scvtf v4.2d, v4.2d
; CHECK-SD-NEXT: sshll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT: scvtf v20.2d, v20.2d
-; CHECK-SD-NEXT: scvtf v22.2d, v22.2d
-; CHECK-SD-NEXT: stp q0, q2, [x8, #32]
-; CHECK-SD-NEXT: sshll v2.2d, v3.2s, #0
-; CHECK-SD-NEXT: sshll v3.2d, v16.2s, #0
-; CHECK-SD-NEXT: sshll v16.2d, v17.2s, #0
-; CHECK-SD-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT: sshr v0.2s, v7.2s, #24
-; CHECK-SD-NEXT: scvtf v7.2d, v18.2d
-; CHECK-SD-NEXT: scvtf v17.2d, v19.2d
-; CHECK-SD-NEXT: stp q22, q20, [x8, #128]
-; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: scvtf v3.2d, v3.2d
-; CHECK-SD-NEXT: scvtf v16.2d, v16.2d
+; CHECK-SD-NEXT: scvtf v5.2d, v5.2d
; CHECK-SD-NEXT: scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT: stp q17, q7, [x8]
-; CHECK-SD-NEXT: stp q3, q2, [x8, #224]
-; CHECK-SD-NEXT: scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: scvtf v2.2d, v5.2d
-; CHECK-SD-NEXT: stp q1, q16, [x8, #192]
-; CHECK-SD-NEXT: scvtf v3.2d, v4.2d
-; CHECK-SD-NEXT: scvtf v1.2d, v6.2d
-; CHECK-SD-NEXT: stp q2, q0, [x8, #96]
-; CHECK-SD-NEXT: stp q1, q3, [x8, #64]
+; CHECK-SD-NEXT: scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT: stp q3, q4, [x8, #64]
+; CHECK-SD-NEXT: scvtf v3.2d, v6.2d
+; CHECK-SD-NEXT: stp q0, q2, [x8, #32]
+; CHECK-SD-NEXT: scvtf v0.2d, v7.2d
+; CHECK-SD-NEXT: stp q5, q3, [x8, #128]
+; CHECK-SD-NEXT: stp q1, q0, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v32i8_v32f64:
@@ -3949,129 +3833,83 @@ entry:
define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-LABEL: utofp_v32i8_v32f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: ext v16.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: movi d3, #0x0000ff000000ff
-; CHECK-SD-NEXT: umov w11, v2.b[0]
-; CHECK-SD-NEXT: umov w14, v2.b[4]
-; CHECK-SD-NEXT: umov w12, v2.b[2]
-; CHECK-SD-NEXT: umov w15, v2.b[6]
-; CHECK-SD-NEXT: umov w16, v2.b[1]
-; CHECK-SD-NEXT: umov w13, v16.b[2]
-; CHECK-SD-NEXT: umov w17, v16.b[0]
-; CHECK-SD-NEXT: umov w9, v2.b[3]
-; CHECK-SD-NEXT: umov w10, v2.b[5]
-; CHECK-SD-NEXT: fmov s4, w11
-; CHECK-SD-NEXT: fmov s6, w14
-; CHECK-SD-NEXT: umov w14, v16.b[6]
-; CHECK-SD-NEXT: fmov s5, w12
-; CHECK-SD-NEXT: umov w12, v16.b[4]
-; CHECK-SD-NEXT: umov w11, v16.b[1]
-; CHECK-SD-NEXT: fmov s7, w15
-; CHECK-SD-NEXT: umov w15, v16.b[7]
-; CHECK-SD-NEXT: fmov s18, w13
-; CHECK-SD-NEXT: mov v4.s[1], w16
-; CHECK-SD-NEXT: umov w16, v1.b[4]
-; CHECK-SD-NEXT: umov w13, v1.b[6]
-; CHECK-SD-NEXT: fmov s17, w17
-; CHECK-SD-NEXT: fmov s20, w14
-; CHECK-SD-NEXT: mov v5.s[1], w9
-; CHECK-SD-NEXT: umov w9, v1.b[7]
-; CHECK-SD-NEXT: fmov s19, w12
-; CHECK-SD-NEXT: mov v6.s[1], w10
-; CHECK-SD-NEXT: umov w10, v1.b[2]
-; CHECK-SD-NEXT: umov w12, v0.b[6]
-; CHECK-SD-NEXT: umov w14, v1.b[0]
-; CHECK-SD-NEXT: mov v17.s[1], w11
-; CHECK-SD-NEXT: mov v20.s[1], w15
-; CHECK-SD-NEXT: umov w11, v1.b[5]
-; CHECK-SD-NEXT: umov w15, v1.b[3]
-; CHECK-SD-NEXT: fmov s21, w16
-; CHECK-SD-NEXT: umov w16, v1.b[1]
-; CHECK-SD-NEXT: fmov s1, w13
-; CHECK-SD-NEXT: umov w13, v0.b[4]
-; CHECK-SD-NEXT: and v6.8b, v6.8b, v3.8b
-; CHECK-SD-NEXT: fmov s22, w10
-; CHECK-SD-NEXT: fmov s23, w12
-; CHECK-SD-NEXT: fmov s24, w14
-; CHECK-SD-NEXT: mov v21.s[1], w11
-; CHECK-SD-NEXT: umov w10, v0.b[5]
-; CHECK-SD-NEXT: umov w12, v0.b[0]
-; CHECK-SD-NEXT: mov v1.s[1], w9
-; CHECK-SD-NEXT: umov w9, v0.b[7]
-; CHECK-SD-NEXT: umov w11, v16.b[3]
-; CHECK-SD-NEXT: mov v22.s[1], w15
-; CHECK-SD-NEXT: mov v24.s[1], w16
-; CHECK-SD-NEXT: fmov s25, w13
-; CHECK-SD-NEXT: umov w13, v0.b[3]
-; CHECK-SD-NEXT: and v20.8b, v20.8b, v3.8b
-; CHECK-SD-NEXT: and v5.8b, v5.8b, v3.8b
-; CHECK-SD-NEXT: and v21.8b, v21.8b, v3.8b
-; CHECK-SD-NEXT: mov v23.s[1], w9
-; CHECK-SD-NEXT: umov w9, v0.b[2]
-; CHECK-SD-NEXT: and v1.8b, v1.8b, v3.8b
-; CHECK-SD-NEXT: mov v25.s[1], w10
-; CHECK-SD-NEXT: umov w10, v0.b[1]
-; CHECK-SD-NEXT: and v0.8b, v22.8b, v3.8b
-; CHECK-SD-NEXT: fmov s22, w12
-; CHECK-SD-NEXT: and v24.8b, v24.8b, v3.8b
-; CHECK-SD-NEXT: umov w12, v16.b[5]
+; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll2 v4.8h, v0.16b, #0
+; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v5.4s, v3.8h, #0
+; CHECK-SD-NEXT: ushll2 v6.4s, v4.8h, #0
+; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT: ushll v18.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT: ext v7.16b, v5.16b, v5.16b, #8
+; CHECK-SD-NEXT: ext v16.16b, v6.16b, v6.16b, #8
+; CHECK-SD-NEXT: ext v17.16b, v3.16b, v3.16b, #8
+; CHECK-SD-NEXT: and v5.8b, v5.8b, v2.8b
+; CHECK-SD-NEXT: and v6.8b, v6.8b, v2.8b
+; CHECK-SD-NEXT: and v3.8b, v3.8b, v2.8b
+; CHECK-SD-NEXT: ext v20.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT: ext v22.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: ext v23.16b, v4.16b, v4.16b, #8
+; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: ext v21.16b, v18.16b, v18.16b, #8
+; CHECK-SD-NEXT: and v4.8b, v4.8b, v2.8b
+; CHECK-SD-NEXT: and v7.8b, v7.8b, v2.8b
+; CHECK-SD-NEXT: ushll v5.2d, v5.2s, #0
+; CHECK-SD-NEXT: and v16.8b, v16.8b, v2.8b
+; CHECK-SD-NEXT: and v17.8b, v17.8b, v2.8b
+; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT: ushll v3.2d, v3.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT: ushll v21.2d, v21.2s, #0
-; CHECK-SD-NEXT: mov v18.s[1], w11
-; CHECK-SD-NEXT: and v16.8b, v23.8b, v3.8b
-; CHECK-SD-NEXT: fmov s23, w9
-; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT: and v25.8b, v25.8b, v3.8b
-; CHECK-SD-NEXT: ushll v24.2d, v24.2s, #0
-; CHECK-SD-NEXT: mov v22.s[1], w10
-; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: ucvtf v21.2d, v21.2d
-; CHECK-SD-NEXT: umov w9, v2.b[7]
-; CHECK-SD-NEXT: mov v23.s[1], w13
+; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT: ushll v4.2d, v4.2s, #0
+; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT: ucvtf v5.2d, v5.2d
; CHECK-SD-NEXT: ushll v16.2d, v16.2s, #0
-; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ushll v2.2d, v25.2s, #0
-; CHECK-SD-NEXT: mov v19.s[1], w12
-; CHECK-SD-NEXT: ucvtf v24.2d, v24.2d
-; CHECK-SD-NEXT: and v18.8b, v18.8b, v3.8b
-; CHECK-SD-NEXT: stp q21, q1, [x8, #160]
-; CHECK-SD-NEXT: and v1.8b, v22.8b, v3.8b
-; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d
-; CHECK-SD-NEXT: and v23.8b, v23.8b, v3.8b
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: mov v7.s[1], w9
-; CHECK-SD-NEXT: stp q24, q0, [x8, #128]
-; CHECK-SD-NEXT: and v0.8b, v19.8b, v3.8b
-; CHECK-SD-NEXT: ushll v18.2d, v18.2s, #0
-; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT: ushll v19.2d, v23.2s, #0
-; CHECK-SD-NEXT: stp q2, q16, [x8, #32]
-; CHECK-SD-NEXT: and v16.8b, v17.8b, v3.8b
-; CHECK-SD-NEXT: ushll v17.2d, v20.2s, #0
-; CHECK-SD-NEXT: and v2.8b, v7.8b, v3.8b
-; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll v17.2d, v17.2s, #0
+; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
+; CHECK-SD-NEXT: ucvtf v3.2d, v3.2d
; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: ucvtf v7.2d, v19.2d
-; CHECK-SD-NEXT: and v3.8b, v4.8b, v3.8b
-; CHECK-SD-NEXT: ushll v4.2d, v6.2s, #0
+; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT: ucvtf v4.2d, v4.2d
+; CHECK-SD-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d
; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d
-; CHECK-SD-NEXT: ushll v16.2d, v16.2s, #0
-; CHECK-SD-NEXT: ucvtf v6.2d, v18.2d
; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: stp q6, q16, [x8, #96]
+; CHECK-SD-NEXT: and v6.8b, v23.8b, v2.8b
+; CHECK-SD-NEXT: and v16.8b, v19.8b, v2.8b
+; CHECK-SD-NEXT: stp q3, q17, [x8, #192]
+; CHECK-SD-NEXT: and v3.8b, v22.8b, v2.8b
+; CHECK-SD-NEXT: and v17.8b, v18.8b, v2.8b
+; CHECK-SD-NEXT: stp q5, q7, [x8, #224]
+; CHECK-SD-NEXT: and v5.8b, v20.8b, v2.8b
+; CHECK-SD-NEXT: ext v7.16b, v19.16b, v19.16b, #8
+; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT: ushll v16.2d, v16.2s, #0
; CHECK-SD-NEXT: ushll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT: stp q1, q7, [x8]
-; CHECK-SD-NEXT: ushll v1.2d, v5.2s, #0
-; CHECK-SD-NEXT: ucvtf v5.2d, v16.2d
-; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT: stp q0, q17, [x8, #224]
-; CHECK-SD-NEXT: ucvtf v0.2d, v4.2d
+; CHECK-SD-NEXT: ushll v5.2d, v5.2s, #0
+; CHECK-SD-NEXT: and v7.8b, v7.8b, v2.8b
+; CHECK-SD-NEXT: and v2.8b, v21.8b, v2.8b
+; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
; CHECK-SD-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-SD-NEXT: ucvtf v5.2d, v5.2d
+; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: stp q4, q6, [x8, #64]
+; CHECK-SD-NEXT: stp q0, q3, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q5, [x8, #160]
+; CHECK-SD-NEXT: ushll v1.2d, v17.2s, #0
+; CHECK-SD-NEXT: ucvtf v5.2d, v16.2d
+; CHECK-SD-NEXT: ucvtf v4.2d, v7.2d
+; CHECK-SD-NEXT: ucvtf v0.2d, v2.2d
; CHECK-SD-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT: stp q5, q6, [x8, #192]
-; CHECK-SD-NEXT: stp q0, q2, [x8, #96]
-; CHECK-SD-NEXT: stp q3, q1, [x8, #64]
+; CHECK-SD-NEXT: stp q5, q4, [x8, #128]
+; CHECK-SD-NEXT: stp q1, q0, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v32i8_v32f64:
@@ -5641,16 +5479,11 @@ entry:
define <8 x float> @stofp_v8i8_v8f32(<8 x i8> %a) {
; CHECK-SD-LABEL: stofp_v8i8_v8f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: zip1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-SD-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-SD-NEXT: scvtf v0.4s, v1.4s
-; CHECK-SD-NEXT: scvtf v1.4s, v2.4s
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: scvtf v1.4s, v1.4s
+; CHECK-SD-NEXT: scvtf v0.4s, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v8i8_v8f32:
@@ -5669,14 +5502,15 @@ entry:
define <8 x float> @utofp_v8i8_v8f32(<8 x i8> %a) {
; CHECK-SD-LABEL: utofp_v8i8_v8f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: zip1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-SD-NEXT: ucvtf v0.4s, v1.4s
-; CHECK-SD-NEXT: ucvtf v1.4s, v2.4s
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ucvtf v1.4s, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v8i8_v8f32:
@@ -5695,27 +5529,16 @@ entry:
define <16 x float> @stofp_v16i8_v16f32(<16 x i8> %a) {
; CHECK-SD-LABEL: stofp_v16i8_v16f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: zip1 v2.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v3.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: shl v2.4h, v2.4h, #8
-; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-SD-NEXT: sshr v2.4h, v2.4h, #8
-; CHECK-SD-NEXT: shl v3.4h, v3.4h, #8
-; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-SD-NEXT: sshr v3.4h, v3.4h, #8
-; CHECK-SD-NEXT: sshr v1.4h, v1.4h, #8
+; CHECK-SD-NEXT: sshll v1.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-SD-NEXT: sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v4.4s, v0.4h, #0
; CHECK-SD-NEXT: scvtf v0.4s, v2.4s
-; CHECK-SD-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-SD-NEXT: sshll v5.4s, v1.4h, #0
-; CHECK-SD-NEXT: scvtf v1.4s, v4.4s
-; CHECK-SD-NEXT: scvtf v2.4s, v3.4s
-; CHECK-SD-NEXT: scvtf v3.4s, v5.4s
+; CHECK-SD-NEXT: scvtf v3.4s, v3.4s
+; CHECK-SD-NEXT: scvtf v1.4s, v1.4s
+; CHECK-SD-NEXT: scvtf v2.4s, v4.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v16i8_v16f32:
@@ -5739,23 +5562,24 @@ entry:
define <16 x float> @utofp_v16i8_v16f32(<16 x i8> %a) {
; CHECK-SD-LABEL: utofp_v16i8_v16f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: zip1 v2.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v3.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-SD-NEXT: bic v3.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-SD-NEXT: ushll v4.4s, v0.4h, #0
-; CHECK-SD-NEXT: ucvtf v0.4s, v2.4s
-; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-SD-NEXT: ushll v5.4s, v1.4h, #0
-; CHECK-SD-NEXT: ucvtf v1.4s, v4.4s
-; CHECK-SD-NEXT: ucvtf v2.4s, v3.4s
-; CHECK-SD-NEXT: ucvtf v3.4s, v5.4s
+; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
+; CHECK-SD-NEXT: bic v3.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll v5.4s, v3.4h, #0
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ucvtf v2.4s, v1.4s
+; CHECK-SD-NEXT: ucvtf v3.4s, v4.4s
+; CHECK-SD-NEXT: ucvtf v1.4s, v5.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v16i8_v16f32:
@@ -5779,48 +5603,26 @@ entry:
define <32 x float> @stofp_v32i8_v32f32(<32 x i8> %a) {
; CHECK-SD-LABEL: stofp_v32i8_v32f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: zip1 v4.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: shl v4.4h, v4.4h, #8
-; CHECK-SD-NEXT: zip1 v5.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v6.8b, v2.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v2.8b, v2.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v7.8b, v3.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v3.8b, v3.8b, v0.8b
-; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-SD-NEXT: sshr v4.4h, v4.4h, #8
-; CHECK-SD-NEXT: shl v5.4h, v5.4h, #8
-; CHECK-SD-NEXT: shl v6.4h, v6.4h, #8
-; CHECK-SD-NEXT: shl v2.4h, v2.4h, #8
-; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-SD-NEXT: shl v7.4h, v7.4h, #8
-; CHECK-SD-NEXT: shl v3.4h, v3.4h, #8
-; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-SD-NEXT: sshll v4.4s, v4.4h, #0
-; CHECK-SD-NEXT: sshr v5.4h, v5.4h, #8
-; CHECK-SD-NEXT: sshr v6.4h, v6.4h, #8
-; CHECK-SD-NEXT: sshr v2.4h, v2.4h, #8
-; CHECK-SD-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-SD-NEXT: sshr v7.4h, v7.4h, #8
-; CHECK-SD-NEXT: sshr v3.4h, v3.4h, #8
-; CHECK-SD-NEXT: sshll v16.4s, v0.4h, #0
-; CHECK-SD-NEXT: scvtf v0.4s, v4.4s
-; CHECK-SD-NEXT: sshll v5.4s, v5.4h, #0
-; CHECK-SD-NEXT: sshll v6.4s, v6.4h, #0
-; CHECK-SD-NEXT: sshll v17.4s, v2.4h, #0
+; CHECK-SD-NEXT: sshll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: sshll v3.8h, v1.8b, #0
+; CHECK-SD-NEXT: sshll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: sshll2 v4.4s, v2.8h, #0
+; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT: sshll2 v5.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll v6.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshll v7.4s, v3.4h, #0
+; CHECK-SD-NEXT: sshll2 v16.4s, v1.8h, #0
+; CHECK-SD-NEXT: sshll2 v17.4s, v3.8h, #0
; CHECK-SD-NEXT: sshll v18.4s, v1.4h, #0
-; CHECK-SD-NEXT: sshll v7.4s, v7.4h, #0
-; CHECK-SD-NEXT: sshll v19.4s, v3.4h, #0
-; CHECK-SD-NEXT: scvtf v1.4s, v16.4s
-; CHECK-SD-NEXT: scvtf v4.4s, v5.4s
+; CHECK-SD-NEXT: scvtf v1.4s, v4.4s
+; CHECK-SD-NEXT: scvtf v0.4s, v2.4s
+; CHECK-SD-NEXT: scvtf v3.4s, v5.4s
; CHECK-SD-NEXT: scvtf v2.4s, v6.4s
-; CHECK-SD-NEXT: scvtf v3.4s, v17.4s
-; CHECK-SD-NEXT: scvtf v5.4s, v18.4s
-; CHECK-SD-NEXT: scvtf v6.4s, v7.4s
-; CHECK-SD-NEXT: scvtf v7.4s, v19.4s
+; CHECK-SD-NEXT: scvtf v4.4s, v7.4s
+; CHECK-SD-NEXT: scvtf v7.4s, v16.4s
+; CHECK-SD-NEXT: scvtf v5.4s, v17.4s
+; CHECK-SD-NEXT: scvtf v6.4s, v18.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: stofp_v32i8_v32f32:
@@ -5854,40 +5656,42 @@ entry:
define <32 x float> @utofp_v32i8_v32f32(<32 x i8> %a) {
; CHECK-SD-LABEL: utofp_v32i8_v32f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: zip1 v4.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: bic v4.4h, #255, lsl #8
-; CHECK-SD-NEXT: zip1 v5.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v6.8b, v2.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v2.8b, v2.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v7.8b, v3.8b, v0.8b
-; CHECK-SD-NEXT: zip2 v3.8b, v3.8b, v0.8b
+; CHECK-SD-NEXT: ushll2 v2.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-SD-NEXT: ext v5.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-SD-NEXT: ext v7.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT: // kill: def $d3 killed $d3 killed $q3
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT: bic v3.4h, #255, lsl #8
+; CHECK-SD-NEXT: bic v4.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v5.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v6.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v7.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v3.4h, #255, lsl #8
-; CHECK-SD-NEXT: ushll v16.4s, v0.4h, #0
-; CHECK-SD-NEXT: ucvtf v0.4s, v4.4s
+; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll v17.4s, v3.4h, #0
+; CHECK-SD-NEXT: ushll v16.4s, v4.4h, #0
; CHECK-SD-NEXT: ushll v5.4s, v5.4h, #0
-; CHECK-SD-NEXT: ushll v6.4s, v6.4h, #0
-; CHECK-SD-NEXT: ushll v17.4s, v2.4h, #0
-; CHECK-SD-NEXT: ushll v18.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll v7.4s, v7.4h, #0
-; CHECK-SD-NEXT: ushll v19.4s, v3.4h, #0
-; CHECK-SD-NEXT: ucvtf v1.4s, v16.4s
-; CHECK-SD-NEXT: ucvtf v4.4s, v5.4s
-; CHECK-SD-NEXT: ucvtf v2.4s, v6.4s
-; CHECK-SD-NEXT: ucvtf v3.4s, v17.4s
-; CHECK-SD-NEXT: ucvtf v5.4s, v18.4s
-; CHECK-SD-NEXT: ucvtf v6.4s, v7.4s
-; CHECK-SD-NEXT: ucvtf v7.4s, v19.4s
+; CHECK-SD-NEXT: ushll v18.4s, v6.4h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v7.4h, #0
+; CHECK-SD-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-SD-NEXT: ucvtf v4.4s, v1.4s
+; CHECK-SD-NEXT: ucvtf v6.4s, v17.4s
+; CHECK-SD-NEXT: ucvtf v3.4s, v16.4s
+; CHECK-SD-NEXT: ucvtf v1.4s, v5.4s
+; CHECK-SD-NEXT: ucvtf v7.4s, v18.4s
+; CHECK-SD-NEXT: ucvtf v5.4s, v19.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v32i8_v32f32:
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 3fa5d64a210e1..9db61e08c5c26 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -328,9 +328,10 @@ define <3 x i8> @load_v3i8(ptr %ptr) {
; CHECK-SD-LABEL: load_v3i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr s0, [x0]
-; CHECK-SD-NEXT: umov w0, v0.b[0]
-; CHECK-SD-NEXT: umov w1, v0.b[1]
-; CHECK-SD-NEXT: umov w2, v0.b[2]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: umov w0, v0.h[0]
+; CHECK-SD-NEXT: umov w1, v0.h[1]
+; CHECK-SD-NEXT: umov w2, v0.h[2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: load_v3i8:
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 500379d1cfdec..16822af7382b0 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -109,10 +109,8 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: xtn v1.8b, v0.8h
; CHECK-SD-NEXT: umov w8, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
; CHECK-SD-NEXT: ldrh w9, [sp, #12]
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 8183a82f21cb5..57e8b663e77af 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -97,9 +97,7 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: umov w8, v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 5516a4716d59d..dd899fd787455 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -59,20 +59,18 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8i8:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: ldr d0, [x0]
+; VBITS_GE_256-NEXT: ldp s0, s1, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
-; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8
-; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT: sshr v0.4h, v0.4h, #8
-; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
+; VBITS_GE_256-NEXT: ushll v1.8h, v1.8b, #0
+; VBITS_GE_256-NEXT: mov v0.d[1], v1.d[0]
+; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z1.d]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index e3e06dcdf17f3..4c5537f0cdc83 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -59,23 +59,18 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ldr d0, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0
-; VBITS_GE_256-NEXT: zip1 v3.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b
-; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8
+; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0
+; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8
-; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8
-; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index a6b43d514594e..ce0a8b5b68d2b 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -18,16 +18,11 @@ define <4 x float> @sitofp_v4i8_float(<4 x i8> %a) {
define <8 x float> @sitofp_v8i8_float(<8 x i8> %a) {
; CHECK-LABEL: sitofp_v8i8_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-NEXT: scvtf v0.4s, v1.4s
-; CHECK-NEXT: scvtf v1.4s, v2.4s
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: scvtf v1.4s, v1.4s
+; CHECK-NEXT: scvtf v0.4s, v0.4s
; CHECK-NEXT: ret
%1 = sitofp <8 x i8> %a to <8 x float>
ret <8 x float> %1
@@ -36,27 +31,16 @@ define <8 x float> @sitofp_v8i8_float(<8 x i8> %a) {
define <16 x float> @sitofp_v16i8_float(<16 x i8> %a) {
; CHECK-LABEL: sitofp_v16i8_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b
-; CHECK-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-NEXT: shl v2.4h, v2.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sshr v2.4h, v2.4h, #8
-; CHECK-NEXT: shl v3.4h, v3.4h, #8
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NEXT: sshr v3.4h, v3.4h, #8
-; CHECK-NEXT: sshr v1.4h, v1.4h, #8
+; CHECK-NEXT: sshll v1.8h, v0.8b, #0
+; CHECK-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0
+; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll v4.4s, v0.4h, #0
; CHECK-NEXT: scvtf v0.4s, v2.4s
-; CHECK-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-NEXT: sshll v5.4s, v1.4h, #0
-; CHECK-NEXT: scvtf v1.4s, v4.4s
-; CHECK-NEXT: scvtf v2.4s, v3.4s
-; CHECK-NEXT: scvtf v3.4s, v5.4s
+; CHECK-NEXT: scvtf v3.4s, v3.4s
+; CHECK-NEXT: scvtf v1.4s, v1.4s
+; CHECK-NEXT: scvtf v2.4s, v4.4s
; CHECK-NEXT: ret
%1 = sitofp <16 x i8> %a to <16 x float>
ret <16 x float> %1
@@ -129,14 +113,15 @@ define <4 x float> @uitofp_v4i8_float(<4 x i8> %a) {
define <8 x float> @uitofp_v8i8_float(<8 x i8> %a) {
; CHECK-LABEL: uitofp_v8i8_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v1.4s
-; CHECK-NEXT: ucvtf v1.4s, v2.4s
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ucvtf v1.4s, v1.4s
; CHECK-NEXT: ret
%1 = uitofp <8 x i8> %a to <8 x float>
ret <8 x float> %1
@@ -145,23 +130,24 @@ define <8 x float> @uitofp_v8i8_float(<8 x i8> %a) {
define <16 x float> @uitofp_v16i8_float(<16 x i8> %a) {
; CHECK-LABEL: uitofp_v16i8_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b
-; CHECK-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-NEXT: bic v2.4h, #255, lsl #8
+; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: bic v3.4h, #255, lsl #8
; CHECK-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-NEXT: ushll v4.4s, v0.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v2.4s
-; CHECK-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NEXT: ushll v5.4s, v1.4h, #0
-; CHECK-NEXT: ucvtf v1.4s, v4.4s
-; CHECK-NEXT: ucvtf v2.4s, v3.4s
-; CHECK-NEXT: ucvtf v3.4s, v5.4s
+; CHECK-NEXT: bic v2.4h, #255, lsl #8
+; CHECK-NEXT: bic v3.4h, #255, lsl #8
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-NEXT: ushll v5.4s, v3.4h, #0
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: ucvtf v2.4s, v1.4s
+; CHECK-NEXT: ucvtf v3.4s, v4.4s
+; CHECK-NEXT: ucvtf v1.4s, v5.4s
; CHECK-NEXT: ret
%1 = uitofp <16 x i8> %a to <16 x float>
ret <16 x float> %1
@@ -242,39 +228,27 @@ define <4 x double> @sitofp_v4i8_double(<4 x i8> %a) {
define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) {
; CHECK-LABEL: sitofp_v8i8_double:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.b[0]
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: umov w12, v0.b[6]
-; CHECK-NEXT: umov w10, v0.b[1]
-; CHECK-NEXT: umov w13, v0.b[3]
-; CHECK-NEXT: umov w14, v0.b[5]
-; CHECK-NEXT: umov w15, v0.b[7]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: fmov s3, w12
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w13
-; CHECK-NEXT: mov v2.s[1], w14
-; CHECK-NEXT: mov v3.s[1], w15
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: shl v1.2s, v1.2s, #24
+; CHECK-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-NEXT: shl v2.2s, v2.2s, #24
; CHECK-NEXT: shl v3.2s, v3.2s, #24
-; CHECK-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: sshr v2.2s, v2.2s, #24
; CHECK-NEXT: sshr v3.2s, v3.2s, #24
-; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
-; CHECK-NEXT: sshll v2.2d, v2.2s, #0
-; CHECK-NEXT: sshll v3.2d, v3.2s, #0
; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
+; CHECK-NEXT: sshll v4.2d, v2.2s, #0
+; CHECK-NEXT: sshll v5.2d, v3.2s, #0
+; CHECK-NEXT: scvtf v2.2d, v1.2d
+; CHECK-NEXT: scvtf v3.2d, v4.2d
+; CHECK-NEXT: scvtf v1.2d, v5.2d
; CHECK-NEXT: ret
%1 = sitofp <8 x i8> %a to <8 x double>
ret <8 x double> %1
@@ -283,71 +257,48 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) {
define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) {
; CHECK-LABEL: sitofp_v16i8_double:
; CHECK: // %bb.0:
-; CHECK-NEXT: umov w8, v0.b[0]
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: umov w9, v0.b[1]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: umov w15, v1.b[0]
-; CHECK-NEXT: umov w17, v1.b[2]
-; CHECK-NEXT: umov w0, v1.b[4]
-; CHECK-NEXT: umov w16, v1.b[1]
-; CHECK-NEXT: umov w18, v1.b[3]
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: umov w10, v1.b[5]
-; CHECK-NEXT: mov v2.s[1], w9
-; CHECK-NEXT: umov w9, v1.b[6]
-; CHECK-NEXT: fmov s3, w12
-; CHECK-NEXT: umov w12, v1.b[7]
-; CHECK-NEXT: fmov s1, w14
-; CHECK-NEXT: fmov s4, w15
-; CHECK-NEXT: fmov s5, w17
-; CHECK-NEXT: fmov s6, w0
-; CHECK-NEXT: mov v0.s[1], w11
-; CHECK-NEXT: mov v3.s[1], w13
-; CHECK-NEXT: fmov s7, w9
-; CHECK-NEXT: mov v1.s[1], w8
-; CHECK-NEXT: mov v4.s[1], w16
-; CHECK-NEXT: mov v5.s[1], w18
-; CHECK-NEXT: mov v6.s[1], w10
+; CHECK-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: ext v7.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: shl v2.2s, v2.2s, #24
+; CHECK-NEXT: shl v1.2s, v1.2s, #24
; CHECK-NEXT: shl v0.2s, v0.2s, #24
-; CHECK-NEXT: mov v7.s[1], w12
; CHECK-NEXT: shl v3.2s, v3.2s, #24
-; CHECK-NEXT: shl v1.2s, v1.2s, #24
; CHECK-NEXT: shl v4.2s, v4.2s, #24
-; CHECK-NEXT: sshr v2.2s, v2.2s, #24
; CHECK-NEXT: shl v5.2s, v5.2s, #24
; CHECK-NEXT: shl v6.2s, v6.2s, #24
+; CHECK-NEXT: shl v7.2s, v7.2s, #24
+; CHECK-NEXT: sshr v2.2s, v2.2s, #24
+; CHECK-NEXT: sshr v1.2s, v1.2s, #24
; CHECK-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-NEXT: sshr v3.2s, v3.2s, #24
-; CHECK-NEXT: shl v7.2s, v7.2s, #24
; CHECK-NEXT: sshr v4.2s, v4.2s, #24
-; CHECK-NEXT: sshr v1.2s, v1.2s, #24
; CHECK-NEXT: sshr v5.2s, v5.2s, #24
; CHECK-NEXT: sshr v6.2s, v6.2s, #24
+; CHECK-NEXT: sshr v7.2s, v7.2s, #24
; CHECK-NEXT: sshll v2.2d, v2.2s, #0
+; CHECK-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-NEXT: sshll v16.2d, v0.2s, #0
-; CHECK-NEXT: sshll v3.2d, v3.2s, #0
-; CHECK-NEXT: sshr v7.2s, v7.2s, #24
-; CHECK-NEXT: sshll v4.2d, v4.2s, #0
-; CHECK-NEXT: sshll v17.2d, v1.2s, #0
+; CHECK-NEXT: sshll v18.2d, v3.2s, #0
+; CHECK-NEXT: sshll v17.2d, v4.2s, #0
; CHECK-NEXT: sshll v5.2d, v5.2s, #0
-; CHECK-NEXT: sshll v6.2d, v6.2s, #0
-; CHECK-NEXT: scvtf v0.2d, v2.2d
-; CHECK-NEXT: scvtf v1.2d, v16.2d
-; CHECK-NEXT: scvtf v2.2d, v3.2d
-; CHECK-NEXT: sshll v7.2d, v7.2s, #0
-; CHECK-NEXT: scvtf v4.2d, v4.2d
+; CHECK-NEXT: sshll v19.2d, v6.2s, #0
+; CHECK-NEXT: sshll v20.2d, v7.2s, #0
+; CHECK-NEXT: scvtf v2.2d, v2.2d
+; CHECK-NEXT: scvtf v0.2d, v1.2d
+; CHECK-NEXT: scvtf v4.2d, v16.2d
+; CHECK-NEXT: scvtf v6.2d, v18.2d
; CHECK-NEXT: scvtf v3.2d, v17.2d
-; CHECK-NEXT: scvtf v5.2d, v5.2d
-; CHECK-NEXT: scvtf v6.2d, v6.2d
-; CHECK-NEXT: scvtf v7.2d, v7.2d
+; CHECK-NEXT: scvtf v1.2d, v5.2d
+; CHECK-NEXT: scvtf v7.2d, v19.2d
+; CHECK-NEXT: scvtf v5.2d, v20.2d
; CHECK-NEXT: ret
%1 = sitofp <16 x i8> %a to <16 x double>
ret <16 x double> %1
@@ -419,36 +370,24 @@ define <4 x double> @uitofp_v4i8_double(<4 x i8> %a) {
define <8 x double> @uitofp_v8i8_double(<8 x i8> %a) {
; CHECK-LABEL: uitofp_v8i8_double:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.b[0]
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: umov w12, v0.b[6]
-; CHECK-NEXT: umov w10, v0.b[1]
-; CHECK-NEXT: umov w13, v0.b[3]
-; CHECK-NEXT: umov w14, v0.b[5]
-; CHECK-NEXT: umov w15, v0.b[7]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: movi d1, #0x0000ff000000ff
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: fmov s3, w11
-; CHECK-NEXT: fmov s4, w12
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v2.s[1], w13
-; CHECK-NEXT: mov v3.s[1], w14
-; CHECK-NEXT: mov v4.s[1], w15
+; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: and v2.8b, v2.8b, v1.8b
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: and v3.8b, v3.8b, v1.8b
; CHECK-NEXT: and v1.8b, v4.8b, v1.8b
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-NEXT: ushll v3.2d, v3.2s, #0
-; CHECK-NEXT: ushll v4.2d, v1.2s, #0
; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v1.2d, v2.2d
-; CHECK-NEXT: ucvtf v2.2d, v3.2d
-; CHECK-NEXT: ucvtf v3.2d, v4.2d
+; CHECK-NEXT: ushll v3.2d, v3.2s, #0
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-NEXT: ucvtf v1.2d, v1.2d
; CHECK-NEXT: ret
%1 = uitofp <8 x i8> %a to <8 x double>
ret <8 x double> %1
@@ -457,64 +396,41 @@ define <8 x double> @uitofp_v8i8_double(<8 x i8> %a) {
define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) {
; CHECK-LABEL: uitofp_v16i8_double:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: umov w8, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w9, v0.b[1]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w18, v0.b[6]
+; CHECK-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-NEXT: movi d1, #0x0000ff000000ff
-; CHECK-NEXT: umov w14, v2.b[0]
-; CHECK-NEXT: umov w16, v2.b[2]
-; CHECK-NEXT: umov w0, v2.b[4]
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: umov w10, v2.b[6]
-; CHECK-NEXT: umov w15, v2.b[1]
-; CHECK-NEXT: umov w17, v2.b[3]
-; CHECK-NEXT: fmov s4, w12
-; CHECK-NEXT: umov w12, v2.b[5]
-; CHECK-NEXT: fmov s7, w18
-; CHECK-NEXT: mov v3.s[1], w9
-; CHECK-NEXT: umov w9, v2.b[7]
-; CHECK-NEXT: fmov s2, w14
-; CHECK-NEXT: fmov s5, w16
-; CHECK-NEXT: fmov s6, w0
-; CHECK-NEXT: mov v0.s[1], w11
-; CHECK-NEXT: fmov s16, w10
-; CHECK-NEXT: mov v4.s[1], w13
-; CHECK-NEXT: mov v7.s[1], w8
-; CHECK-NEXT: mov v2.s[1], w15
-; CHECK-NEXT: mov v5.s[1], w17
-; CHECK-NEXT: mov v6.s[1], w12
+; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0
+; CHECK-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ext v5.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v7.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT: ext v16.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v3.8b, v3.8b, v1.8b
-; CHECK-NEXT: mov v16.s[1], w9
+; CHECK-NEXT: and v2.8b, v2.8b, v1.8b
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: and v4.8b, v4.8b, v1.8b
-; CHECK-NEXT: and v7.8b, v7.8b, v1.8b
-; CHECK-NEXT: and v2.8b, v2.8b, v1.8b
-; CHECK-NEXT: ushll v3.2d, v3.2s, #0
; CHECK-NEXT: and v5.8b, v5.8b, v1.8b
; CHECK-NEXT: and v6.8b, v6.8b, v1.8b
+; CHECK-NEXT: and v7.8b, v7.8b, v1.8b
; CHECK-NEXT: and v1.8b, v16.8b, v1.8b
-; CHECK-NEXT: ushll v16.2d, v0.2s, #0
-; CHECK-NEXT: ushll v17.2d, v4.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-NEXT: ushll v7.2d, v7.2s, #0
-; CHECK-NEXT: ucvtf v0.2d, v3.2d
+; CHECK-NEXT: ushll v3.2d, v3.2s, #0
+; CHECK-NEXT: ushll v16.2d, v2.2s, #0
+; CHECK-NEXT: ushll v17.2d, v0.2s, #0
+; CHECK-NEXT: ushll v18.2d, v4.2s, #0
; CHECK-NEXT: ushll v5.2d, v5.2s, #0
; CHECK-NEXT: ushll v6.2d, v6.2s, #0
-; CHECK-NEXT: ushll v18.2d, v1.2s, #0
-; CHECK-NEXT: ucvtf v1.2d, v16.2d
-; CHECK-NEXT: ucvtf v4.2d, v2.2d
-; CHECK-NEXT: ucvtf v2.2d, v17.2d
-; CHECK-NEXT: ucvtf v3.2d, v7.2d
-; CHECK-NEXT: ucvtf v5.2d, v5.2d
-; CHECK-NEXT: ucvtf v6.2d, v6.2d
-; CHECK-NEXT: ucvtf v7.2d, v18.2d
+; CHECK-NEXT: ushll v7.2d, v7.2s, #0
+; CHECK-NEXT: ushll v19.2d, v1.2s, #0
+; CHECK-NEXT: ucvtf v2.2d, v3.2d
+; CHECK-NEXT: ucvtf v0.2d, v16.2d
+; CHECK-NEXT: ucvtf v4.2d, v17.2d
+; CHECK-NEXT: ucvtf v3.2d, v5.2d
+; CHECK-NEXT: ucvtf v1.2d, v6.2d
+; CHECK-NEXT: ucvtf v6.2d, v18.2d
+; CHECK-NEXT: ucvtf v7.2d, v7.2d
+; CHECK-NEXT: ucvtf v5.2d, v19.2d
; CHECK-NEXT: ret
%1 = uitofp <16 x i8> %a to <16 x double>
ret <16 x double> %1
>From 301300d9ab3c7cf8a694e5696f59a1243d9c1b22 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 28 Mar 2025 12:38:29 +0000
Subject: [PATCH 3/3] [DAG][AArch64] Handle truncated buildvectors to allow
and(subvector(anyext)) fold.
This fold was not handling the extended BUILDVECTORs that we see when i8/i16
are not legal types. Using isConstOrConstSplat(N1, false, true) allows it to
match truncated constants. The other changes are to make sure that truncated
values in N1C are treated correctly. The fold we are mostly interested in is:
```
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && N1C &&
ISD::isExtOpcode(N0.getOperand(0).getOpcode())) {
```
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +-
.../aarch64-neon-vector-insert-uaddlv.ll | 12 +--
llvm/test/CodeGen/AArch64/ctlz.ll | 3 +-
llvm/test/CodeGen/AArch64/ctpop.ll | 3 +-
llvm/test/CodeGen/AArch64/itofp.ll | 90 +++++++------------
.../AArch64/vec3-loads-ext-trunc-stores.ll | 23 ++---
llvm/test/CodeGen/AArch64/vector-fcvt.ll | 36 +++-----
7 files changed, 60 insertions(+), 112 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2fd744391b917..c39ac138b3ace 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7166,7 +7166,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// if (and x, c) is known to be zero, return 0
unsigned BitWidth = VT.getScalarSizeInBits();
- ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ ConstantSDNode *N1C = isConstOrConstSplat(N1, false, true);
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
return DAG.getConstant(0, DL, VT);
@@ -7205,7 +7205,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0Op0);
// fold (and (any_ext V), c) -> (zero_ext (and (trunc V), c)) if profitable.
- if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
+ APInt N1APInt = N1C->getAPIntValue().trunc(VT.getScalarSizeInBits());
+ if (N1APInt.countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
TLI.isNarrowingProfitable(N, VT, SrcVT))
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 412f39f8adc1b..f37767291ca14 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -282,8 +282,7 @@ define void @insert_vec_v16i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-NEXT: uaddlv.8b h1, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: mov.b v2[0], v1[0]
-; CHECK-NEXT: zip1.8b v2, v2, v2
-; CHECK-NEXT: bic.4h v2, #255, lsl #8
+; CHECK-NEXT: ushll.8h v2, v2, #0
; CHECK-NEXT: ushll.4s v2, v2, #0
; CHECK-NEXT: ucvtf.4s v2, v2
; CHECK-NEXT: stp q2, q0, [x0]
@@ -305,8 +304,7 @@ define void @insert_vec_v8i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: uaddlv.8b h1, v0
; CHECK-NEXT: mov.b v0[0], v1[0]
-; CHECK-NEXT: zip1.8b v0, v0, v0
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
@@ -436,8 +434,7 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: stp xzr, xzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.b v1[0], v0[0]
-; CHECK-NEXT: zip1.8b v1, v1, v1
-; CHECK-NEXT: bic.4h v1, #255, lsl #8
+; CHECK-NEXT: ushll.8h v1, v1, #0
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
@@ -461,8 +458,7 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.b v1[0], v0[0]
-; CHECK-NEXT: zip1.8b v1, v1, v1
-; CHECK-NEXT: bic.4h v1, #255, lsl #8
+; CHECK-NEXT: ushll.8h v1, v1, #0
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q2, [x0]
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index afdeff06fdef6..facc15ef15e8b 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -44,8 +44,7 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: ldr s1, [x0]
; CHECK-SD-NEXT: movi v0.4h, #8
-; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
-; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: clz v1.4h, v1.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index c7c378d3e67cd..edb1d55a15733 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -43,8 +43,7 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: ldr s0, [x0]
-; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index e4fb2b7c2a3c7..80a7d47c063e4 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -5503,14 +5503,10 @@ define <8 x float> @utofp_v8i8_v8f32(<8 x i8> %a) {
; CHECK-SD-LABEL: utofp_v8i8_v8f32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
; CHECK-SD-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v8i8_v8f32:
@@ -5562,24 +5558,16 @@ entry:
define <16 x float> @utofp_v16i8_v16f32(<16 x i8> %a) {
; CHECK-SD-LABEL: utofp_v16i8_v16f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v3.4h, #255, lsl #8
-; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-SD-NEXT: ushll v5.4s, v3.4h, #0
-; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-SD-NEXT: ucvtf v2.4s, v1.4s
-; CHECK-SD-NEXT: ucvtf v3.4s, v4.4s
-; CHECK-SD-NEXT: ucvtf v1.4s, v5.4s
+; CHECK-SD-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-SD-NEXT: ucvtf v0.4s, v2.4s
+; CHECK-SD-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-SD-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-SD-NEXT: ucvtf v2.4s, v4.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v16i8_v16f32:
@@ -5656,42 +5644,26 @@ entry:
define <32 x float> @utofp_v32i8_v32f32(<32 x i8> %a) {
; CHECK-SD-LABEL: utofp_v32i8_v32f32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll2 v2.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-SD-NEXT: ext v4.16b, v2.16b, v2.16b, #8
-; CHECK-SD-NEXT: ext v5.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: ext v6.16b, v3.16b, v3.16b, #8
-; CHECK-SD-NEXT: ext v7.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
-; CHECK-SD-NEXT: // kill: def $d3 killed $d3 killed $q3
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v3.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v4.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v5.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v6.4h, #255, lsl #8
-; CHECK-SD-NEXT: bic v7.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll2 v4.4s, v2.8h, #0
; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll v17.4s, v3.4h, #0
-; CHECK-SD-NEXT: ushll v16.4s, v4.4h, #0
-; CHECK-SD-NEXT: ushll v5.4s, v5.4h, #0
-; CHECK-SD-NEXT: ushll v18.4s, v6.4h, #0
-; CHECK-SD-NEXT: ushll v19.4s, v7.4h, #0
-; CHECK-SD-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-SD-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-SD-NEXT: ucvtf v4.4s, v1.4s
-; CHECK-SD-NEXT: ucvtf v6.4s, v17.4s
-; CHECK-SD-NEXT: ucvtf v3.4s, v16.4s
-; CHECK-SD-NEXT: ucvtf v1.4s, v5.4s
-; CHECK-SD-NEXT: ucvtf v7.4s, v18.4s
-; CHECK-SD-NEXT: ucvtf v5.4s, v19.4s
+; CHECK-SD-NEXT: ushll2 v5.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v6.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v7.4s, v3.4h, #0
+; CHECK-SD-NEXT: ushll2 v16.4s, v1.8h, #0
+; CHECK-SD-NEXT: ushll2 v17.4s, v3.8h, #0
+; CHECK-SD-NEXT: ushll v18.4s, v1.4h, #0
+; CHECK-SD-NEXT: ucvtf v1.4s, v4.4s
+; CHECK-SD-NEXT: ucvtf v0.4s, v2.4s
+; CHECK-SD-NEXT: ucvtf v3.4s, v5.4s
+; CHECK-SD-NEXT: ucvtf v2.4s, v6.4s
+; CHECK-SD-NEXT: ucvtf v4.4s, v7.4s
+; CHECK-SD-NEXT: ucvtf v7.4s, v16.4s
+; CHECK-SD-NEXT: ucvtf v5.4s, v17.4s
+; CHECK-SD-NEXT: ucvtf v6.4s, v18.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: utofp_v32i8_v32f32:
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 45b7a2759b0b3..be08dd25c6bb3 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -444,8 +444,7 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: zip1.8b v0, v0, v0
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
@@ -480,8 +479,7 @@ define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: zip1.8b v0, v0, v0
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
@@ -491,8 +489,7 @@ define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) {
; BE-NEXT: ldr s0, [x0]
; BE-NEXT: add x8, x1, #4
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: rev32 v1.8h, v0.8h
; BE-NEXT: st1 { v0.h }[2], [x8]
; BE-NEXT: str s1, [x1]
@@ -509,8 +506,7 @@ define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: zip1.8b v0, v0, v0
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
@@ -520,8 +516,7 @@ define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) {
; BE-NEXT: ldr s0, [x0]
; BE-NEXT: add x8, x1, #4
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
-; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: ushll v0.8h, v0.8b, #0
; BE-NEXT: rev32 v1.8h, v0.8h
; BE-NEXT: st1 { v0.h }[2], [x8]
; BE-NEXT: str s1, [x1]
@@ -541,13 +536,11 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x8, lCPI15_0@PAGE
; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: ldr d1, [x8, lCPI15_0@PAGEOFF]
+; CHECK-NEXT: ldr d0, [x8, lCPI15_0@PAGEOFF]
; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: orr w9, w10, w9, lsl #16
-; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: zip1.8b v0, v0, v0
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: add.4h v0, v0, v1
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: uaddw.8h v0, v0, v1
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index ce0a8b5b68d2b..3c763709589a7 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -114,14 +114,10 @@ define <8 x float> @uitofp_v8i8_float(<8 x i8> %a) {
; CHECK-LABEL: uitofp_v8i8_float:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
; CHECK-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
; CHECK-NEXT: ret
%1 = uitofp <8 x i8> %a to <8 x float>
ret <8 x float> %1
@@ -130,24 +126,16 @@ define <8 x float> @uitofp_v8i8_float(<8 x i8> %a) {
define <16 x float> @uitofp_v16i8_float(<16 x i8> %a) {
; CHECK-LABEL: uitofp_v16i8_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-NEXT: bic v2.4h, #255, lsl #8
-; CHECK-NEXT: bic v3.4h, #255, lsl #8
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-NEXT: ushll v5.4s, v3.4h, #0
-; CHECK-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-NEXT: ucvtf v2.4s, v1.4s
-; CHECK-NEXT: ucvtf v3.4s, v4.4s
-; CHECK-NEXT: ucvtf v1.4s, v5.4s
+; CHECK-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-NEXT: ucvtf v0.4s, v2.4s
+; CHECK-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-NEXT: ucvtf v2.4s, v4.4s
; CHECK-NEXT: ret
%1 = uitofp <16 x i8> %a to <16 x float>
ret <16 x float> %1
More information about the llvm-commits
mailing list