[llvm] 59fbb9e - [AArch64] Add tablegen patterns for i8 and i16 vector insert/extract pairs (#136091)

via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 28 00:17:49 PDT 2025


Author: David Green
Date: 2025-04-28T08:17:45+01:00
New Revision: 59fbb9e7751b36bbca1064abb6c21ce59c038a70

URL: https://github.com/llvm/llvm-project/commit/59fbb9e7751b36bbca1064abb6c21ce59c038a70
DIFF: https://github.com/llvm/llvm-project/commit/59fbb9e7751b36bbca1064abb6c21ce59c038a70.diff

LOG: [AArch64] Add tablegen patterns for i8 and i16 vector insert/extract pairs (#136091)

An i8 or i16 vector extract/insert has to go via an i32 to make sure the
types are legal. This patch adds patterns for an extract from an i8/i16
vector that is inserted into an i16/i32 vector. This avoids the round trip
via a GPR, which can limit performance.

Added: 
    

Modified: 
    llvm/include/llvm/Target/TargetSelectionDAG.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
    llvm/test/CodeGen/AArch64/bitcast-extend.ll
    llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
    llvm/test/CodeGen/AArch64/itofp.ll
    llvm/test/CodeGen/AArch64/neon-bitcast.ll
    llvm/test/CodeGen/AArch64/shuffle-extend.ll
    llvm/test/CodeGen/AArch64/vector-fcvt.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index a53527442719a..3515a7da71075 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -827,8 +827,11 @@ def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1,
 def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
                               []>;
 
-// vector_extract/vector_insert are deprecated. extractelt/insertelt
-// are preferred.
+// vector_extract/vector_insert are similar to extractelt/insertelt but allow
+// types that require promotion (for example a v16i8 extract, where i8 is not
+// a legal type and so i32 is used). extractelt/insertelt are preferred where
+// the element type and the extracted type match, due to the extra type
+// checking they perform.
 def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
     SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>;
 def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f1c95fdfc8974..f7b13092821d6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7347,6 +7347,41 @@ def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))),
 def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), (i64 imm:$Immd))),
           (INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
 
+// Patterns for i8/i16 -> v2i32/v4i16 lane moves via insert and extract that go via i32.
+multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType OutVT,
+                                    Instruction INS, SDNodeXForm VecIndexMult> {
+  // VT64->OutVT
+  def : Pat<(OutVT (vector_insert (OutVT V64:$src),
+                                  (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
+                                  (i64 imm:$Immd))),
+            (EXTRACT_SUBREG
+              (INS (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$src, dsub), (VecIndexMult imm:$Immd),
+                   (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
+              dsub)>;
+  def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))))),
+            (EXTRACT_SUBREG
+              (INS (IMPLICIT_DEF), 0,
+                   (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
+              dsub)>;
+
+  // VT128->OutVT ($src is widened with undefined high bits, as in the VT64 case; only dsub is read back)
+  def : Pat<(OutVT (vector_insert (OutVT V64:$src),
+                                  (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
+                                  (i64 imm:$Immd))),
+            (EXTRACT_SUBREG
+              (INS (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$src, dsub), (VecIndexMult imm:$Immd),
+                   V128:$Rn, imm:$Immn),
+              dsub)>;
+  def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))))),
+            (EXTRACT_SUBREG
+              (INS (IMPLICIT_DEF), 0, V128:$Rn, imm:$Immn),
+              dsub)>;
+}
+
+defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, VecIndex_x2>;
+defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, VecIndex_x4>;
+defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, VecIndex_x2>;
+
 // bitcast of an extract
 // f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
 def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),

diff  --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index 7a4cdd52db904..fccb1fb675768 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -11,14 +11,11 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
 ; CHECK-SDAG-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SDAG-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SDAG-NEXT:    str d0, [sp, #8]
-; CHECK-SDAG-NEXT:    umov w9, v0.b[1]
 ; CHECK-SDAG-NEXT:    bfxil x8, x0, #0, #3
 ; CHECK-SDAG-NEXT:    ld1 { v1.b }[0], [x8]
-; CHECK-SDAG-NEXT:    umov w8, v0.b[2]
-; CHECK-SDAG-NEXT:    mov v1.h[1], w9
-; CHECK-SDAG-NEXT:    umov w9, v0.b[3]
-; CHECK-SDAG-NEXT:    mov v1.h[2], w8
-; CHECK-SDAG-NEXT:    mov v1.h[3], w9
+; CHECK-SDAG-NEXT:    mov v1.b[2], v0.b[1]
+; CHECK-SDAG-NEXT:    mov v1.b[4], v0.b[2]
+; CHECK-SDAG-NEXT:    mov v1.b[6], v0.b[3]
 ; CHECK-SDAG-NEXT:    fmov d0, d1
 ; CHECK-SDAG-NEXT:    add sp, sp, #16
 ; CHECK-SDAG-NEXT:    ret
@@ -168,11 +165,10 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
 ; CHECK-SDAG-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SDAG-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-SDAG-NEXT:    str d0, [sp, #8]
-; CHECK-SDAG-NEXT:    umov w9, v0.h[1]
 ; CHECK-SDAG-NEXT:    bfi x8, x0, #1, #2
-; CHECK-SDAG-NEXT:    ld1 { v0.h }[0], [x8]
-; CHECK-SDAG-NEXT:    mov v0.s[1], w9
-; CHECK-SDAG-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SDAG-NEXT:    ld1 { v1.h }[0], [x8]
+; CHECK-SDAG-NEXT:    mov v1.h[2], v0.h[1]
+; CHECK-SDAG-NEXT:    fmov d0, d1
 ; CHECK-SDAG-NEXT:    add sp, sp, #16
 ; CHECK-SDAG-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index 85daa3ca6623e..33238ccf86a39 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -70,16 +70,12 @@ define <4 x i64> @z_i32_v4i64(i32 %x) {
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    fmov s0, w0
 ; CHECK-SD-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-SD-NEXT:    umov w8, v0.b[2]
-; CHECK-SD-NEXT:    umov w9, v0.b[0]
-; CHECK-SD-NEXT:    umov w10, v0.b[3]
-; CHECK-SD-NEXT:    umov w11, v0.b[1]
-; CHECK-SD-NEXT:    fmov s0, w9
-; CHECK-SD-NEXT:    fmov s2, w8
-; CHECK-SD-NEXT:    mov v0.s[1], w11
-; CHECK-SD-NEXT:    mov v2.s[1], w10
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v3.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v2.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v3.b[4], v0.b[3]
+; CHECK-SD-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v3.2s, #0
 ; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    and v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
@@ -176,16 +172,12 @@ define <4 x i64> @s_i32_v4i64(i32 %x) {
 ; CHECK-SD-LABEL: s_i32_v4i64:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    umov w8, v0.b[2]
-; CHECK-SD-NEXT:    umov w9, v0.b[0]
-; CHECK-SD-NEXT:    umov w10, v0.b[3]
-; CHECK-SD-NEXT:    umov w11, v0.b[1]
-; CHECK-SD-NEXT:    fmov s0, w9
-; CHECK-SD-NEXT:    fmov s1, w8
-; CHECK-SD-NEXT:    mov v0.s[1], w11
-; CHECK-SD-NEXT:    mov v1.s[1], w10
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    mov v1.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v2.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v1.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v2.b[4], v0.b[3]
+; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
 ; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
 ; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #56

diff  --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
index 8b74de1c127dd..e90b6cb7f809b 100644
--- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
+++ b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
@@ -5,16 +5,12 @@
 define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; CHECKLE-LABEL: test_reconstructshuffle:
 ; CHECKLE:       // %bb.0:
-; CHECKLE-NEXT:    umov w8, v0.b[3]
-; CHECKLE-NEXT:    umov w9, v0.b[2]
-; CHECKLE-NEXT:    fmov s2, w8
-; CHECKLE-NEXT:    umov w8, v0.b[1]
-; CHECKLE-NEXT:    mov v2.h[1], w9
-; CHECKLE-NEXT:    mov v2.h[2], w8
-; CHECKLE-NEXT:    umov w8, v0.b[0]
-; CHECKLE-NEXT:    ext v0.16b, v1.16b, v1.16b, #8
-; CHECKLE-NEXT:    mov v2.h[3], w8
-; CHECKLE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECKLE-NEXT:    mov v2.b[0], v0.b[3]
+; CHECKLE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKLE-NEXT:    mov v2.b[2], v0.b[2]
+; CHECKLE-NEXT:    mov v2.b[4], v0.b[1]
+; CHECKLE-NEXT:    mov v2.b[6], v0.b[0]
+; CHECKLE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
 ; CHECKLE-NEXT:    add v0.4h, v2.4h, v0.4h
 ; CHECKLE-NEXT:    bic v0.4h, #255, lsl #8
 ; CHECKLE-NEXT:    ret
@@ -25,16 +21,12 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; CHECKBE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECKBE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT:    umov w8, v0.b[3]
-; CHECKBE-NEXT:    umov w9, v0.b[2]
-; CHECKBE-NEXT:    fmov s2, w8
-; CHECKBE-NEXT:    umov w8, v0.b[1]
-; CHECKBE-NEXT:    mov v2.h[1], w9
-; CHECKBE-NEXT:    mov v2.h[2], w8
-; CHECKBE-NEXT:    umov w8, v0.b[0]
-; CHECKBE-NEXT:    ext v0.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT:    mov v2.h[3], w8
-; CHECKBE-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECKBE-NEXT:    mov v2.b[0], v0.b[3]
+; CHECKBE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT:    mov v2.b[2], v0.b[2]
+; CHECKBE-NEXT:    mov v2.b[4], v0.b[1]
+; CHECKBE-NEXT:    mov v2.b[6], v0.b[0]
+; CHECKBE-NEXT:    zip2 v0.8b, v1.8b, v0.8b
 ; CHECKBE-NEXT:    add v0.4h, v2.4h, v0.4h
 ; CHECKBE-NEXT:    bic v0.4h, #255, lsl #8
 ; CHECKBE-NEXT:    rev64 v0.4h, v0.4h

diff  --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 07957c117868d..fb2bdb4d63f47 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -3443,26 +3443,18 @@ define <8 x double> @stofp_v8i8_v8f64(<8 x i8> %a) {
 ; CHECK-SD-LABEL: stofp_v8i8_v8f64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    umov w8, v0.b[0]
-; CHECK-SD-NEXT:    umov w9, v0.b[2]
-; CHECK-SD-NEXT:    umov w11, v0.b[4]
-; CHECK-SD-NEXT:    umov w12, v0.b[6]
-; CHECK-SD-NEXT:    umov w10, v0.b[1]
-; CHECK-SD-NEXT:    umov w13, v0.b[3]
-; CHECK-SD-NEXT:    umov w14, v0.b[5]
-; CHECK-SD-NEXT:    umov w15, v0.b[7]
-; CHECK-SD-NEXT:    fmov s0, w8
-; CHECK-SD-NEXT:    fmov s1, w9
-; CHECK-SD-NEXT:    fmov s2, w11
-; CHECK-SD-NEXT:    fmov s3, w12
-; CHECK-SD-NEXT:    mov v0.s[1], w10
-; CHECK-SD-NEXT:    mov v1.s[1], w13
-; CHECK-SD-NEXT:    mov v2.s[1], w14
-; CHECK-SD-NEXT:    mov v3.s[1], w15
-; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-SD-NEXT:    shl v2.2s, v2.2s, #24
-; CHECK-SD-NEXT:    shl v3.2s, v3.2s, #24
+; CHECK-SD-NEXT:    mov v1.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v2.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v3.b[0], v0.b[4]
+; CHECK-SD-NEXT:    mov v4.b[0], v0.b[6]
+; CHECK-SD-NEXT:    mov v1.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v2.b[4], v0.b[3]
+; CHECK-SD-NEXT:    mov v3.b[4], v0.b[5]
+; CHECK-SD-NEXT:    mov v4.b[4], v0.b[7]
+; CHECK-SD-NEXT:    shl v0.2s, v1.2s, #24
+; CHECK-SD-NEXT:    shl v1.2s, v2.2s, #24
+; CHECK-SD-NEXT:    shl v2.2s, v3.2s, #24
+; CHECK-SD-NEXT:    shl v3.2s, v4.2s, #24
 ; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-SD-NEXT:    sshr v2.2s, v2.2s, #24
@@ -3500,27 +3492,19 @@ define <8 x double> @utofp_v8i8_v8f64(<8 x i8> %a) {
 ; CHECK-SD-LABEL: utofp_v8i8_v8f64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    umov w8, v0.b[0]
-; CHECK-SD-NEXT:    umov w9, v0.b[2]
-; CHECK-SD-NEXT:    umov w11, v0.b[4]
-; CHECK-SD-NEXT:    umov w12, v0.b[6]
-; CHECK-SD-NEXT:    umov w10, v0.b[1]
-; CHECK-SD-NEXT:    umov w13, v0.b[3]
-; CHECK-SD-NEXT:    umov w14, v0.b[5]
-; CHECK-SD-NEXT:    umov w15, v0.b[7]
+; CHECK-SD-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v3.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v4.b[0], v0.b[4]
+; CHECK-SD-NEXT:    mov v5.b[0], v0.b[6]
 ; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT:    fmov s0, w8
-; CHECK-SD-NEXT:    fmov s2, w9
-; CHECK-SD-NEXT:    fmov s3, w11
-; CHECK-SD-NEXT:    fmov s4, w12
-; CHECK-SD-NEXT:    mov v0.s[1], w10
-; CHECK-SD-NEXT:    mov v2.s[1], w13
-; CHECK-SD-NEXT:    mov v3.s[1], w14
-; CHECK-SD-NEXT:    mov v4.s[1], w15
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    and v2.8b, v2.8b, v1.8b
-; CHECK-SD-NEXT:    and v3.8b, v3.8b, v1.8b
-; CHECK-SD-NEXT:    and v1.8b, v4.8b, v1.8b
+; CHECK-SD-NEXT:    mov v2.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v3.b[4], v0.b[3]
+; CHECK-SD-NEXT:    mov v4.b[4], v0.b[5]
+; CHECK-SD-NEXT:    mov v5.b[4], v0.b[7]
+; CHECK-SD-NEXT:    and v0.8b, v2.8b, v1.8b
+; CHECK-SD-NEXT:    and v2.8b, v3.8b, v1.8b
+; CHECK-SD-NEXT:    and v3.8b, v4.8b, v1.8b
+; CHECK-SD-NEXT:    and v1.8b, v5.8b, v1.8b
 ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
@@ -3553,68 +3537,52 @@ entry:
 define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) {
 ; CHECK-SD-LABEL: stofp_v16i8_v16f64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umov w8, v0.b[0]
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    umov w9, v0.b[1]
-; CHECK-SD-NEXT:    umov w10, v0.b[2]
-; CHECK-SD-NEXT:    umov w12, v0.b[4]
-; CHECK-SD-NEXT:    umov w14, v0.b[6]
-; CHECK-SD-NEXT:    umov w11, v0.b[3]
-; CHECK-SD-NEXT:    umov w13, v0.b[5]
-; CHECK-SD-NEXT:    fmov s2, w8
-; CHECK-SD-NEXT:    umov w15, v1.b[0]
-; CHECK-SD-NEXT:    umov w17, v1.b[2]
-; CHECK-SD-NEXT:    umov w0, v1.b[4]
-; CHECK-SD-NEXT:    umov w16, v1.b[1]
-; CHECK-SD-NEXT:    umov w18, v1.b[3]
-; CHECK-SD-NEXT:    umov w8, v0.b[7]
-; CHECK-SD-NEXT:    fmov s0, w10
-; CHECK-SD-NEXT:    umov w10, v1.b[5]
-; CHECK-SD-NEXT:    mov v2.s[1], w9
-; CHECK-SD-NEXT:    umov w9, v1.b[6]
-; CHECK-SD-NEXT:    fmov s3, w12
-; CHECK-SD-NEXT:    umov w12, v1.b[7]
-; CHECK-SD-NEXT:    fmov s1, w14
-; CHECK-SD-NEXT:    fmov s4, w15
-; CHECK-SD-NEXT:    fmov s5, w17
-; CHECK-SD-NEXT:    fmov s6, w0
-; CHECK-SD-NEXT:    mov v0.s[1], w11
-; CHECK-SD-NEXT:    mov v3.s[1], w13
-; CHECK-SD-NEXT:    fmov s7, w9
-; CHECK-SD-NEXT:    mov v1.s[1], w8
-; CHECK-SD-NEXT:    mov v4.s[1], w16
-; CHECK-SD-NEXT:    mov v5.s[1], w18
-; CHECK-SD-NEXT:    mov v6.s[1], w10
-; CHECK-SD-NEXT:    shl v2.2s, v2.2s, #24
-; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-SD-NEXT:    mov v7.s[1], w12
-; CHECK-SD-NEXT:    shl v3.2s, v3.2s, #24
-; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-SD-NEXT:    shl v4.2s, v4.2s, #24
-; CHECK-SD-NEXT:    sshr v2.2s, v2.2s, #24
-; CHECK-SD-NEXT:    shl v5.2s, v5.2s, #24
-; CHECK-SD-NEXT:    shl v6.2s, v6.2s, #24
+; CHECK-SD-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v3.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v4.b[0], v0.b[4]
+; CHECK-SD-NEXT:    mov v5.b[0], v0.b[6]
+; CHECK-SD-NEXT:    mov v6.b[0], v1.b[0]
+; CHECK-SD-NEXT:    mov v7.b[0], v1.b[2]
+; CHECK-SD-NEXT:    mov v16.b[0], v1.b[4]
+; CHECK-SD-NEXT:    mov v17.b[0], v1.b[6]
+; CHECK-SD-NEXT:    mov v2.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v3.b[4], v0.b[3]
+; CHECK-SD-NEXT:    mov v4.b[4], v0.b[5]
+; CHECK-SD-NEXT:    mov v5.b[4], v0.b[7]
+; CHECK-SD-NEXT:    mov v6.b[4], v1.b[1]
+; CHECK-SD-NEXT:    mov v7.b[4], v1.b[3]
+; CHECK-SD-NEXT:    mov v16.b[4], v1.b[5]
+; CHECK-SD-NEXT:    mov v17.b[4], v1.b[7]
+; CHECK-SD-NEXT:    shl v0.2s, v2.2s, #24
+; CHECK-SD-NEXT:    shl v1.2s, v3.2s, #24
+; CHECK-SD-NEXT:    shl v2.2s, v4.2s, #24
+; CHECK-SD-NEXT:    shl v3.2s, v5.2s, #24
+; CHECK-SD-NEXT:    shl v4.2s, v6.2s, #24
+; CHECK-SD-NEXT:    shl v5.2s, v7.2s, #24
+; CHECK-SD-NEXT:    shl v6.2s, v16.2s, #24
+; CHECK-SD-NEXT:    shl v7.2s, v17.2s, #24
 ; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
+; CHECK-SD-NEXT:    sshr v2.2s, v2.2s, #24
 ; CHECK-SD-NEXT:    sshr v3.2s, v3.2s, #24
-; CHECK-SD-NEXT:    shl v7.2s, v7.2s, #24
 ; CHECK-SD-NEXT:    sshr v4.2s, v4.2s, #24
-; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-SD-NEXT:    sshr v5.2s, v5.2s, #24
 ; CHECK-SD-NEXT:    sshr v6.2s, v6.2s, #24
+; CHECK-SD-NEXT:    sshr v7.2s, v7.2s, #24
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    sshll v16.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT:    sshr v7.2s, v7.2s, #24
 ; CHECK-SD-NEXT:    sshll v4.2d, v4.2s, #0
-; CHECK-SD-NEXT:    sshll v17.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshll v5.2d, v5.2s, #0
 ; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT:    scvtf v0.2d, v2.2d
-; CHECK-SD-NEXT:    scvtf v1.2d, v16.2d
-; CHECK-SD-NEXT:    scvtf v2.2d, v3.2d
 ; CHECK-SD-NEXT:    sshll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-SD-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT:    scvtf v3.2d, v3.2d
 ; CHECK-SD-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-SD-NEXT:    scvtf v3.2d, v17.2d
 ; CHECK-SD-NEXT:    scvtf v5.2d, v5.2d
 ; CHECK-SD-NEXT:    scvtf v6.2d, v6.2d
 ; CHECK-SD-NEXT:    scvtf v7.2d, v7.2d
@@ -3654,63 +3622,47 @@ define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) {
 ; CHECK-SD-LABEL: utofp_v16i8_v16f64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    umov w8, v0.b[0]
-; CHECK-SD-NEXT:    umov w10, v0.b[2]
-; CHECK-SD-NEXT:    umov w9, v0.b[1]
-; CHECK-SD-NEXT:    umov w12, v0.b[4]
-; CHECK-SD-NEXT:    umov w11, v0.b[3]
-; CHECK-SD-NEXT:    umov w13, v0.b[5]
-; CHECK-SD-NEXT:    umov w18, v0.b[6]
+; CHECK-SD-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v4.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v5.b[0], v0.b[4]
+; CHECK-SD-NEXT:    mov v6.b[0], v0.b[6]
 ; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT:    umov w14, v2.b[0]
-; CHECK-SD-NEXT:    umov w16, v2.b[2]
-; CHECK-SD-NEXT:    umov w0, v2.b[4]
-; CHECK-SD-NEXT:    fmov s3, w8
-; CHECK-SD-NEXT:    umov w8, v0.b[7]
-; CHECK-SD-NEXT:    fmov s0, w10
-; CHECK-SD-NEXT:    umov w10, v2.b[6]
-; CHECK-SD-NEXT:    umov w15, v2.b[1]
-; CHECK-SD-NEXT:    umov w17, v2.b[3]
-; CHECK-SD-NEXT:    fmov s4, w12
-; CHECK-SD-NEXT:    umov w12, v2.b[5]
-; CHECK-SD-NEXT:    fmov s7, w18
-; CHECK-SD-NEXT:    mov v3.s[1], w9
-; CHECK-SD-NEXT:    umov w9, v2.b[7]
-; CHECK-SD-NEXT:    fmov s2, w14
-; CHECK-SD-NEXT:    fmov s5, w16
-; CHECK-SD-NEXT:    fmov s6, w0
-; CHECK-SD-NEXT:    mov v0.s[1], w11
-; CHECK-SD-NEXT:    fmov s16, w10
-; CHECK-SD-NEXT:    mov v4.s[1], w13
-; CHECK-SD-NEXT:    mov v7.s[1], w8
-; CHECK-SD-NEXT:    mov v2.s[1], w15
-; CHECK-SD-NEXT:    mov v5.s[1], w17
-; CHECK-SD-NEXT:    mov v6.s[1], w12
-; CHECK-SD-NEXT:    and v3.8b, v3.8b, v1.8b
-; CHECK-SD-NEXT:    mov v16.s[1], w9
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    and v4.8b, v4.8b, v1.8b
-; CHECK-SD-NEXT:    and v7.8b, v7.8b, v1.8b
-; CHECK-SD-NEXT:    and v2.8b, v2.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT:    and v5.8b, v5.8b, v1.8b
-; CHECK-SD-NEXT:    and v6.8b, v6.8b, v1.8b
-; CHECK-SD-NEXT:    and v1.8b, v16.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v16.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ushll v17.2d, v4.2s, #0
+; CHECK-SD-NEXT:    mov v7.b[0], v2.b[0]
+; CHECK-SD-NEXT:    mov v16.b[0], v2.b[2]
+; CHECK-SD-NEXT:    mov v17.b[0], v2.b[4]
+; CHECK-SD-NEXT:    mov v18.b[0], v2.b[6]
+; CHECK-SD-NEXT:    mov v3.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v4.b[4], v0.b[3]
+; CHECK-SD-NEXT:    mov v5.b[4], v0.b[5]
+; CHECK-SD-NEXT:    mov v6.b[4], v0.b[7]
+; CHECK-SD-NEXT:    mov v7.b[4], v2.b[1]
+; CHECK-SD-NEXT:    mov v16.b[4], v2.b[3]
+; CHECK-SD-NEXT:    mov v17.b[4], v2.b[5]
+; CHECK-SD-NEXT:    mov v18.b[4], v2.b[7]
+; CHECK-SD-NEXT:    and v0.8b, v3.8b, v1.8b
+; CHECK-SD-NEXT:    and v2.8b, v4.8b, v1.8b
+; CHECK-SD-NEXT:    and v3.8b, v5.8b, v1.8b
+; CHECK-SD-NEXT:    and v4.8b, v6.8b, v1.8b
+; CHECK-SD-NEXT:    and v5.8b, v7.8b, v1.8b
+; CHECK-SD-NEXT:    and v6.8b, v16.8b, v1.8b
+; CHECK-SD-NEXT:    and v7.8b, v17.8b, v1.8b
+; CHECK-SD-NEXT:    and v1.8b, v18.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    ushll v7.2d, v7.2s, #0
-; CHECK-SD-NEXT:    ucvtf v0.2d, v3.2d
+; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v4.2s, #0
 ; CHECK-SD-NEXT:    ushll v5.2d, v5.2s, #0
 ; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT:    ushll v18.2d, v1.2s, #0
-; CHECK-SD-NEXT:    ucvtf v1.2d, v16.2d
-; CHECK-SD-NEXT:    ucvtf v4.2d, v2.2d
-; CHECK-SD-NEXT:    ucvtf v2.2d, v17.2d
-; CHECK-SD-NEXT:    ucvtf v3.2d, v7.2d
-; CHECK-SD-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-SD-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-SD-NEXT:    ucvtf v7.2d, v18.2d
+; CHECK-SD-NEXT:    ushll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT:    ushll v16.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-SD-NEXT:    ucvtf v1.2d, v2.2d
+; CHECK-SD-NEXT:    ucvtf v2.2d, v3.2d
+; CHECK-SD-NEXT:    ucvtf v3.2d, v4.2d
+; CHECK-SD-NEXT:    ucvtf v4.2d, v5.2d
+; CHECK-SD-NEXT:    ucvtf v5.2d, v6.2d
+; CHECK-SD-NEXT:    ucvtf v6.2d, v7.2d
+; CHECK-SD-NEXT:    ucvtf v7.2d, v16.2d
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v16i8_v16f64:
@@ -3747,143 +3699,111 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
 ; CHECK-SD-LABEL: stofp_v32i8_v32f64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT:    umov w9, v3.b[0]
-; CHECK-SD-NEXT:    umov w11, v3.b[4]
-; CHECK-SD-NEXT:    umov w13, v3.b[6]
-; CHECK-SD-NEXT:    umov w18, v2.b[2]
-; CHECK-SD-NEXT:    umov w10, v3.b[2]
-; CHECK-SD-NEXT:    umov w12, v3.b[1]
-; CHECK-SD-NEXT:    umov w16, v2.b[0]
-; CHECK-SD-NEXT:    umov w14, v3.b[3]
-; CHECK-SD-NEXT:    umov w15, v3.b[5]
-; CHECK-SD-NEXT:    umov w17, v3.b[7]
-; CHECK-SD-NEXT:    fmov s6, w9
-; CHECK-SD-NEXT:    fmov s5, w11
-; CHECK-SD-NEXT:    fmov s7, w13
-; CHECK-SD-NEXT:    umov w13, v2.b[4]
-; CHECK-SD-NEXT:    umov w11, v2.b[3]
-; CHECK-SD-NEXT:    umov w9, v2.b[6]
-; CHECK-SD-NEXT:    fmov s17, w18
-; CHECK-SD-NEXT:    fmov s4, w10
-; CHECK-SD-NEXT:    umov w10, v2.b[1]
-; CHECK-SD-NEXT:    mov v6.s[1], w12
-; CHECK-SD-NEXT:    fmov s3, w16
-; CHECK-SD-NEXT:    umov w12, v2.b[5]
-; CHECK-SD-NEXT:    mov v5.s[1], w15
-; CHECK-SD-NEXT:    umov w15, v1.b[0]
-; CHECK-SD-NEXT:    umov w16, v0.b[6]
-; CHECK-SD-NEXT:    fmov s16, w13
-; CHECK-SD-NEXT:    umov w13, v1.b[2]
-; CHECK-SD-NEXT:    mov v17.s[1], w11
-; CHECK-SD-NEXT:    umov w11, v1.b[6]
-; CHECK-SD-NEXT:    fmov s18, w9
-; CHECK-SD-NEXT:    umov w9, v1.b[4]
-; CHECK-SD-NEXT:    mov v3.s[1], w10
-; CHECK-SD-NEXT:    umov w10, v0.b[0]
-; CHECK-SD-NEXT:    mov v4.s[1], w14
-; CHECK-SD-NEXT:    mov v16.s[1], w12
-; CHECK-SD-NEXT:    umov w12, v1.b[7]
-; CHECK-SD-NEXT:    umov w14, v1.b[5]
-; CHECK-SD-NEXT:    fmov s20, w13
-; CHECK-SD-NEXT:    umov w13, v1.b[3]
-; CHECK-SD-NEXT:    fmov s22, w15
-; CHECK-SD-NEXT:    fmov s21, w11
-; CHECK-SD-NEXT:    umov w11, v1.b[1]
-; CHECK-SD-NEXT:    fmov s1, w9
-; CHECK-SD-NEXT:    fmov s19, w10
-; CHECK-SD-NEXT:    umov w10, v0.b[4]
-; CHECK-SD-NEXT:    umov w9, v0.b[7]
-; CHECK-SD-NEXT:    fmov s23, w16
-; CHECK-SD-NEXT:    mov v7.s[1], w17
+; CHECK-SD-NEXT:    mov v5.b[0], v1.b[6]
+; CHECK-SD-NEXT:    mov v17.b[0], v1.b[4]
+; CHECK-SD-NEXT:    mov v20.b[0], v1.b[2]
+; CHECK-SD-NEXT:    mov v21.b[0], v1.b[0]
+; CHECK-SD-NEXT:    mov v18.b[0], v0.b[0]
+; CHECK-SD-NEXT:    mov v19.b[0], v0.b[6]
+; CHECK-SD-NEXT:    mov v22.b[0], v0.b[4]
+; CHECK-SD-NEXT:    ext v16.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT:    mov v2.b[0], v3.b[0]
+; CHECK-SD-NEXT:    mov v4.b[0], v3.b[2]
+; CHECK-SD-NEXT:    mov v6.b[0], v3.b[4]
+; CHECK-SD-NEXT:    mov v7.b[0], v3.b[6]
+; CHECK-SD-NEXT:    mov v5.b[4], v1.b[7]
+; CHECK-SD-NEXT:    mov v17.b[4], v1.b[5]
+; CHECK-SD-NEXT:    mov v20.b[4], v1.b[3]
+; CHECK-SD-NEXT:    mov v21.b[4], v1.b[1]
+; CHECK-SD-NEXT:    mov v19.b[4], v0.b[7]
+; CHECK-SD-NEXT:    mov v22.b[4], v0.b[5]
+; CHECK-SD-NEXT:    mov v18.b[4], v0.b[1]
+; CHECK-SD-NEXT:    mov v23.b[0], v16.b[0]
+; CHECK-SD-NEXT:    mov v2.b[4], v3.b[1]
+; CHECK-SD-NEXT:    mov v4.b[4], v3.b[3]
+; CHECK-SD-NEXT:    mov v6.b[4], v3.b[5]
+; CHECK-SD-NEXT:    mov v7.b[4], v3.b[7]
+; CHECK-SD-NEXT:    mov v3.b[0], v0.b[2]
+; CHECK-SD-NEXT:    shl v5.2s, v5.2s, #24
 ; CHECK-SD-NEXT:    shl v17.2s, v17.2s, #24
-; CHECK-SD-NEXT:    mov v21.s[1], w12
-; CHECK-SD-NEXT:    mov v1.s[1], w14
-; CHECK-SD-NEXT:    umov w14, v0.b[5]
-; CHECK-SD-NEXT:    umov w12, v0.b[1]
-; CHECK-SD-NEXT:    mov v20.s[1], w13
-; CHECK-SD-NEXT:    umov w13, v0.b[2]
-; CHECK-SD-NEXT:    mov v22.s[1], w11
-; CHECK-SD-NEXT:    umov w11, v0.b[3]
-; CHECK-SD-NEXT:    fmov s0, w10
-; CHECK-SD-NEXT:    mov v23.s[1], w9
-; CHECK-SD-NEXT:    umov w9, v2.b[7]
-; CHECK-SD-NEXT:    shl v16.2s, v16.2s, #24
-; CHECK-SD-NEXT:    shl v21.2s, v21.2s, #24
-; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-SD-NEXT:    sshr v17.2s, v17.2s, #24
-; CHECK-SD-NEXT:    mov v0.s[1], w14
-; CHECK-SD-NEXT:    fmov s24, w13
-; CHECK-SD-NEXT:    mov v19.s[1], w12
-; CHECK-SD-NEXT:    sshr v16.2s, v16.2s, #24
-; CHECK-SD-NEXT:    shl v6.2s, v6.2s, #24
 ; CHECK-SD-NEXT:    shl v20.2s, v20.2s, #24
-; CHECK-SD-NEXT:    sshr v21.2s, v21.2s, #24
-; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
-; CHECK-SD-NEXT:    shl v2.2s, v23.2s, #24
-; CHECK-SD-NEXT:    mov v18.s[1], w9
-; CHECK-SD-NEXT:    mov v24.s[1], w11
-; CHECK-SD-NEXT:    shl v22.2s, v22.2s, #24
-; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-SD-NEXT:    shl v4.2s, v4.2s, #24
-; CHECK-SD-NEXT:    shl v5.2s, v5.2s, #24
-; CHECK-SD-NEXT:    sshll v21.2d, v21.2s, #0
-; CHECK-SD-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT:    sshr v2.2s, v2.2s, #24
-; CHECK-SD-NEXT:    shl v7.2s, v7.2s, #24
+; CHECK-SD-NEXT:    mov v24.b[0], v16.b[4]
+; CHECK-SD-NEXT:    mov v23.b[4], v16.b[1]
+; CHECK-SD-NEXT:    shl v18.2s, v18.2s, #24
 ; CHECK-SD-NEXT:    shl v19.2s, v19.2s, #24
+; CHECK-SD-NEXT:    sshr v5.2s, v5.2s, #24
+; CHECK-SD-NEXT:    shl v1.2s, v2.2s, #24
+; CHECK-SD-NEXT:    shl v2.2s, v4.2s, #24
+; CHECK-SD-NEXT:    sshr v17.2s, v17.2s, #24
+; CHECK-SD-NEXT:    mov v3.b[4], v0.b[3]
+; CHECK-SD-NEXT:    shl v0.2s, v21.2s, #24
+; CHECK-SD-NEXT:    shl v4.2s, v6.2s, #24
+; CHECK-SD-NEXT:    shl v6.2s, v7.2s, #24
+; CHECK-SD-NEXT:    mov v7.b[0], v16.b[2]
+; CHECK-SD-NEXT:    sshll v5.2d, v5.2s, #0
 ; CHECK-SD-NEXT:    sshr v20.2s, v20.2s, #24
+; CHECK-SD-NEXT:    mov v21.b[0], v16.b[6]
+; CHECK-SD-NEXT:    sshll v17.2d, v17.2s, #0
 ; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
-; CHECK-SD-NEXT:    shl v18.2s, v18.2s, #24
-; CHECK-SD-NEXT:    shl v23.2s, v24.2s, #24
-; CHECK-SD-NEXT:    scvtf v21.2d, v21.2d
-; CHECK-SD-NEXT:    scvtf v1.2d, v1.2d
-; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    sshr v22.2s, v22.2s, #24
+; CHECK-SD-NEXT:    shl v22.2s, v22.2s, #24
+; CHECK-SD-NEXT:    shl v3.2s, v3.2s, #24
+; CHECK-SD-NEXT:    mov v24.b[4], v16.b[5]
 ; CHECK-SD-NEXT:    sshr v19.2s, v19.2s, #24
-; CHECK-SD-NEXT:    sshr v5.2s, v5.2s, #24
+; CHECK-SD-NEXT:    scvtf v5.2d, v5.2d
+; CHECK-SD-NEXT:    mov v7.b[4], v16.b[3]
+; CHECK-SD-NEXT:    sshll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT:    scvtf v17.2d, v17.2d
 ; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    sshr v23.2s, v23.2s, #24
-; CHECK-SD-NEXT:    sshr v4.2s, v4.2s, #24
-; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT:    mov v21.b[4], v16.b[7]
+; CHECK-SD-NEXT:    sshr v3.2s, v3.2s, #24
+; CHECK-SD-NEXT:    sshr v16.2s, v22.2s, #24
+; CHECK-SD-NEXT:    sshll v19.2d, v19.2s, #0
+; CHECK-SD-NEXT:    scvtf v20.2d, v20.2d
+; CHECK-SD-NEXT:    shl v22.2s, v24.2s, #24
 ; CHECK-SD-NEXT:    sshr v6.2s, v6.2s, #24
-; CHECK-SD-NEXT:    sshll v20.2d, v20.2s, #0
-; CHECK-SD-NEXT:    stp q1, q21, [x8, #160]
-; CHECK-SD-NEXT:    shl v1.2s, v3.2s, #24
-; CHECK-SD-NEXT:    sshr v3.2s, v18.2s, #24
 ; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    sshll v22.2d, v22.2s, #0
-; CHECK-SD-NEXT:    sshll v18.2d, v23.2s, #0
-; CHECK-SD-NEXT:    sshll v19.2d, v19.2s, #0
-; CHECK-SD-NEXT:    sshll v5.2d, v5.2s, #0
-; CHECK-SD-NEXT:    sshll v4.2d, v4.2s, #0
+; CHECK-SD-NEXT:    shl v7.2s, v7.2s, #24
+; CHECK-SD-NEXT:    sshr v4.2s, v4.2s, #24
+; CHECK-SD-NEXT:    stp q17, q5, [x8, #160]
+; CHECK-SD-NEXT:    sshr v5.2s, v18.2s, #24
+; CHECK-SD-NEXT:    shl v17.2s, v23.2s, #24
+; CHECK-SD-NEXT:    sshll v3.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll v16.2d, v16.2s, #0
+; CHECK-SD-NEXT:    shl v21.2s, v21.2s, #24
+; CHECK-SD-NEXT:    scvtf v19.2d, v19.2d
+; CHECK-SD-NEXT:    sshr v2.2s, v2.2s, #24
 ; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
-; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT:    scvtf v20.2d, v20.2d
-; CHECK-SD-NEXT:    scvtf v22.2d, v22.2d
-; CHECK-SD-NEXT:    stp q0, q2, [x8, #32]
-; CHECK-SD-NEXT:    sshll v2.2d, v3.2s, #0
-; CHECK-SD-NEXT:    sshll v3.2d, v16.2s, #0
-; CHECK-SD-NEXT:    sshll v16.2d, v17.2s, #0
-; CHECK-SD-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    stp q0, q20, [x8, #128]
+; CHECK-SD-NEXT:    sshll v5.2d, v5.2s, #0
 ; CHECK-SD-NEXT:    sshr v0.2s, v7.2s, #24
-; CHECK-SD-NEXT:    scvtf v7.2d, v18.2d
-; CHECK-SD-NEXT:    scvtf v17.2d, v19.2d
-; CHECK-SD-NEXT:    stp q22, q20, [x8, #128]
-; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT:    sshr v7.2s, v17.2s, #24
 ; CHECK-SD-NEXT:    scvtf v3.2d, v3.2d
 ; CHECK-SD-NEXT:    scvtf v16.2d, v16.2d
-; CHECK-SD-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-SD-NEXT:    sshr v18.2s, v21.2s, #24
+; CHECK-SD-NEXT:    sshr v20.2s, v22.2s, #24
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    scvtf v5.2d, v5.2d
 ; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    stp q17, q7, [x8]
-; CHECK-SD-NEXT:    stp q3, q2, [x8, #224]
+; CHECK-SD-NEXT:    sshll v4.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    stp q16, q19, [x8, #32]
+; CHECK-SD-NEXT:    sshll v17.2d, v18.2s, #0
+; CHECK-SD-NEXT:    sshll v16.2d, v20.2s, #0
 ; CHECK-SD-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    scvtf v2.2d, v5.2d
-; CHECK-SD-NEXT:    stp q1, q16, [x8, #192]
-; CHECK-SD-NEXT:    scvtf v3.2d, v4.2d
-; CHECK-SD-NEXT:    scvtf v1.2d, v6.2d
-; CHECK-SD-NEXT:    stp q2, q0, [x8, #96]
-; CHECK-SD-NEXT:    stp q1, q3, [x8, #64]
+; CHECK-SD-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-SD-NEXT:    stp q5, q3, [x8]
+; CHECK-SD-NEXT:    scvtf v3.2d, v7.2d
+; CHECK-SD-NEXT:    scvtf v5.2d, v6.2d
+; CHECK-SD-NEXT:    scvtf v17.2d, v17.2d
+; CHECK-SD-NEXT:    scvtf v16.2d, v16.2d
+; CHECK-SD-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-SD-NEXT:    stp q4, q5, [x8, #96]
+; CHECK-SD-NEXT:    stp q3, q0, [x8, #192]
+; CHECK-SD-NEXT:    scvtf v0.2d, v1.2d
+; CHECK-SD-NEXT:    stp q16, q17, [x8, #224]
+; CHECK-SD-NEXT:    stp q0, q2, [x8, #64]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: stofp_v32i8_v32f64:
@@ -3949,129 +3869,97 @@ entry:
 define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) {
 ; CHECK-SD-LABEL: utofp_v32i8_v32f64:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov v6.b[0], v1.b[6]
+; CHECK-SD-NEXT:    mov v7.b[0], v1.b[4]
+; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-SD-NEXT:    mov v16.b[0], v1.b[2]
+; CHECK-SD-NEXT:    mov v17.b[0], v1.b[0]
+; CHECK-SD-NEXT:    mov v19.b[0], v0.b[6]
+; CHECK-SD-NEXT:    mov v20.b[0], v0.b[4]
+; CHECK-SD-NEXT:    movi d5, #0x0000ff000000ff
+; CHECK-SD-NEXT:    mov v24.b[0], v0.b[2]
+; CHECK-SD-NEXT:    mov v25.b[0], v0.b[0]
 ; CHECK-SD-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT:    ext v16.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT:    movi d3, #0x0000ff000000ff
-; CHECK-SD-NEXT:    umov w11, v2.b[0]
-; CHECK-SD-NEXT:    umov w14, v2.b[4]
-; CHECK-SD-NEXT:    umov w12, v2.b[2]
-; CHECK-SD-NEXT:    umov w15, v2.b[6]
-; CHECK-SD-NEXT:    umov w16, v2.b[1]
-; CHECK-SD-NEXT:    umov w13, v16.b[2]
-; CHECK-SD-NEXT:    umov w17, v16.b[0]
-; CHECK-SD-NEXT:    umov w9, v2.b[3]
-; CHECK-SD-NEXT:    umov w10, v2.b[5]
-; CHECK-SD-NEXT:    fmov s4, w11
-; CHECK-SD-NEXT:    fmov s6, w14
-; CHECK-SD-NEXT:    umov w14, v16.b[6]
-; CHECK-SD-NEXT:    fmov s5, w12
-; CHECK-SD-NEXT:    umov w12, v16.b[4]
-; CHECK-SD-NEXT:    umov w11, v16.b[1]
-; CHECK-SD-NEXT:    fmov s7, w15
-; CHECK-SD-NEXT:    umov w15, v16.b[7]
-; CHECK-SD-NEXT:    fmov s18, w13
-; CHECK-SD-NEXT:    mov v4.s[1], w16
-; CHECK-SD-NEXT:    umov w16, v1.b[4]
-; CHECK-SD-NEXT:    umov w13, v1.b[6]
-; CHECK-SD-NEXT:    fmov s17, w17
-; CHECK-SD-NEXT:    fmov s20, w14
-; CHECK-SD-NEXT:    mov v5.s[1], w9
-; CHECK-SD-NEXT:    umov w9, v1.b[7]
-; CHECK-SD-NEXT:    fmov s19, w12
-; CHECK-SD-NEXT:    mov v6.s[1], w10
-; CHECK-SD-NEXT:    umov w10, v1.b[2]
-; CHECK-SD-NEXT:    umov w12, v0.b[6]
-; CHECK-SD-NEXT:    umov w14, v1.b[0]
-; CHECK-SD-NEXT:    mov v17.s[1], w11
-; CHECK-SD-NEXT:    mov v20.s[1], w15
-; CHECK-SD-NEXT:    umov w11, v1.b[5]
-; CHECK-SD-NEXT:    umov w15, v1.b[3]
-; CHECK-SD-NEXT:    fmov s21, w16
-; CHECK-SD-NEXT:    umov w16, v1.b[1]
-; CHECK-SD-NEXT:    fmov s1, w13
-; CHECK-SD-NEXT:    umov w13, v0.b[4]
-; CHECK-SD-NEXT:    and v6.8b, v6.8b, v3.8b
-; CHECK-SD-NEXT:    fmov s22, w10
-; CHECK-SD-NEXT:    fmov s23, w12
-; CHECK-SD-NEXT:    fmov s24, w14
-; CHECK-SD-NEXT:    mov v21.s[1], w11
-; CHECK-SD-NEXT:    umov w10, v0.b[5]
-; CHECK-SD-NEXT:    umov w12, v0.b[0]
-; CHECK-SD-NEXT:    mov v1.s[1], w9
-; CHECK-SD-NEXT:    umov w9, v0.b[7]
-; CHECK-SD-NEXT:    umov w11, v16.b[3]
-; CHECK-SD-NEXT:    mov v22.s[1], w15
-; CHECK-SD-NEXT:    mov v24.s[1], w16
-; CHECK-SD-NEXT:    fmov s25, w13
-; CHECK-SD-NEXT:    umov w13, v0.b[3]
-; CHECK-SD-NEXT:    and v20.8b, v20.8b, v3.8b
-; CHECK-SD-NEXT:    and v5.8b, v5.8b, v3.8b
-; CHECK-SD-NEXT:    and v21.8b, v21.8b, v3.8b
-; CHECK-SD-NEXT:    mov v23.s[1], w9
-; CHECK-SD-NEXT:    umov w9, v0.b[2]
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v3.8b
-; CHECK-SD-NEXT:    mov v25.s[1], w10
-; CHECK-SD-NEXT:    umov w10, v0.b[1]
-; CHECK-SD-NEXT:    and v0.8b, v22.8b, v3.8b
-; CHECK-SD-NEXT:    fmov s22, w12
-; CHECK-SD-NEXT:    and v24.8b, v24.8b, v3.8b
-; CHECK-SD-NEXT:    umov w12, v16.b[5]
-; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT:    ushll v21.2d, v21.2s, #0
-; CHECK-SD-NEXT:    mov v18.s[1], w11
-; CHECK-SD-NEXT:    and v16.8b, v23.8b, v3.8b
-; CHECK-SD-NEXT:    fmov s23, w9
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    and v25.8b, v25.8b, v3.8b
-; CHECK-SD-NEXT:    ushll v24.2d, v24.2s, #0
-; CHECK-SD-NEXT:    mov v22.s[1], w10
-; CHECK-SD-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-SD-NEXT:    umov w9, v2.b[7]
-; CHECK-SD-NEXT:    mov v23.s[1], w13
+; CHECK-SD-NEXT:    mov v6.b[4], v1.b[7]
+; CHECK-SD-NEXT:    mov v7.b[4], v1.b[5]
+; CHECK-SD-NEXT:    mov v18.b[0], v3.b[0]
+; CHECK-SD-NEXT:    mov v21.b[0], v3.b[2]
+; CHECK-SD-NEXT:    mov v23.b[0], v3.b[4]
+; CHECK-SD-NEXT:    mov v16.b[4], v1.b[3]
+; CHECK-SD-NEXT:    mov v17.b[4], v1.b[1]
+; CHECK-SD-NEXT:    mov v1.b[0], v3.b[6]
+; CHECK-SD-NEXT:    mov v19.b[4], v0.b[7]
+; CHECK-SD-NEXT:    mov v20.b[4], v0.b[5]
+; CHECK-SD-NEXT:    mov v24.b[4], v0.b[3]
+; CHECK-SD-NEXT:    mov v25.b[4], v0.b[1]
+; CHECK-SD-NEXT:    and v6.8b, v6.8b, v5.8b
+; CHECK-SD-NEXT:    and v7.8b, v7.8b, v5.8b
+; CHECK-SD-NEXT:    mov v18.b[4], v3.b[1]
+; CHECK-SD-NEXT:    mov v21.b[4], v3.b[3]
+; CHECK-SD-NEXT:    mov v23.b[4], v3.b[5]
+; CHECK-SD-NEXT:    and v16.8b, v16.8b, v5.8b
+; CHECK-SD-NEXT:    and v17.8b, v17.8b, v5.8b
+; CHECK-SD-NEXT:    mov v1.b[4], v3.b[7]
+; CHECK-SD-NEXT:    and v3.8b, v19.8b, v5.8b
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ushll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT:    and v20.8b, v20.8b, v5.8b
 ; CHECK-SD-NEXT:    ushll v16.2d, v16.2s, #0
-; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-SD-NEXT:    ushll v2.2d, v25.2s, #0
-; CHECK-SD-NEXT:    mov v19.s[1], w12
-; CHECK-SD-NEXT:    ucvtf v24.2d, v24.2d
-; CHECK-SD-NEXT:    and v18.8b, v18.8b, v3.8b
-; CHECK-SD-NEXT:    stp q21, q1, [x8, #160]
-; CHECK-SD-NEXT:    and v1.8b, v22.8b, v3.8b
+; CHECK-SD-NEXT:    mov v4.b[0], v2.b[0]
+; CHECK-SD-NEXT:    mov v22.b[0], v2.b[2]
+; CHECK-SD-NEXT:    ushll v17.2d, v17.2s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    mov v19.b[0], v2.b[4]
+; CHECK-SD-NEXT:    ucvtf v6.2d, v6.2d
+; CHECK-SD-NEXT:    ucvtf v3.2d, v7.2d
+; CHECK-SD-NEXT:    ushll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT:    mov v7.b[0], v2.b[6]
 ; CHECK-SD-NEXT:    ucvtf v16.2d, v16.2d
-; CHECK-SD-NEXT:    and v23.8b, v23.8b, v3.8b
-; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    mov v7.s[1], w9
-; CHECK-SD-NEXT:    stp q24, q0, [x8, #128]
-; CHECK-SD-NEXT:    and v0.8b, v19.8b, v3.8b
-; CHECK-SD-NEXT:    ushll v18.2d, v18.2s, #0
-; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT:    ushll v19.2d, v23.2s, #0
-; CHECK-SD-NEXT:    stp q2, q16, [x8, #32]
-; CHECK-SD-NEXT:    and v16.8b, v17.8b, v3.8b
-; CHECK-SD-NEXT:    ushll v17.2d, v20.2s, #0
-; CHECK-SD-NEXT:    and v2.8b, v7.8b, v3.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT:    ucvtf v7.2d, v19.2d
-; CHECK-SD-NEXT:    and v3.8b, v4.8b, v3.8b
-; CHECK-SD-NEXT:    ushll v4.2d, v6.2s, #0
+; CHECK-SD-NEXT:    and v24.8b, v24.8b, v5.8b
 ; CHECK-SD-NEXT:    ucvtf v17.2d, v17.2d
-; CHECK-SD-NEXT:    ushll v16.2d, v16.2s, #0
-; CHECK-SD-NEXT:    ucvtf v6.2d, v18.2d
 ; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-SD-NEXT:    mov v4.b[4], v2.b[1]
+; CHECK-SD-NEXT:    ucvtf v20.2d, v20.2d
+; CHECK-SD-NEXT:    mov v22.b[4], v2.b[3]
+; CHECK-SD-NEXT:    mov v19.b[4], v2.b[5]
+; CHECK-SD-NEXT:    stp q3, q6, [x8, #160]
+; CHECK-SD-NEXT:    and v6.8b, v25.8b, v5.8b
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v5.8b
+; CHECK-SD-NEXT:    mov v7.b[4], v2.b[7]
+; CHECK-SD-NEXT:    and v2.8b, v23.8b, v5.8b
+; CHECK-SD-NEXT:    ushll v3.2d, v24.2s, #0
+; CHECK-SD-NEXT:    stp q17, q16, [x8, #128]
+; CHECK-SD-NEXT:    and v16.8b, v21.8b, v5.8b
+; CHECK-SD-NEXT:    and v4.8b, v4.8b, v5.8b
+; CHECK-SD-NEXT:    stp q20, q0, [x8, #32]
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    and v0.8b, v18.8b, v5.8b
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT:    stp q1, q7, [x8]
-; CHECK-SD-NEXT:    ushll v1.2d, v5.2s, #0
-; CHECK-SD-NEXT:    ucvtf v5.2d, v16.2d
-; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
-; CHECK-SD-NEXT:    stp q0, q17, [x8, #224]
-; CHECK-SD-NEXT:    ucvtf v0.2d, v4.2d
 ; CHECK-SD-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-SD-NEXT:    ushll v16.2d, v16.2s, #0
+; CHECK-SD-NEXT:    and v7.8b, v7.8b, v5.8b
+; CHECK-SD-NEXT:    and v17.8b, v19.8b, v5.8b
+; CHECK-SD-NEXT:    ucvtf v6.2d, v6.2d
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    and v18.8b, v22.8b, v5.8b
 ; CHECK-SD-NEXT:    ucvtf v1.2d, v1.2d
-; CHECK-SD-NEXT:    stp q5, q6, [x8, #192]
-; CHECK-SD-NEXT:    stp q0, q2, [x8, #96]
-; CHECK-SD-NEXT:    stp q3, q1, [x8, #64]
+; CHECK-SD-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-SD-NEXT:    ushll v4.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ucvtf v16.2d, v16.2d
+; CHECK-SD-NEXT:    ushll v5.2d, v7.2s, #0
+; CHECK-SD-NEXT:    ushll v7.2d, v17.2s, #0
+; CHECK-SD-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-SD-NEXT:    stp q6, q3, [x8]
+; CHECK-SD-NEXT:    ushll v3.2d, v18.2s, #0
+; CHECK-SD-NEXT:    stp q2, q1, [x8, #224]
+; CHECK-SD-NEXT:    ucvtf v5.2d, v5.2d
+; CHECK-SD-NEXT:    ucvtf v1.2d, v7.2d
+; CHECK-SD-NEXT:    stp q0, q16, [x8, #192]
+; CHECK-SD-NEXT:    ucvtf v2.2d, v3.2d
+; CHECK-SD-NEXT:    ucvtf v0.2d, v4.2d
+; CHECK-SD-NEXT:    stp q1, q5, [x8, #96]
+; CHECK-SD-NEXT:    stp q0, q2, [x8, #64]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: utofp_v32i8_v32f64:

diff  --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
index 07772b716ec58..c039da26b7c15 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -554,11 +554,9 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %word) {
 define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
 ; CHECK-LE-LABEL: bitcast_i16_to_v2i8:
 ; CHECK-LE:       // %bb.0:
-; CHECK-LE-NEXT:    fmov s0, w0
-; CHECK-LE-NEXT:    umov w8, v0.b[0]
-; CHECK-LE-NEXT:    umov w9, v0.b[1]
-; CHECK-LE-NEXT:    fmov s0, w8
-; CHECK-LE-NEXT:    mov v0.s[1], w9
+; CHECK-LE-NEXT:    fmov s1, w0
+; CHECK-LE-NEXT:    mov v0.b[0], v1.b[0]
+; CHECK-LE-NEXT:    mov v0.b[4], v1.b[1]
 ; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-LE-NEXT:    ret
 ;
@@ -566,11 +564,9 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    fmov s0, w0
 ; CHECK-BE-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-BE-NEXT:    umov w8, v0.b[0]
-; CHECK-BE-NEXT:    umov w9, v0.b[1]
-; CHECK-BE-NEXT:    fmov s0, w8
-; CHECK-BE-NEXT:    mov v0.s[1], w9
-; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    mov v1.b[0], v0.b[0]
+; CHECK-BE-NEXT:    mov v1.b[4], v0.b[1]
+; CHECK-BE-NEXT:    rev64 v0.2s, v1.2s
 ; CHECK-BE-NEXT:    ret
   %ret = bitcast i16 %word to <2 x i8>
   ret <2 x i8> %ret

diff  --git a/llvm/test/CodeGen/AArch64/shuffle-extend.ll b/llvm/test/CodeGen/AArch64/shuffle-extend.ll
index bb31380cc3ade..7658e5ab6936b 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-extend.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-extend.ll
@@ -4,10 +4,8 @@
 define <2 x i8> @test_v16i8_v2i32_824(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_v16i8_v2i32_824:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.b[8]
-; CHECK-NEXT:    umov w9, v1.b[8]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    mov v0.b[0], v0.b[8]
+; CHECK-NEXT:    mov v0.b[4], v1.b[8]
 ; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %c = shufflevector <16 x i8> %a, <16 x i8> %b, <2 x i32> <i32 8, i32 24>
@@ -18,10 +16,8 @@ define <2 x i8> @test_v16i8_v2i32_824(<16 x i8> %a, <16 x i8> %b) {
 define <2 x i8> @test_v16i8_v2i32_016(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_v16i8_v2i32_016:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    umov w9, v1.b[0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-NEXT:    mov v0.b[4], v1.b[0]
 ; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %c = shufflevector <16 x i8> %a, <16 x i8> %b, <2 x i32> <i32 0, i32 16>
@@ -33,11 +29,9 @@ define <2 x i8> @test_v8i8_v2i32_08(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: test_v8i8_v2i32_08:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    umov w9, v1.b[0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-NEXT:    mov v0.b[4], v1.b[0]
 ; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %c = shufflevector <8 x i8> %a, <8 x i8> %b, <2 x i32> <i32 0, i32 8>
@@ -48,10 +42,8 @@ define <2 x i8> @test_v8i8_v2i32_08(<8 x i8> %a, <8 x i8> %b) {
 define <2 x i16> @test_v8i16_v2i32_08(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_v8i16_v2i32_08:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v1.h[0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    mov v0.h[0], v0.h[0]
+; CHECK-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %c = shufflevector <8 x i16> %a, <8 x i16> %b, <2 x i32> <i32 0, i32 8>
@@ -63,11 +55,9 @@ define <2 x i16> @test_v4i16_v2i32_04(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: test_v4i16_v2i32_04:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.h[0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    umov w9, v1.h[0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    mov v0.h[0], v0.h[0]
+; CHECK-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    ret
   %c = shufflevector <4 x i16> %a, <4 x i16> %b, <2 x i32> <i32 0, i32 4>
@@ -79,14 +69,10 @@ define <2 x i16> @test_v4i16_v2i32_04(<4 x i16> %a, <4 x i16> %b) {
 define <4 x i8> @test_v16i8_v4i16_824(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_v16i8_v4i16_824:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.b[8]
-; CHECK-NEXT:    umov w9, v1.b[8]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    umov w8, v1.b[0]
-; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    mov v2.b[0], v0.b[8]
+; CHECK-NEXT:    mov v2.b[2], v1.b[8]
+; CHECK-NEXT:    mov v2.b[4], v0.b[0]
+; CHECK-NEXT:    mov v2.b[6], v1.b[0]
 ; CHECK-NEXT:    add v0.4h, v2.4h, v2.4h
 ; CHECK-NEXT:    ret
   %c = shufflevector <16 x i8> %a, <16 x i8> %b, <4 x i32> <i32 8, i32 24, i32 0, i32 16>
@@ -97,14 +83,10 @@ define <4 x i8> @test_v16i8_v4i16_824(<16 x i8> %a, <16 x i8> %b) {
 define <4 x i8> @test_v16i8_v4i16_016(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_v16i8_v4i16_016:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    umov w9, v1.b[0]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v0.b[4]
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    umov w8, v1.b[4]
-; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-NEXT:    mov v2.b[2], v1.b[0]
+; CHECK-NEXT:    mov v2.b[4], v0.b[4]
+; CHECK-NEXT:    mov v2.b[6], v1.b[4]
 ; CHECK-NEXT:    add v0.4h, v2.4h, v2.4h
 ; CHECK-NEXT:    ret
   %c = shufflevector <16 x i8> %a, <16 x i8> %b, <4 x i32> <i32 0, i32 16, i32 4, i32 20>
@@ -116,15 +98,11 @@ define <4 x i8> @test_v8i8_v4i16_08(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: test_v8i8_v4i16_08:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[0]
+; CHECK-NEXT:    mov v2.b[0], v0.b[0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    umov w9, v1.b[0]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v0.b[4]
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    umov w8, v1.b[4]
-; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    mov v2.b[2], v1.b[0]
+; CHECK-NEXT:    mov v2.b[4], v0.b[4]
+; CHECK-NEXT:    mov v2.b[6], v1.b[4]
 ; CHECK-NEXT:    add v0.4h, v2.4h, v2.4h
 ; CHECK-NEXT:    ret
   %c = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 8, i32 4, i32 12>
@@ -215,23 +193,19 @@ define i1 @test2(ptr %add.ptr, ptr %result, <2 x i64> %hi, <2 x i64> %lo) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v3.16b, #1
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    cmgt v0.2d, v2.2d, v0.2d
 ; CHECK-NEXT:    cmgt v4.2d, v1.2d, v2.2d
 ; CHECK-NEXT:    sub v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    dup v2.2d, x9
 ; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    and v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    umov w8, v0.b[8]
-; CHECK-NEXT:    umov w9, v3.b[8]
-; CHECK-NEXT:    umov w10, v0.b[0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    umov w8, v3.b[0]
-; CHECK-NEXT:    fmov s3, w10
-; CHECK-NEXT:    mov v0.s[1], w9
-; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    mov v3.s[1], w8
-; CHECK-NEXT:    dup v2.2d, x9
-; CHECK-NEXT:    add v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    orr v0.8b, v0.8b, v3.8b
+; CHECK-NEXT:    mov v5.b[0], v0.b[8]
+; CHECK-NEXT:    mov v0.b[0], v0.b[0]
+; CHECK-NEXT:    mov v5.b[4], v3.b[8]
+; CHECK-NEXT:    mov v0.b[4], v3.b[0]
+; CHECK-NEXT:    add v3.2s, v5.2s, v5.2s
+; CHECK-NEXT:    orr v0.8b, v3.8b, v0.8b
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    add v0.2d, v1.2d, v2.2d

diff  --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index a6b43d514594e..d31659c30f21d 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -243,26 +243,18 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) {
 ; CHECK-LABEL: sitofp_v8i8_double:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    umov w9, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[4]
-; CHECK-NEXT:    umov w12, v0.b[6]
-; CHECK-NEXT:    umov w10, v0.b[1]
-; CHECK-NEXT:    umov w13, v0.b[3]
-; CHECK-NEXT:    umov w14, v0.b[5]
-; CHECK-NEXT:    umov w15, v0.b[7]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fmov s2, w11
-; CHECK-NEXT:    fmov s3, w12
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w13
-; CHECK-NEXT:    mov v2.s[1], w14
-; CHECK-NEXT:    mov v3.s[1], w15
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    shl v2.2s, v2.2s, #24
-; CHECK-NEXT:    shl v3.2s, v3.2s, #24
+; CHECK-NEXT:    mov v1.b[0], v0.b[0]
+; CHECK-NEXT:    mov v2.b[0], v0.b[2]
+; CHECK-NEXT:    mov v3.b[0], v0.b[4]
+; CHECK-NEXT:    mov v4.b[0], v0.b[6]
+; CHECK-NEXT:    mov v1.b[4], v0.b[1]
+; CHECK-NEXT:    mov v2.b[4], v0.b[3]
+; CHECK-NEXT:    mov v3.b[4], v0.b[5]
+; CHECK-NEXT:    mov v4.b[4], v0.b[7]
+; CHECK-NEXT:    shl v0.2s, v1.2s, #24
+; CHECK-NEXT:    shl v1.2s, v2.2s, #24
+; CHECK-NEXT:    shl v2.2s, v3.2s, #24
+; CHECK-NEXT:    shl v3.2s, v4.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-NEXT:    sshr v2.2s, v2.2s, #24
@@ -283,68 +275,52 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) {
 define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) {
 ; CHECK-LABEL: sitofp_v16i8_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umov w8, v0.b[0]
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    umov w9, v0.b[1]
-; CHECK-NEXT:    umov w10, v0.b[2]
-; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    umov w14, v0.b[6]
-; CHECK-NEXT:    umov w11, v0.b[3]
-; CHECK-NEXT:    umov w13, v0.b[5]
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w15, v1.b[0]
-; CHECK-NEXT:    umov w17, v1.b[2]
-; CHECK-NEXT:    umov w0, v1.b[4]
-; CHECK-NEXT:    umov w16, v1.b[1]
-; CHECK-NEXT:    umov w18, v1.b[3]
-; CHECK-NEXT:    umov w8, v0.b[7]
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    umov w10, v1.b[5]
-; CHECK-NEXT:    mov v2.s[1], w9
-; CHECK-NEXT:    umov w9, v1.b[6]
-; CHECK-NEXT:    fmov s3, w12
-; CHECK-NEXT:    umov w12, v1.b[7]
-; CHECK-NEXT:    fmov s1, w14
-; CHECK-NEXT:    fmov s4, w15
-; CHECK-NEXT:    fmov s5, w17
-; CHECK-NEXT:    fmov s6, w0
-; CHECK-NEXT:    mov v0.s[1], w11
-; CHECK-NEXT:    mov v3.s[1], w13
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    mov v1.s[1], w8
-; CHECK-NEXT:    mov v4.s[1], w16
-; CHECK-NEXT:    mov v5.s[1], w18
-; CHECK-NEXT:    mov v6.s[1], w10
-; CHECK-NEXT:    shl v2.2s, v2.2s, #24
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    mov v7.s[1], w12
-; CHECK-NEXT:    shl v3.2s, v3.2s, #24
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    shl v4.2s, v4.2s, #24
-; CHECK-NEXT:    sshr v2.2s, v2.2s, #24
-; CHECK-NEXT:    shl v5.2s, v5.2s, #24
-; CHECK-NEXT:    shl v6.2s, v6.2s, #24
+; CHECK-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-NEXT:    mov v3.b[0], v0.b[2]
+; CHECK-NEXT:    mov v4.b[0], v0.b[4]
+; CHECK-NEXT:    mov v5.b[0], v0.b[6]
+; CHECK-NEXT:    mov v6.b[0], v1.b[0]
+; CHECK-NEXT:    mov v7.b[0], v1.b[2]
+; CHECK-NEXT:    mov v16.b[0], v1.b[4]
+; CHECK-NEXT:    mov v17.b[0], v1.b[6]
+; CHECK-NEXT:    mov v2.b[4], v0.b[1]
+; CHECK-NEXT:    mov v3.b[4], v0.b[3]
+; CHECK-NEXT:    mov v4.b[4], v0.b[5]
+; CHECK-NEXT:    mov v5.b[4], v0.b[7]
+; CHECK-NEXT:    mov v6.b[4], v1.b[1]
+; CHECK-NEXT:    mov v7.b[4], v1.b[3]
+; CHECK-NEXT:    mov v16.b[4], v1.b[5]
+; CHECK-NEXT:    mov v17.b[4], v1.b[7]
+; CHECK-NEXT:    shl v0.2s, v2.2s, #24
+; CHECK-NEXT:    shl v1.2s, v3.2s, #24
+; CHECK-NEXT:    shl v2.2s, v4.2s, #24
+; CHECK-NEXT:    shl v3.2s, v5.2s, #24
+; CHECK-NEXT:    shl v4.2s, v6.2s, #24
+; CHECK-NEXT:    shl v5.2s, v7.2s, #24
+; CHECK-NEXT:    shl v6.2s, v16.2s, #24
+; CHECK-NEXT:    shl v7.2s, v17.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
+; CHECK-NEXT:    sshr v2.2s, v2.2s, #24
 ; CHECK-NEXT:    sshr v3.2s, v3.2s, #24
-; CHECK-NEXT:    shl v7.2s, v7.2s, #24
 ; CHECK-NEXT:    sshr v4.2s, v4.2s, #24
-; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-NEXT:    sshr v5.2s, v5.2s, #24
 ; CHECK-NEXT:    sshr v6.2s, v6.2s, #24
+; CHECK-NEXT:    sshr v7.2s, v7.2s, #24
+; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-NEXT:    sshll v16.2d, v0.2s, #0
 ; CHECK-NEXT:    sshll v3.2d, v3.2s, #0
-; CHECK-NEXT:    sshr v7.2s, v7.2s, #24
 ; CHECK-NEXT:    sshll v4.2d, v4.2s, #0
-; CHECK-NEXT:    sshll v17.2d, v1.2s, #0
 ; CHECK-NEXT:    sshll v5.2d, v5.2s, #0
 ; CHECK-NEXT:    sshll v6.2d, v6.2s, #0
-; CHECK-NEXT:    scvtf v0.2d, v2.2d
-; CHECK-NEXT:    scvtf v1.2d, v16.2d
-; CHECK-NEXT:    scvtf v2.2d, v3.2d
 ; CHECK-NEXT:    sshll v7.2d, v7.2s, #0
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-NEXT:    scvtf v3.2d, v3.2d
 ; CHECK-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-NEXT:    scvtf v3.2d, v17.2d
 ; CHECK-NEXT:    scvtf v5.2d, v5.2d
 ; CHECK-NEXT:    scvtf v6.2d, v6.2d
 ; CHECK-NEXT:    scvtf v7.2d, v7.2d
@@ -420,27 +396,19 @@ define <8 x double> @uitofp_v8i8_double(<8 x i8> %a) {
 ; CHECK-LABEL: uitofp_v8i8_double:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    umov w9, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[4]
-; CHECK-NEXT:    umov w12, v0.b[6]
-; CHECK-NEXT:    umov w10, v0.b[1]
-; CHECK-NEXT:    umov w13, v0.b[3]
-; CHECK-NEXT:    umov w14, v0.b[5]
-; CHECK-NEXT:    umov w15, v0.b[7]
+; CHECK-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-NEXT:    mov v3.b[0], v0.b[2]
+; CHECK-NEXT:    mov v4.b[0], v0.b[4]
+; CHECK-NEXT:    mov v5.b[0], v0.b[6]
 ; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    fmov s3, w11
-; CHECK-NEXT:    fmov s4, w12
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v2.s[1], w13
-; CHECK-NEXT:    mov v3.s[1], w14
-; CHECK-NEXT:    mov v4.s[1], w15
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
-; CHECK-NEXT:    and v3.8b, v3.8b, v1.8b
-; CHECK-NEXT:    and v1.8b, v4.8b, v1.8b
+; CHECK-NEXT:    mov v2.b[4], v0.b[1]
+; CHECK-NEXT:    mov v3.b[4], v0.b[3]
+; CHECK-NEXT:    mov v4.b[4], v0.b[5]
+; CHECK-NEXT:    mov v5.b[4], v0.b[7]
+; CHECK-NEXT:    and v0.8b, v2.8b, v1.8b
+; CHECK-NEXT:    and v2.8b, v3.8b, v1.8b
+; CHECK-NEXT:    and v3.8b, v4.8b, v1.8b
+; CHECK-NEXT:    and v1.8b, v5.8b, v1.8b
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-NEXT:    ushll v3.2d, v3.2s, #0
@@ -458,63 +426,47 @@ define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) {
 ; CHECK-LABEL: uitofp_v16i8_double:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    umov w10, v0.b[2]
-; CHECK-NEXT:    umov w9, v0.b[1]
-; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    umov w11, v0.b[3]
-; CHECK-NEXT:    umov w13, v0.b[5]
-; CHECK-NEXT:    umov w18, v0.b[6]
+; CHECK-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-NEXT:    mov v4.b[0], v0.b[2]
+; CHECK-NEXT:    mov v5.b[0], v0.b[4]
+; CHECK-NEXT:    mov v6.b[0], v0.b[6]
 ; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    umov w14, v2.b[0]
-; CHECK-NEXT:    umov w16, v2.b[2]
-; CHECK-NEXT:    umov w0, v2.b[4]
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    umov w8, v0.b[7]
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    umov w10, v2.b[6]
-; CHECK-NEXT:    umov w15, v2.b[1]
-; CHECK-NEXT:    umov w17, v2.b[3]
-; CHECK-NEXT:    fmov s4, w12
-; CHECK-NEXT:    umov w12, v2.b[5]
-; CHECK-NEXT:    fmov s7, w18
-; CHECK-NEXT:    mov v3.s[1], w9
-; CHECK-NEXT:    umov w9, v2.b[7]
-; CHECK-NEXT:    fmov s2, w14
-; CHECK-NEXT:    fmov s5, w16
-; CHECK-NEXT:    fmov s6, w0
-; CHECK-NEXT:    mov v0.s[1], w11
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    mov v4.s[1], w13
-; CHECK-NEXT:    mov v7.s[1], w8
-; CHECK-NEXT:    mov v2.s[1], w15
-; CHECK-NEXT:    mov v5.s[1], w17
-; CHECK-NEXT:    mov v6.s[1], w12
-; CHECK-NEXT:    and v3.8b, v3.8b, v1.8b
-; CHECK-NEXT:    mov v16.s[1], w9
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    and v4.8b, v4.8b, v1.8b
-; CHECK-NEXT:    and v7.8b, v7.8b, v1.8b
-; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
-; CHECK-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-NEXT:    and v5.8b, v5.8b, v1.8b
-; CHECK-NEXT:    and v6.8b, v6.8b, v1.8b
-; CHECK-NEXT:    and v1.8b, v16.8b, v1.8b
-; CHECK-NEXT:    ushll v16.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v17.2d, v4.2s, #0
+; CHECK-NEXT:    mov v7.b[0], v2.b[0]
+; CHECK-NEXT:    mov v16.b[0], v2.b[2]
+; CHECK-NEXT:    mov v17.b[0], v2.b[4]
+; CHECK-NEXT:    mov v18.b[0], v2.b[6]
+; CHECK-NEXT:    mov v3.b[4], v0.b[1]
+; CHECK-NEXT:    mov v4.b[4], v0.b[3]
+; CHECK-NEXT:    mov v5.b[4], v0.b[5]
+; CHECK-NEXT:    mov v6.b[4], v0.b[7]
+; CHECK-NEXT:    mov v7.b[4], v2.b[1]
+; CHECK-NEXT:    mov v16.b[4], v2.b[3]
+; CHECK-NEXT:    mov v17.b[4], v2.b[5]
+; CHECK-NEXT:    mov v18.b[4], v2.b[7]
+; CHECK-NEXT:    and v0.8b, v3.8b, v1.8b
+; CHECK-NEXT:    and v2.8b, v4.8b, v1.8b
+; CHECK-NEXT:    and v3.8b, v5.8b, v1.8b
+; CHECK-NEXT:    and v4.8b, v6.8b, v1.8b
+; CHECK-NEXT:    and v5.8b, v7.8b, v1.8b
+; CHECK-NEXT:    and v6.8b, v16.8b, v1.8b
+; CHECK-NEXT:    and v7.8b, v17.8b, v1.8b
+; CHECK-NEXT:    and v1.8b, v18.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-NEXT:    ushll v7.2d, v7.2s, #0
-; CHECK-NEXT:    ucvtf v0.2d, v3.2d
+; CHECK-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-NEXT:    ushll v4.2d, v4.2s, #0
 ; CHECK-NEXT:    ushll v5.2d, v5.2s, #0
 ; CHECK-NEXT:    ushll v6.2d, v6.2s, #0
-; CHECK-NEXT:    ushll v18.2d, v1.2s, #0
-; CHECK-NEXT:    ucvtf v1.2d, v16.2d
-; CHECK-NEXT:    ucvtf v4.2d, v2.2d
-; CHECK-NEXT:    ucvtf v2.2d, v17.2d
-; CHECK-NEXT:    ucvtf v3.2d, v7.2d
-; CHECK-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-NEXT:    ucvtf v6.2d, v6.2d
-; CHECK-NEXT:    ucvtf v7.2d, v18.2d
+; CHECK-NEXT:    ushll v7.2d, v7.2s, #0
+; CHECK-NEXT:    ushll v16.2d, v1.2s, #0
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v2.2d
+; CHECK-NEXT:    ucvtf v2.2d, v3.2d
+; CHECK-NEXT:    ucvtf v3.2d, v4.2d
+; CHECK-NEXT:    ucvtf v4.2d, v5.2d
+; CHECK-NEXT:    ucvtf v5.2d, v6.2d
+; CHECK-NEXT:    ucvtf v6.2d, v7.2d
+; CHECK-NEXT:    ucvtf v7.2d, v16.2d
 ; CHECK-NEXT:    ret
   %1 = uitofp <16 x i8> %a to <16 x double>
   ret <16 x double> %1


        


More information about the llvm-commits mailing list