[llvm] bc010ce - [AArch64][SME]: Add precursory tests for D138281

Hassnaa Hamdi via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 21 10:13:05 PST 2022


Author: Hassnaa Hamdi
Date: 2022-11-21T18:12:57Z
New Revision: bc010cec3d6435b98952b7780d45899fffd6a52f

URL: https://github.com/llvm/llvm-project/commit/bc010cec3d6435b98952b7780d45899fffd6a52f
DIFF: https://github.com/llvm/llvm-project/commit/bc010cec3d6435b98952b7780d45899fffd6a52f.diff

LOG: [AArch64][SME]: Add precursory tests for D138281

Added: 
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll

Modified: 
    

Removed: 
    


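The added files follow the usual llc + FileCheck pattern shown in their RUN lines. A minimal sketch of how one of them can be exercised and regenerated locally, assuming a built llc/FileCheck on PATH and an in-tree checkout (exact invocations may differ in your environment):

    # Run one of the added tests by hand (mirrors its RUN line):
    llc -force-streaming-compatible-sve \
        < llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll \
      | FileCheck llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll

    # Regenerate the autogenerated CHECK lines after a codegen change
    # (the script named in each test's NOTE header):
    llvm/utils/update_llc_test_checks.py \
        llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
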
################################################################################
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
new file mode 100644
index 000000000000..700b94894a64
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Ensure we don't crash when trying to combine fp<->int conversions
+define void @fp_convert_combine_crash(<8 x float> *%a, <8 x i32> *%b) #0 {
+; CHECK-LABEL: fp_convert_combine_crash:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s, #3
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s, #3
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %f = load <8 x float>, <8 x float>* %a
+  %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,
+                                 float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+  %vcvt.i = fptosi <8 x float> %mul.i to <8 x i32>
+  store <8 x i32> %vcvt.i, <8 x i32>* %b
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
new file mode 100644
index 000000000000..0bea4f11f466
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -0,0 +1,1559 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FCVTZU H -> H
+;
+
+define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v4f16_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %res = fptoui <4 x half> %op1 to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define void @fcvtzu_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v8f16_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    fcvtzu v0.8h, v0.8h
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x half>, <8 x half>* %a
+  %res = fptoui <8 x half> %op1 to <8 x i16>
+  store <8 x i16> %res, <8 x i16>* %b
+  ret void
+}
+
+define void @fcvtzu_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v16f16_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzu v0.8h, v0.8h
+; CHECK-NEXT:    fcvtzu v1.8h, v1.8h
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, <16 x half>* %a
+  %res = fptoui <16 x half> %op1 to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %b
+  ret void
+}
+
+;
+; FCVTZU H -> S
+;
+
+define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f16_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x half> %op1 to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v4f16_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = fptoui <4 x half> %op1 to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @fcvtzu_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v8f16_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x half>, <8 x half>* %a
+  %res = fptoui <8 x half> %op1 to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %b
+  ret void
+}
+
+define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v16f16_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    fcvtzu v3.4s, v3.4s
+; CHECK-NEXT:    fcvtzu v2.4s, v2.4s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, <16 x half>* %a
+  %res = fptoui <16 x half> %op1 to <16 x i32>
+  store <16 x i32> %res, <16 x i32>* %b
+  ret void
+}
+
+;
+; FCVTZU H -> D
+;
+
+define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v1f16_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %res = fptoui <1 x half> %op1 to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f16_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x9, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x half> %op1 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @fcvtzu_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v4f16_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    fcvtzu x11, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <4 x half>, <4 x half>* %a
+  %res = fptoui <4 x half> %op1 to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %b
+  ret void
+}
+
+define void @fcvtzu_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v8f16_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    mov z1.h, z0.h[2]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu x11, h1
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x12, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzu x8, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    fcvtzu x10, h0
+; CHECK-NEXT:    stp x12, x8, [sp]
+; CHECK-NEXT:    ldp q3, q2, [sp, #32]
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    stp q1, q0, [x1, #32]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %op1 = load <8 x half>, <8 x half>* %a
+  %res = fptoui <8 x half> %op1 to <8 x i64>
+  store <8 x i64> %res, <8 x i64>* %b
+  ret void
+}
+
+define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v16f16_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #128
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    fcvtzu x9, h2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    fcvtzu x8, h1
+; CHECK-NEXT:    fcvtzu x10, h3
+; CHECK-NEXT:    fcvtzu x11, h2
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtzu x12, h1
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
+; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzu x9, h3
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    fcvtzu x8, h2
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    stp x12, x8, [sp]
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    mov z1.h, z0.h[2]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu x11, h1
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    stp x8, x9, [sp, #96]
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    stp x11, x10, [sp, #112]
+; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    fcvtzu x11, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #64]
+; CHECK-NEXT:    ldp q0, q1, [sp, #32]
+; CHECK-NEXT:    stp x11, x10, [sp, #80]
+; CHECK-NEXT:    ldp q2, q3, [sp]
+; CHECK-NEXT:    ldp q5, q4, [sp, #64]
+; CHECK-NEXT:    ldp q7, q6, [sp, #96]
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1, #32]
+; CHECK-NEXT:    stp q5, q4, [x1, #96]
+; CHECK-NEXT:    stp q7, q6, [x1, #64]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, <16 x half>* %a
+  %res = fptoui <16 x half> %op1 to <16 x i64>
+  store <16 x i64> %res, <16 x i64>* %b
+  ret void
+}
+
+;
+; FCVTZU S -> H
+;
+
+define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f32_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x float> %op1 to <2 x i16>
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v4f32_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    mov z1.s, z0.s[3]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z2.s, z0.s[2]
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    strh w10, [sp, #12]
+; CHECK-NEXT:    strh w11, [sp, #10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %res = fptoui <4 x float> %op1 to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @fcvtzu_v8f32_v8i16(<8 x float>* %a) #0 {
+; CHECK-LABEL: fcvtzu_v8f32_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z5.s, z1.s[2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z2.s, z0.s[3]
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    strh w9, [sp]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z0.s, z1.s[3]
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    strh w10, [sp, #6]
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, <8 x float>* %a
+  %res = fptoui <8 x float> %op1 to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define void @fcvtzu_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v16f32_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z7.s, z1.s[2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z4.s, z0.s[3]
+; CHECK-NEXT:    mov z5.s, z0.s[2]
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov z6.s, z0.s[1]
+; CHECK-NEXT:    mov z0.s, z1.s[3]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    fcvtzu v3.4s, v3.4s
+; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z0.s, z1.s[1]
+; CHECK-NEXT:    fcvtzu v1.4s, v2.4s
+; CHECK-NEXT:    strh w9, [sp]
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    strh w10, [sp, #6]
+; CHECK-NEXT:    mov z2.s, z3.s[2]
+; CHECK-NEXT:    mov z4.s, z3.s[1]
+; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    mov z0.s, z3.s[3]
+; CHECK-NEXT:    strh w9, [sp, #24]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
+; CHECK-NEXT:    mov z6.s, z1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w9, [sp, #28]
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    strh w10, [sp, #26]
+; CHECK-NEXT:    fmov w10, s6
+; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    strh w9, [sp, #20]
+; CHECK-NEXT:    strh w10, [sp, #18]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <16 x float>, <16 x float>* %a
+  %res = fptoui <16 x float> %op1 to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %b
+  ret void
+}
+
+;
+; FCVTZU S -> S
+;
+
+define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x float> %op1 to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v4f32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = fptoui <4 x float> %op1 to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @fcvtzu_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v8f32_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
+; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, <8 x float>* %a
+  %res = fptoui <8 x float> %op1 to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %b
+  ret void
+}
+
+;
+; FCVTZU S -> D
+;
+
+define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v1f32_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %res = fptoui <1 x float> %op1 to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f32_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x float> %op1 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @fcvtzu_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v4f32_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x float>, <4 x float>* %a
+  %res = fptoui <4 x float> %op1 to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %b
+  ret void
+}
+
+define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v8f32_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    fcvtl v3.2d, v3.2s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
+; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzu v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzu v2.2d, v2.2d
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, <8 x float>* %a
+  %res = fptoui <8 x float> %op1 to <8 x i64>
+  store <8 x i64> %res, <8 x i64>* %b
+  ret void
+}
+
+;
+; FCVTZU D -> H
+;
+
+define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v1f64_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %res = fptoui <1 x double> %op1 to <1 x i16>
+  ret <1 x i16> %res
+}
+
+define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f64_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x double> %op1 to <2 x i16>
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @fcvtzu_v4f64_v4i16(<4 x double>* %a) #0 {
+; CHECK-LABEL: fcvtzu_v4f64_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, <4 x double>* %a
+  %res = fptoui <4 x double> %op1 to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @fcvtzu_v8f64_v8i16(<8 x double>* %a) #0 {
+; CHECK-LABEL: fcvtzu_v8f64_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %op1 = load <8 x double>, <8 x double>* %a
+  %res = fptoui <8 x double> %op1 to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define void @fcvtzu_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v16f64_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    mov z16.s, z1.s[1]
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    ldp q4, q5, [x0, #64]
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fcvtzs v2.2d, v7.2d
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s16
+; CHECK-NEXT:    mov z3.s, z3.s[1]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fcvtzs v1.2d, v6.2d
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    xtn v0.2s, v2.2d
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fcvtzs v2.2d, v5.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v4.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    xtn v1.2s, v3.2d
+; CHECK-NEXT:    strh w9, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #28]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    strh w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z3.s, z2.s[1]
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    strh w9, [sp, #20]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    strh w9, [sp, #26]
+; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    strh w8, [sp, #18]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <16 x double>, <16 x double>* %a
+  %res = fptoui <16 x double> %op1 to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %b
+  ret void
+}
+
+;
+; FCVTZU D -> S
+;
+
+define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v1f64_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptoui <1 x double> %op1 to <1 x i32>
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f64_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x double> %op1 to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @fcvtzu_v4f64_v4i32(<4 x double>* %a) #0 {
+; CHECK-LABEL: fcvtzu_v4f64_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, <4 x double>* %a
+  %res = fptoui <4 x double> %op1 to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @fcvtzu_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v8f64_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzu v2.2d, v2.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtzu v3.2d, v3.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x double>, <8 x double>* %a
+  %res = fptoui <8 x double> %op1 to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %b
+  ret void
+}
+
+;
+; FCVTZU D -> D
+;
+
+define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v1f64_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzu x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %res = fptoui <1 x double> %op1 to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzu_v2f64_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptoui <2 x double> %op1 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @fcvtzu_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzu_v4f64_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzu v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzu v1.2d, v1.2d
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, <4 x double>* %a
+  %res = fptoui <4 x double> %op1 to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %b
+  ret void
+}
+
+;
+; FCVTZS H -> H
+;
+
+define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v4f16_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %res = fptosi <4 x half> %op1 to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define void @fcvtzs_v8f16_v8i16(<8 x half>* %a, <8 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v8f16_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    fcvtzs v0.8h, v0.8h
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x half>, <8 x half>* %a
+  %res = fptosi <8 x half> %op1 to <8 x i16>
+  store <8 x i16> %res, <8 x i16>* %b
+  ret void
+}
+
+define void @fcvtzs_v16f16_v16i16(<16 x half>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v16f16_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzs v0.8h, v0.8h
+; CHECK-NEXT:    fcvtzs v1.8h, v1.8h
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, <16 x half>* %a
+  %res = fptosi <16 x half> %op1 to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %b
+  ret void
+}
+
+;
+; FCVTZS H -> S
+;
+
+define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f16_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x half> %op1 to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v4f16_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = fptosi <4 x half> %op1 to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @fcvtzs_v8f16_v8i32(<8 x half>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v8f16_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x half>, <8 x half>* %a
+  %res = fptosi <8 x half> %op1 to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %b
+  ret void
+}
+
+define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v16f16_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, <16 x half>* %a
+  %res = fptosi <16 x half> %op1 to <16 x i32>
+  store <16 x i32> %res, <16 x i32>* %b
+  ret void
+}
+
+;
+; FCVTZS H -> D
+;
+
+define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v1f16_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %res = fptosi <1 x half> %op1 to <1 x i64>
+  ret <1 x i64> %res
+}
+
+; v2f16 is not legal for NEON, so use SVE
+define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f16_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x9, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x half> %op1 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @fcvtzs_v4f16_v4i64(<4 x half>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v4f16_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    fcvtzs x11, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <4 x half>, <4 x half>* %a
+  %res = fptosi <4 x half> %op1 to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %b
+  ret void
+}
+
+define void @fcvtzs_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v8f16_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    mov z1.h, z0.h[2]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x10, h0
+; CHECK-NEXT:    stp x12, x8, [sp]
+; CHECK-NEXT:    ldp q3, q2, [sp, #32]
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    stp q1, q0, [x1, #32]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  %op1 = load <8 x half>, <8 x half>* %a
+  %res = fptosi <8 x half> %op1 to <8 x i64>
+  store <8 x i64> %res, <8 x i64>* %b
+  ret void
+}
+
+define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v16f16_v16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #128
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    fcvtzs x9, h2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    fcvtzs x10, h3
+; CHECK-NEXT:    fcvtzs x11, h2
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtzs x12, h1
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
+; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    fcvtzs x8, h2
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    stp x10, x9, [sp, #16]
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    stp x12, x8, [sp]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    mov z1.h, z0.h[2]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    stp x8, x9, [sp, #96]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    stp x11, x10, [sp, #112]
+; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    fcvtzs x11, h0
+; CHECK-NEXT:    stp x8, x9, [sp, #64]
+; CHECK-NEXT:    ldp q0, q1, [sp, #32]
+; CHECK-NEXT:    stp x11, x10, [sp, #80]
+; CHECK-NEXT:    ldp q2, q3, [sp]
+; CHECK-NEXT:    ldp q5, q4, [sp, #64]
+; CHECK-NEXT:    ldp q7, q6, [sp, #96]
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1, #32]
+; CHECK-NEXT:    stp q5, q4, [x1, #96]
+; CHECK-NEXT:    stp q7, q6, [x1, #64]
+; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    ret
+  %op1 = load <16 x half>, <16 x half>* %a
+  %res = fptosi <16 x half> %op1 to <16 x i64>
+  store <16 x i64> %res, <16 x i64>* %b
+  ret void
+}
+
+;
+; FCVTZS S -> H
+;
+
+define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f32_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x float> %op1 to <2 x i16>
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v4f32_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    mov z1.s, z0.s[3]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z2.s, z0.s[2]
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    strh w10, [sp, #12]
+; CHECK-NEXT:    strh w11, [sp, #10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %res = fptosi <4 x float> %op1 to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @fcvtzs_v8f32_v8i16(<8 x float>* %a) #0 {
+; CHECK-LABEL: fcvtzs_v8f32_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z5.s, z1.s[2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z2.s, z0.s[3]
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    strh w9, [sp]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z0.s, z1.s[3]
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    strh w10, [sp, #6]
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, <8 x float>* %a
+  %res = fptosi <8 x float> %op1 to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define void @fcvtzs_v16f32_v16i16(<16 x float>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v16f32_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z7.s, z1.s[2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z4.s, z0.s[3]
+; CHECK-NEXT:    mov z5.s, z0.s[2]
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov z6.s, z0.s[1]
+; CHECK-NEXT:    mov z0.s, z1.s[3]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z0.s, z1.s[1]
+; CHECK-NEXT:    fcvtzs v1.4s, v2.4s
+; CHECK-NEXT:    strh w9, [sp]
+; CHECK-NEXT:    fmov w9, s6
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    strh w10, [sp, #6]
+; CHECK-NEXT:    mov z2.s, z3.s[2]
+; CHECK-NEXT:    mov z4.s, z3.s[1]
+; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    mov z0.s, z3.s[3]
+; CHECK-NEXT:    strh w9, [sp, #24]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
+; CHECK-NEXT:    mov z6.s, z1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w9, [sp, #28]
+; CHECK-NEXT:    fmov w9, s5
+; CHECK-NEXT:    strh w10, [sp, #26]
+; CHECK-NEXT:    fmov w10, s6
+; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    strh w9, [sp, #20]
+; CHECK-NEXT:    strh w10, [sp, #18]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <16 x float>, <16 x float>* %a
+  %res = fptosi <16 x float> %op1 to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %b
+  ret void
+}
+
+;
+; FCVTZS S -> S
+;
+
+define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x float> %op1 to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v4f32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = fptosi <4 x float> %op1 to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @fcvtzs_v8f32_v8i32(<8 x float>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v8f32_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, <8 x float>* %a
+  %res = fptosi <8 x float> %op1 to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %b
+  ret void
+}
+
+;
+; FCVTZS S -> D
+;
+
+define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v1f32_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %res = fptosi <1 x float> %op1 to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f32_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x float> %op1 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @fcvtzs_v4f32_v4i64(<4 x float>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v4f32_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x float>, <4 x float>* %a
+  %res = fptosi <4 x float> %op1 to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %b
+  ret void
+}
+
+define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v8f32_v8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    fcvtl v3.2d, v3.2s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x float>, <8 x float>* %a
+  %res = fptosi <8 x float> %op1 to <8 x i64>
+  store <8 x i64> %res, <8 x i64>* %b
+  ret void
+}
+
+
+;
+; FCVTZS D -> H
+;
+
+; v1f64 is preferred to be widened to v4f64, so use SVE
+define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v1f64_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ret
+  %res = fptosi <1 x double> %op1 to <1 x i16>
+  ret <1 x i16> %res
+}
+
+define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f64_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x double> %op1 to <2 x i16>
+  ret <2 x i16> %res
+}
+
+define <4 x i16> @fcvtzs_v4f64_v4i16(<4 x double>* %a) #0 {
+; CHECK-LABEL: fcvtzs_v4f64_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, <4 x double>* %a
+  %res = fptosi <4 x double> %op1 to <4 x i16>
+  ret <4 x i16> %res
+}
+
+define <8 x i16> @fcvtzs_v8f64_v8i16(<8 x double>* %a) #0 {
+; CHECK-LABEL: fcvtzs_v8f64_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %op1 = load <8 x double>, <8 x double>* %a
+  %res = fptosi <8 x double> %op1 to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define void @fcvtzs_v16f64_v16i16(<16 x double>* %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v16f64_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    mov z16.s, z1.s[1]
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    ldp q4, q5, [x0, #64]
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fcvtzs v2.2d, v7.2d
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s16
+; CHECK-NEXT:    mov z3.s, z3.s[1]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fcvtzs v1.2d, v6.2d
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    xtn v0.2s, v2.2d
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fcvtzs v2.2d, v5.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v4.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    xtn v1.2s, v3.2d
+; CHECK-NEXT:    strh w9, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #28]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    strh w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z3.s, z2.s[1]
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    strh w9, [sp, #20]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    strh w9, [sp, #26]
+; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    strh w8, [sp, #18]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %op1 = load <16 x double>, <16 x double>* %a
+  %res = fptosi <16 x double> %op1 to <16 x i16>
+  store <16 x i16> %res, <16 x i16>* %b
+  ret void
+}
+
+;
+; FCVTZS D -> S
+;
+
+define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v1f64_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptosi <1 x double> %op1 to <1 x i32>
+  ret <1 x i32> %res
+}
+
+define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f64_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x double> %op1 to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define <4 x i32> @fcvtzs_v4f64_v4i32(<4 x double>* %a) #0 {
+; CHECK-LABEL: fcvtzs_v4f64_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, <4 x double>* %a
+  %res = fptosi <4 x double> %op1 to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define void @fcvtzs_v8f64_v8i32(<8 x double>* %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v8f64_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x double>, <8 x double>* %a
+  %res = fptosi <8 x double> %op1 to <8 x i32>
+  store <8 x i32> %res, <8 x i32>* %b
+  ret void
+}
+
+;
+; FCVTZS D -> D
+;
+
+define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v1f64_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fcvtzs x8, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ret
+  %res = fptosi <1 x double> %op1 to <1 x i64>
+  ret <1 x i64> %res
+}
+
+define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) #0 {
+; CHECK-LABEL: fcvtzs_v2f64_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = fptosi <2 x double> %op1 to <2 x i64>
+  ret <2 x i64> %res
+}
+
+define void @fcvtzs_v4f64_v4i64(<4 x double>* %a, <4 x i64>* %b) #0 {
+; CHECK-LABEL: fcvtzs_v4f64_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x double>, <4 x double>* %a
+  %res = fptosi <4 x double> %op1 to <4 x i64>
+  store <4 x i64> %res, <4 x i64>* %b
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
new file mode 100644
index 000000000000..3ddf794a74a9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -0,0 +1,1018 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; UCVTF H -> H
+;
+
+define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
+; CHECK-LABEL: ucvtf_v4i16_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %res = uitofp <4 x i16> %op1 to <4 x half>
+  ret <4 x half> %res
+}
+
+define void @ucvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
+; CHECK-LABEL: ucvtf_v8i16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ucvtf v0.8h, v0.8h
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %res = uitofp <8 x i16> %op1 to <8 x half>
+  store <8 x half> %res, <8 x half>* %b
+  ret void
+}
+
+define void @ucvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 {
+; CHECK-LABEL: ucvtf_v16i16_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ucvtf v0.8h, v0.8h
+; CHECK-NEXT:    ucvtf v1.8h, v1.8h
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = uitofp <16 x i16> %op1 to <16 x half>
+  store <16 x half> %res, <16 x half>* %b
+  ret void
+}
+
+;
+; UCVTF H -> S
+;
+
+define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i16_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ucvtf v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i16> %op1 to <2 x float>
+  ret <2 x float> %res
+}
+
+define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
+; CHECK-LABEL: ucvtf_v4i16_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = uitofp <4 x i16> %op1 to <4 x float>
+  ret <4 x float> %res
+}
+
+define void @ucvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: ucvtf_v8i16_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %res = uitofp <8 x i16> %op1 to <8 x float>
+  store <8 x float> %res, <8 x float>* %b
+  ret void
+}
+
+define void @ucvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 {
+; CHECK-LABEL: ucvtf_v16i16_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    ucvtf v3.4s, v3.4s
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ucvtf v2.4s, v2.4s
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = uitofp <16 x i16> %op1 to <16 x float>
+  store <16 x float> %res, <16 x float>* %b
+  ret void
+}
+
+;
+; UCVTF H -> D
+;
+
+define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) #0 {
+; CHECK-LABEL: ucvtf_v1i16_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w8, w8, #0xffff
+; CHECK-NEXT:    ucvtf d0, w8
+; CHECK-NEXT:    ret
+  %res = uitofp <1 x i16> %op1 to <1 x double>
+  ret <1 x double> %res
+}
+
+define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i16_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI8_0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i16> %op1 to <2 x double>
+  ret <2 x double> %res
+}
+
+define void @ucvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: ucvtf_v4i16_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i16>, <4 x i16>* %a
+  %res = uitofp <4 x i16> %op1 to <4 x double>
+  store <4 x double> %res, <4 x double>* %b
+  ret void
+}
+
+define void @ucvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 {
+; CHECK-LABEL: ucvtf_v8i16_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z3.d, z1.s
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %res = uitofp <8 x i16> %op1 to <8 x double>
+  store <8 x double> %res, <8 x double>* %b
+  ret void
+}
+
+define void @ucvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 {
+; CHECK-LABEL: ucvtf_v16i16_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z4.d, z2.s
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    uunpklo z6.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf v6.2d, v6.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    uunpklo z7.d, z0.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf v4.2d, v4.2d
+; CHECK-NEXT:    stp q6, q1, [x1, #96]
+; CHECK-NEXT:    uunpklo z1.d, z2.s
+; CHECK-NEXT:    ucvtf v5.2d, v5.2d
+; CHECK-NEXT:    ucvtf v7.2d, v7.2d
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q7, q0, [x1, #32]
+; CHECK-NEXT:    stp q4, q1, [x1]
+; CHECK-NEXT:    stp q5, q3, [x1, #64]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = uitofp <16 x i16> %op1 to <16 x double>
+  store <16 x double> %res, <16 x double>* %b
+  ret void
+}
+
+;
+; UCVTF S -> H
+;
+
+define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i32_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i32> %op1 to <2 x half>
+  ret <2 x half> %res
+}
+
+define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
+; CHECK-LABEL: ucvtf_v4i32_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %res = uitofp <4 x i32> %op1 to <4 x half>
+  ret <4 x half> %res
+}
+
+define <8 x half> @ucvtf_v8i32_v8f16(<8 x i32>* %a) #0 {
+; CHECK-LABEL: ucvtf_v8i32_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = uitofp <8 x i32> %op1 to <8 x half>
+  ret <8 x half> %res
+}
+
+define void @ucvtf_v16i32_v16f16(<16 x i32>* %a, <16 x half>* %b) #0 {
+; CHECK-LABEL: ucvtf_v16i32_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ucvtf v2.4s, v2.4s
+; CHECK-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NEXT:    ucvtf v3.4s, v3.4s
+; CHECK-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %res = uitofp <16 x i32> %op1 to <16 x half>
+  store <16 x half> %res, <16 x half>* %b
+  ret void
+}
+
+;
+; UCVTF S -> S
+;
+
+define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i32_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i32> %op1 to <2 x float>
+  ret <2 x float> %res
+}
+
+define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
+; CHECK-LABEL: ucvtf_v4i32_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = uitofp <4 x i32> %op1 to <4 x float>
+  ret <4 x float> %res
+}
+
+define void @ucvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: ucvtf_v8i32_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ucvtf v1.4s, v1.4s
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = uitofp <8 x i32> %op1 to <8 x float>
+  store <8 x float> %res, <8 x float>* %b
+  ret void
+}
+
+;
+; UCVTF S -> D
+;
+
+define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i32_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i32> %op1 to <2 x double>
+  ret <2 x double> %res
+}
+
+define void @ucvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: ucvtf_v4i32_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i32>, <4 x i32>* %a
+  %res = uitofp <4 x i32> %op1 to <4 x double>
+  store <4 x double> %res, <4 x double>* %b
+  ret void
+}
+
+define void @ucvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 {
+; CHECK-LABEL: ucvtf_v8i32_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z3.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = uitofp <8 x i32> %op1 to <8 x double>
+  store <8 x double> %res, <8 x double>* %b
+  ret void
+}
+
+;
+; UCVTF D -> H
+;
+
+define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i64_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    ucvtf h0, x8
+; CHECK-NEXT:    ucvtf h1, x9
+; CHECK-NEXT:    str h0, [sp, #8]
+; CHECK-NEXT:    str h1, [sp, #10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i64> %op1 to <2 x half>
+  ret <2 x half> %res
+}
+
+define <4 x half> @ucvtf_v4i64_v4f16(<4 x i64>* %a) #0 {
+; CHECK-LABEL: ucvtf_v4i64_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = uitofp <4 x i64> %op1 to <4 x half>
+  ret <4 x half> %res
+}
+
+define <8 x half> @ucvtf_v8i64_v8f16(<8 x i64>* %a) #0 {
+; CHECK-LABEL: ucvtf_v8i64_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcvtn v3.2s, v3.2d
+; CHECK-NEXT:    fcvtn v1.4h, v0.4s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %res = uitofp <8 x i64> %op1 to <8 x half>
+  ret <8 x half> %res
+}
+
+;
+; UCVTF D -> S
+;
+
+define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i64_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i64> %op1 to <2 x float>
+  ret <2 x float> %res
+}
+
+define <4 x float> @ucvtf_v4i64_v4f32(<4 x i64>* %a) #0 {
+; CHECK-LABEL: ucvtf_v4i64_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = uitofp <4 x i64> %op1 to <4 x float>
+  ret <4 x float> %res
+}
+
+define void @ucvtf_v8i64_v8f32(<8 x i64>* %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: ucvtf_v8i64_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcvtn v3.2s, v3.2d
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i64>, <8 x i64>* %a
+  %res = uitofp <8 x i64> %op1 to <8 x float>
+  store <8 x float> %res, <8 x float>* %b
+  ret void
+}
+
+;
+; UCVTF D -> D
+;
+
+define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
+; CHECK-LABEL: ucvtf_v2i64_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = uitofp <2 x i64> %op1 to <2 x double>
+  ret <2 x double> %res
+}
+
+define void @ucvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: ucvtf_v4i64_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = uitofp <4 x i64> %op1 to <4 x double>
+  store <4 x double> %res, <4 x double>* %b
+  ret void
+}
+
+;
+; SCVTF H -> H
+;
+
+define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) #0 {
+; CHECK-LABEL: scvtf_v4i16_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %res = sitofp <4 x i16> %op1 to <4 x half>
+  ret <4 x half> %res
+}
+
+define void @scvtf_v8i16_v8f16(<8 x i16>* %a, <8 x half>* %b) #0 {
+; CHECK-LABEL: scvtf_v8i16_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    scvtf v0.8h, v0.8h
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %res = sitofp <8 x i16> %op1 to <8 x half>
+  store <8 x half> %res, <8 x half>* %b
+  ret void
+}
+
+define void @scvtf_v16i16_v16f16(<16 x i16>* %a, <16 x half>* %b) #0 {
+; CHECK-LABEL: scvtf_v16i16_v16f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    scvtf v0.8h, v0.8h
+; CHECK-NEXT:    scvtf v1.8h, v1.8h
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = sitofp <16 x i16> %op1 to <16 x half>
+  store <16 x half> %res, <16 x half>* %b
+  ret void
+}
+
+;
+; SCVTF H -> S
+;
+
+define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i16_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI33_0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI33_0]
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    scvtf v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i16> %op1 to <2 x float>
+  ret <2 x float> %res
+}
+
+define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) #0 {
+; CHECK-LABEL: scvtf_v4i16_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = sitofp <4 x i16> %op1 to <4 x float>
+  ret <4 x float> %res
+}
+
+define void @scvtf_v8i16_v8f32(<8 x i16>* %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: scvtf_v8i16_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    sunpklo z1.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %res = sitofp <8 x i16> %op1 to <8 x float>
+  store <8 x float> %res, <8 x float>* %b
+  ret void
+}
+
+define void @scvtf_v16i16_v16f32(<16 x i16>* %a, <16 x float>* %b) #0 {
+; CHECK-LABEL: scvtf_v16i16_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    scvtf v3.4s, v3.4s
+; CHECK-NEXT:    sunpklo z2.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    scvtf v2.4s, v2.4s
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = sitofp <16 x i16> %op1 to <16 x float>
+  store <16 x float> %res, <16 x float>* %b
+  ret void
+}
+
+;
+; SCVTF H -> D
+;
+
+define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i16_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI37_0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI37_0]
+; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i16> %op1 to <2 x double>
+  ret <2 x double> %res
+}
+
+define void @scvtf_v4i16_v4f64(<4 x i16>* %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v4i16_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z1.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i16>, <4 x i16>* %a
+  %res = sitofp <4 x i16> %op1 to <4 x double>
+  store <4 x double> %res, <4 x double>* %b
+  ret void
+}
+
+define void @scvtf_v8i16_v8f64(<8 x i16>* %a, <8 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v8i16_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    sunpklo z1.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z3.d, z1.s
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i16>, <8 x i16>* %a
+  %res = sitofp <8 x i16> %op1 to <8 x double>
+  store <8 x double> %res, <8 x double>* %b
+  ret void
+}
+
+define void @scvtf_v16i16_v16f64(<16 x i16>* %a, <16 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v16i16_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    sunpklo z2.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z4.d, z2.s
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    sunpklo z6.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf v6.2d, v6.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    sunpklo z7.d, z0.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-NEXT:    stp q6, q1, [x1, #96]
+; CHECK-NEXT:    sunpklo z1.d, z2.s
+; CHECK-NEXT:    scvtf v5.2d, v5.2d
+; CHECK-NEXT:    scvtf v7.2d, v7.2d
+; CHECK-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q7, q0, [x1, #32]
+; CHECK-NEXT:    stp q4, q1, [x1]
+; CHECK-NEXT:    stp q5, q3, [x1, #64]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i16>, <16 x i16>* %a
+  %res = sitofp <16 x i16> %op1 to <16 x double>
+  store <16 x double> %res, <16 x double>* %b
+  ret void
+}
+
+;
+; SCVTF S -> H
+;
+
+define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i32_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i32> %op1 to <2 x half>
+  ret <2 x half> %res
+}
+
+define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) #0 {
+; CHECK-LABEL: scvtf_v4i32_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %res = sitofp <4 x i32> %op1 to <4 x half>
+  ret <4 x half> %res
+}
+
+define <8 x half> @scvtf_v8i32_v8f16(<8 x i32>* %a) #0 {
+; CHECK-LABEL: scvtf_v8i32_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = sitofp <8 x i32> %op1 to <8 x half>
+  ret <8 x half> %res
+}
+
+;
+; SCVTF S -> S
+;
+
+define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i32_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i32> %op1 to <2 x float>
+  ret <2 x float> %res
+}
+
+define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) #0 {
+; CHECK-LABEL: scvtf_v4i32_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %res = sitofp <4 x i32> %op1 to <4 x float>
+  ret <4 x float> %res
+}
+
+define void @scvtf_v8i32_v8f32(<8 x i32>* %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: scvtf_v8i32_v8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    scvtf v1.4s, v1.4s
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = sitofp <8 x i32> %op1 to <8 x float>
+  store <8 x float> %res, <8 x float>* %b
+  ret void
+}
+
+;
+; SCVTF S -> D
+;
+
+define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i32_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i32> %op1 to <2 x double>
+  ret <2 x double> %res
+}
+
+define void @scvtf_v4i32_v4f64(<4 x i32>* %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v4i32_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    sunpklo z1.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i32>, <4 x i32>* %a
+  %res = sitofp <4 x i32> %op1 to <4 x double>
+  store <4 x double> %res, <4 x double>* %b
+  ret void
+}
+
+define void @scvtf_v8i32_v8f64(<8 x i32>* %a, <8 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v8i32_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z3.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    ret
+  %op1 = load <8 x i32>, <8 x i32>* %a
+  %res = sitofp <8 x i32> %op1 to <8 x double>
+  store <8 x double> %res, <8 x double>* %b
+  ret void
+}
+
+define void @scvtf_v16i32_v16f64(<16 x i32>* %a, <16 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v16i32_v16f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z4.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    sunpklo z5.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf v5.2d, v5.2d
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    sunpklo z7.d, z3.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    scvtf v7.2d, v7.2d
+; CHECK-NEXT:    sunpklo z6.d, z2.s
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    scvtf v6.2d, v6.2d
+; CHECK-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-NEXT:    stp q4, q1, [x1]
+; CHECK-NEXT:    stp q5, q0, [x1, #32]
+; CHECK-NEXT:    stp q7, q3, [x1, #64]
+; CHECK-NEXT:    stp q6, q2, [x1, #96]
+; CHECK-NEXT:    ret
+  %op1 = load <16 x i32>, <16 x i32>* %a
+  %res = sitofp <16 x i32> %op1 to <16 x double>
+  store <16 x double> %res, <16 x double>* %b
+  ret void
+}
+
+;
+; SCVTF D -> H
+;
+
+define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i64_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    scvtf h0, x8
+; CHECK-NEXT:    scvtf h1, x9
+; CHECK-NEXT:    str h0, [sp, #8]
+; CHECK-NEXT:    str h1, [sp, #10]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i64> %op1 to <2 x half>
+  ret <2 x half> %res
+}
+
+define <4 x half> @scvtf_v4i64_v4f16(<4 x i64>* %a) #0 {
+; CHECK-LABEL: scvtf_v4i64_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = sitofp <4 x i64> %op1 to <4 x half>
+  ret <4 x half> %res
+}
+
+;
+; SCVTF D -> S
+;
+
+define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i64_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i64> %op1 to <2 x float>
+  ret <2 x float> %res
+}
+
+define <4 x float> @scvtf_v4i64_v4f32(<4 x i64>* %a) #0 {
+; CHECK-LABEL: scvtf_v4i64_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    fcvtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = sitofp <4 x i64> %op1 to <4 x float>
+  ret <4 x float> %res
+}
+
+;
+; SCVTF D -> D
+;
+
+define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) #0 {
+; CHECK-LABEL: scvtf_v2i64_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    ret
+  %res = sitofp <2 x i64> %op1 to <2 x double>
+  ret <2 x double> %res
+}
+
+define void @scvtf_v4i64_v4f64(<4 x i64>* %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: scvtf_v4i64_v4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    ret
+  %op1 = load <4 x i64>, <4 x i64>* %a
+  %res = sitofp <4 x i64> %op1 to <4 x double>
+  store <4 x double> %res, <4 x double>* %b
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
new file mode 100644
index 000000000000..f54ee92f5854
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; truncate i16 -> i8
+;
+
+define <16 x i8> @trunc_v16i16_v16i8(<16 x i16>* %in) vscale_range(2,0) #0 {
+; CHECK-LABEL: trunc_v16i16_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <16 x i16>, <16 x i16>* %in
+  %b = trunc <16 x i16> %a to <16 x i8>
+  ret <16 x i8> %b
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 {
+; CHECK-LABEL: trunc_v32i16_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z17.h, z1.h[6]
+; CHECK-NEXT:    mov z18.h, z1.h[5]
+; CHECK-NEXT:    mov z19.h, z1.h[4]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z2.h, z0.h[7]
+; CHECK-NEXT:    mov z3.h, z0.h[6]
+; CHECK-NEXT:    mov z4.h, z0.h[5]
+; CHECK-NEXT:    ldp q22, q23, [x0]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    strb w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w9, [sp, #16]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z5.h, z0.h[4]
+; CHECK-NEXT:    mov z6.h, z0.h[3]
+; CHECK-NEXT:    mov z7.h, z0.h[2]
+; CHECK-NEXT:    strb w10, [sp, #31]
+; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    strb w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    strb w9, [sp, #29]
+; CHECK-NEXT:    fmov w9, s7
+; CHECK-NEXT:    mov z16.h, z0.h[1]
+; CHECK-NEXT:    mov z0.h, z1.h[7]
+; CHECK-NEXT:    strb w10, [sp, #28]
+; CHECK-NEXT:    fmov w10, s16
+; CHECK-NEXT:    strb w8, [sp, #27]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w9, [sp, #26]
+; CHECK-NEXT:    fmov w9, s17
+; CHECK-NEXT:    mov z20.h, z1.h[3]
+; CHECK-NEXT:    strb w10, [sp, #25]
+; CHECK-NEXT:    fmov w10, s18
+; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    strb w9, [sp, #22]
+; CHECK-NEXT:    fmov w9, s20
+; CHECK-NEXT:    mov z21.h, z1.h[2]
+; CHECK-NEXT:    mov z0.h, z1.h[1]
+; CHECK-NEXT:    strb w10, [sp, #21]
+; CHECK-NEXT:    fmov w10, s21
+; CHECK-NEXT:    strb w8, [sp, #20]
+; CHECK-NEXT:    strb w9, [sp, #19]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w9, s23
+; CHECK-NEXT:    mov z0.h, z23.h[7]
+; CHECK-NEXT:    mov z1.h, z23.h[6]
+; CHECK-NEXT:    strb w10, [sp, #18]
+; CHECK-NEXT:    fmov w10, s22
+; CHECK-NEXT:    strb w8, [sp, #17]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z2.h, z23.h[5]
+; CHECK-NEXT:    mov z3.h, z23.h[4]
+; CHECK-NEXT:    mov z4.h, z23.h[3]
+; CHECK-NEXT:    strb w10, [sp]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    strb w8, [sp, #15]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w9, [sp, #14]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z5.h, z23.h[2]
+; CHECK-NEXT:    mov z6.h, z23.h[1]
+; CHECK-NEXT:    mov z7.h, z22.h[7]
+; CHECK-NEXT:    strb w10, [sp, #13]
+; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    strb w9, [sp, #11]
+; CHECK-NEXT:    fmov w9, s7
+; CHECK-NEXT:    mov z16.h, z22.h[6]
+; CHECK-NEXT:    mov z17.h, z22.h[5]
+; CHECK-NEXT:    mov z18.h, z22.h[4]
+; CHECK-NEXT:    strb w10, [sp, #10]
+; CHECK-NEXT:    fmov w10, s16
+; CHECK-NEXT:    strb w8, [sp, #9]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    strb w9, [sp, #7]
+; CHECK-NEXT:    fmov w9, s18
+; CHECK-NEXT:    mov z19.h, z22.h[3]
+; CHECK-NEXT:    mov z20.h, z22.h[2]
+; CHECK-NEXT:    mov z21.h, z22.h[1]
+; CHECK-NEXT:    strb w10, [sp, #6]
+; CHECK-NEXT:    fmov w10, s19
+; CHECK-NEXT:    strb w8, [sp, #5]
+; CHECK-NEXT:    fmov w8, s20
+; CHECK-NEXT:    strb w9, [sp, #4]
+; CHECK-NEXT:    fmov w9, s21
+; CHECK-NEXT:    strb w10, [sp, #3]
+; CHECK-NEXT:    strb w8, [sp, #2]
+; CHECK-NEXT:    strb w9, [sp, #1]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    add z1.b, z1.b, z1.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %a = load <32 x i16>, <32 x i16>* %in
+  %b = trunc <32 x i16> %a to <32 x i8>
+  %c = add <32 x i8> %b, %b
+  store <32 x i8> %c, <32 x i8>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: trunc_v64i16_v64i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <64 x i16>, <64 x i16>* %in
+  %b = trunc <64 x i16> %a to <64 x i8>
+  %c = add <64 x i8> %b, %b
+  store <64 x i8> %c, <64 x i8>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: trunc_v128i16_v128i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <128 x i16>, <128 x i16>* %in
+  %b = trunc <128 x i16> %a to <128 x i8>
+  %c = add <128 x i8> %b, %b
+  store <128 x i8> %c, <128 x i8>* %out
+  ret void
+}
+
+;
+; truncate i32 -> i8
+;
+
+define <8 x i8> @trunc_v8i32_v8i8(<8 x i32>* %in) vscale_range(2,0) #0 {
+; CHECK-LABEL: trunc_v8i32_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <8 x i32>, <8 x i32>* %in
+  %b = trunc <8 x i32> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
+define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %in) #0 {
+; CHECK-LABEL: trunc_v16i32_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z7.s, z1.s[2]
+; CHECK-NEXT:    mov z16.s, z1.s[1]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z4.s, z0.s[3]
+; CHECK-NEXT:    mov z5.s, z0.s[2]
+; CHECK-NEXT:    mov z6.s, z0.s[1]
+; CHECK-NEXT:    strb w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z0.s, z1.s[3]
+; CHECK-NEXT:    mov z19.s, z2.s[2]
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    strb w9, [sp, #15]
+; CHECK-NEXT:    strb w8, [sp]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mov z1.s, z3.s[3]
+; CHECK-NEXT:    strb w10, [sp, #4]
+; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z17.s, z3.s[2]
+; CHECK-NEXT:    mov z18.s, z3.s[1]
+; CHECK-NEXT:    strb w10, [sp, #14]
+; CHECK-NEXT:    fmov w10, s7
+; CHECK-NEXT:    strb w9, [sp, #11]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    strb w8, [sp, #9]
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    strb w10, [sp, #10]
+; CHECK-NEXT:    fmov w10, s17
+; CHECK-NEXT:    mov z3.s, z2.s[3]
+; CHECK-NEXT:    mov z20.s, z2.s[1]
+; CHECK-NEXT:    strb w9, [sp, #7]
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    strb w10, [sp, #6]
+; CHECK-NEXT:    fmov w10, s19
+; CHECK-NEXT:    strb w8, [sp, #5]
+; CHECK-NEXT:    fmov w8, s20
+; CHECK-NEXT:    strb w9, [sp, #3]
+; CHECK-NEXT:    strb w10, [sp, #2]
+; CHECK-NEXT:    strb w8, [sp, #1]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %a = load <16 x i32>, <16 x i32>* %in
+  %b = trunc <16 x i32> %a to <16 x i8>
+  ret <16 x i8> %b
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: trunc_v32i32_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <32 x i32>, <32 x i32>* %in
+  %b = trunc <32 x i32> %a to <32 x i8>
+  %c = add <32 x i8> %b, %b
+  store <32 x i8> %c, <32 x i8>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: trunc_v64i32_v64i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <64 x i32>, <64 x i32>* %in
+  %b = trunc <64 x i32> %a to <64 x i8>
+  %c = add <64 x i8> %b, %b
+  store <64 x i8> %c, <64 x i8>* %out
+  ret void
+}
+
+;
+; truncate i32 -> i16
+;
+
+define <8 x i16> @trunc_v8i32_v8i16(<8 x i32>* %in) vscale_range(2,0) #0 {
+; CHECK-LABEL: trunc_v8i32_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <8 x i32>, <8 x i32>* %in
+  %b = trunc <8 x i32> %a to <8 x i16>
+  ret <8 x i16> %b
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 {
+; CHECK-LABEL: trunc_v16i32_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z5.s, z1.s[2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z2.s, z0.s[3]
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    ldp q6, q7, [x0]
+; CHECK-NEXT:    strh w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z4.s, z0.s[1]
+; CHECK-NEXT:    mov z0.s, z1.s[3]
+; CHECK-NEXT:    strh w9, [sp, #16]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    strh w10, [sp, #30]
+; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    strh w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z0.s, z1.s[1]
+; CHECK-NEXT:    strh w9, [sp, #26]
+; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    fmov w9, s7
+; CHECK-NEXT:    strh w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov w10, s6
+; CHECK-NEXT:    mov z0.s, z7.s[3]
+; CHECK-NEXT:    mov z1.s, z7.s[2]
+; CHECK-NEXT:    mov z2.s, z7.s[1]
+; CHECK-NEXT:    strh w8, [sp, #18]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    strh w10, [sp]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z3.s, z6.s[3]
+; CHECK-NEXT:    mov z4.s, z6.s[2]
+; CHECK-NEXT:    mov z5.s, z6.s[1]
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    strh w9, [sp, #4]
+; CHECK-NEXT:    strh w10, [sp, #2]
+; CHECK-NEXT:    ldp q1, q0, [sp]
+; CHECK-NEXT:    add z1.h, z1.h, z1.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+  %a = load <16 x i32>, <16 x i32>* %in
+  %b = trunc <16 x i32> %a to <16 x i16>
+  %c = add <16 x i16> %b, %b
+  store <16 x i16> %c, <16 x i16>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: trunc_v32i32_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <32 x i32>, <32 x i32>* %in
+  %b = trunc <32 x i32> %a to <32 x i16>
+  %c = add <32 x i16> %b, %b
+  store <32 x i16> %c, <32 x i16>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: trunc_v64i32_v64i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <64 x i32>, <64 x i32>* %in
+  %b = trunc <64 x i32> %a to <64 x i16>
+  %c = add <64 x i16> %b, %b
+  store <64 x i16> %c, <64 x i16>* %out
+  ret void
+}
+
+;
+; truncate i64 -> i8
+;
+
+; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
+define <4 x i8> @trunc_v4i64_v4i8(<4 x i64>* %in) vscale_range(2,0) #0 {
+; CHECK-LABEL: trunc_v4i64_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <4 x i64>, <4 x i64>* %in
+  %b = trunc <4 x i64> %a to <4 x i8>
+  ret <4 x i8> %b
+}
+
+define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %in) #0 {
+; CHECK-LABEL: trunc_v8i64_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    strb w9, [sp, #12]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    strb w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    strb w9, [sp, #15]
+; CHECK-NEXT:    strb w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w10, [sp, #10]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    strb w10, [sp, #13]
+; CHECK-NEXT:    strb w9, [sp, #9]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %a = load <8 x i64>, <8 x i64>* %in
+  %b = trunc <8 x i64> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
+define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %in) vscale_range(8,0) #0 {
+; CHECK-LABEL: trunc_v16i64_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <16 x i64>, <16 x i64>* %in
+  %b = trunc <16 x i64> %a to <16 x i8>
+  ret <16 x i8> %b
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: trunc_v32i64_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <32 x i64>, <32 x i64>* %in
+  %b = trunc <32 x i64> %a to <32 x i8>
+  %c = add <32 x i8> %b, %b
+  store <32 x i8> %c, <32 x i8>* %out
+  ret void
+}
+
+;
+; truncate i64 -> i16
+;
+
+define <4 x i16> @trunc_v4i64_v4i16(<4 x i64>* %in) vscale_range(2,0) #0 {
+; CHECK-LABEL: trunc_v4i64_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <4 x i64>, <4 x i64>* %in
+  %b = trunc <4 x i64> %a to <4 x i16>
+  ret <4 x i16> %b
+}
+
+define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %in) #0 {
+; CHECK-LABEL: trunc_v8i64_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    strh w9, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    ldr q0, [sp], #16
+; CHECK-NEXT:    ret
+  %a = load <8 x i64>, <8 x i64>* %in
+  %b = trunc <8 x i64> %a to <8 x i16>
+  ret <8 x i16> %b
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: trunc_v16i64_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <16 x i64>, <16 x i64>* %in
+  %b = trunc <16 x i64> %a to <16 x i16>
+  %c = add <16 x i16> %b, %b
+  store <16 x i16> %c, <16 x i16>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: trunc_v32i64_v32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <32 x i64>, <32 x i64>* %in
+  %b = trunc <32 x i64> %a to <32 x i16>
+  %c = add <32 x i16> %b, %b
+  store <32 x i16> %c, <32 x i16>* %out
+  ret void
+}
+
+;
+; truncate i64 -> i32
+;
+
+define <4 x i32> @trunc_v4i64_v4i32(<4 x i64>* %in) vscale_range(2,0) #0 {
+; CHECK-LABEL: trunc_v4i64_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+  %a = load <4 x i64>, <4 x i64>* %in
+  %b = trunc <4 x i64> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 {
+; CHECK-LABEL: trunc_v8i64_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    add z1.s, z2.s, z2.s
+; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <8 x i64>, <8 x i64>* %in
+  %b = trunc <8 x i64> %a to <8 x i32>
+  %c = add <8 x i32> %b, %b
+  store <8 x i32> %c, <8 x i32>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) vscale_range(8,0) #0 {
+; CHECK-LABEL: trunc_v16i64_v16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <16 x i64>, <16 x i64>* %in
+  %b = trunc <16 x i64> %a to <16 x i32>
+  %c = add <16 x i32> %b, %b
+  store <16 x i32> %c, <16 x i32>* %out
+  ret void
+}
+
+; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
+define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) vscale_range(16,0) #0 {
+; CHECK-LABEL: trunc_v32i64_v32i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
+; CHECK-NEXT:    ret
+  %a = load <32 x i64>, <32 x i64>* %in
+  %b = trunc <32 x i64> %a to <32 x i32>
+  %c = add <32 x i32> %b, %b
+  store <32 x i32> %c, <32 x i32>* %out
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sve" }
