[llvm] c85766f - [ARM] MVE tests for vmull from a splat. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 6 14:30:15 PDT 2021


Author: David Green
Date: 2021-06-06T22:30:02+01:00
New Revision: c85766f79b2e2ebdb2a33e3456936cec11b10dc5

URL: https://github.com/llvm/llvm-project/commit/c85766f79b2e2ebdb2a33e3456936cec11b10dc5
DIFF: https://github.com/llvm/llvm-project/commit/c85766f79b2e2ebdb2a33e3456936cec11b10dc5.diff

LOG: [ARM] MVE tests for vmull from a splat. NFC

Added: 
    llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
new file mode 100644
index 0000000000000..418c56d7b1c1f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -0,0 +1,1444 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_0246_0ext(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0246_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r0
+; CHECK-NEXT:    vmullb.s32 q1, q0, q2
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry: ; sext(lanes 0,2 of %src1) * sext(i32 splat of %src2); expected output is a single vmullb.s32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = sext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out1, %out2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_0ext_0246(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0ext_0246:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r0
+; CHECK-NEXT:    vmullb.s32 q1, q2, q0
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry: ; same inputs as sext32_0246_0ext but with the mul operands swapped; expected output is still one vmullb.s32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = sext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0246_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    umull r2, r5, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    mla r4, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r2, r3, r2, r5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    mla r1, r1, r0, r4
+; CHECK-NEXT:    mla r0, r3, r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry: ; sext(lanes 0,2 of %src1) * splat(sext %src2); expected output is scalar umull/mla code with no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ext = sext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %out1, %shuf2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_ext0_0246:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    asrs r4, r0, #31
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull lr, r12, r0, r1
+; CHECK-NEXT:    umull r2, r5, r0, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r0, r0, r2, r5
+; CHECK-NEXT:    mla r0, r4, r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry: ; splat(sext %src2) * sext(lanes 0,2 of %src1); expected output is scalar umull/mla code with no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ext = sext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %shuf2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_1357_0ext(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_1357_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r0
+; CHECK-NEXT:    vrev64.32 q2, q0
+; CHECK-NEXT:    vmullb.s32 q0, q2, q1
+; CHECK-NEXT:    bx lr
+entry: ; sext(lanes 1,3 of %src1) * sext(i32 splat of %src2); expected output is vrev64.32 followed by vmullb.s32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = sext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out1, %out2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_0ext_1357(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0ext_1357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r0
+; CHECK-NEXT:    vmullb.s32 q0, q2, q1
+; CHECK-NEXT:    bx lr
+entry: ; sext(i32 splat of %src2) * sext(lanes 1,3 of %src1); expected output is vrev64.32 followed by vmullb.s32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = sext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_1357_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    umull r2, r5, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    mla r4, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r2, r3, r2, r5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    mla r1, r1, r0, r4
+; CHECK-NEXT:    mla r0, r3, r0, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry: ; sext(lanes 1,3 of %src1) * splat(sext %src2); expected output is vrev64.32 plus scalar umull/mla code, no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ext = sext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %out1, %shuf2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_ext0_1357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    asrs r4, r0, #31
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull lr, r12, r0, r1
+; CHECK-NEXT:    umull r2, r5, r0, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r0, r0, r2, r5
+; CHECK-NEXT:    mla r0, r4, r3, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry: ; splat(sext %src2) * sext(lanes 1,3 of %src1); expected output is vrev64.32 plus scalar umull/mla code, no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = sext <2 x i32> %shuf1 to <2 x i64>
+  %ext = sext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %shuf2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @sext32_0213_0ext(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0213_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmullb.s32 q0, q2, q3
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmullb.s32 q1, q2, q3
+; CHECK-NEXT:    bx lr
+entry: ; four-lane result: sext(lanes 0,2,1,3 of %src1) * sext(i32 splat of %src2); expected output uses two vmullb.s32 after vmov.f32 lane moves
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = sext <4 x i32> %shuf1 to <4 x i64>
+  %ins = insertelement <8 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = sext <4 x i32> %shuf2 to <4 x i64> ; the extension is applied after the splat
+  %out = mul <4 x i64> %out1, %out2 ; splat is the second operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @sext32_0ext_0213(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0ext_0213:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmullb.s32 q0, q3, q2
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmullb.s32 q1, q3, q2
+; CHECK-NEXT:    bx lr
+entry: ; four-lane result: sext(i32 splat of %src2) * sext(lanes 0,2,1,3 of %src1); expected output uses two vmullb.s32 after vmov.f32 lane moves
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = sext <4 x i32> %shuf1 to <4 x i64>
+  %ins = insertelement <8 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = sext <4 x i32> %shuf2 to <4 x i64> ; the extension is applied after the splat
+  %out = mul <4 x i64> %out2, %out1 ; splat is the first operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_0213_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    umull r2, r5, r3, r0
+; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    mla r4, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r5, r3, r2, r5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    mla r1, r1, r0, r4
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    mla r3, r3, r0, r5
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    umull r5, lr, r4, r0
+; CHECK-NEXT:    umull r3, r12, r1, r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r3
+; CHECK-NEXT:    mla r3, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r2, r4, r2, lr
+; CHECK-NEXT:    mla r1, r1, r0, r3
+; CHECK-NEXT:    asrs r3, r4, #31
+; CHECK-NEXT:    mla r0, r3, r0, r2
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry: ; four-lane result: sext(lanes 0,2,1,3 of %src1) * splat(sext %src2); expected output is scalar umull/mla code with no vmull
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = sext <4 x i32> %shuf1 to <4 x i64>
+  %ext = sext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <4 x i64> %out1, %shuf2 ; splat is the second operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: sext32_ext0_0213:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    asrs r4, r0, #31
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    umull r2, r5, r0, r3
+; CHECK-NEXT:    umull lr, r12, r0, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r2, r0, r2, r5
+; CHECK-NEXT:    vmov r5, s8
+; CHECK-NEXT:    mla r2, r4, r3, r2
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    umull r3, lr, r0, r5
+; CHECK-NEXT:    umull r2, r12, r0, r1
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r5, #31
+; CHECK-NEXT:    mla r0, r0, r2, lr
+; CHECK-NEXT:    mla r0, r4, r5, r0
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry: ; four-lane result: splat(sext %src2) * sext(lanes 0,2,1,3 of %src1); expected output is scalar umull/mla code with no vmull
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = sext <4 x i32> %shuf1 to <4 x i64>
+  %ext = sext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <4 x i64> %shuf2, %out1 ; splat is the first operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_0246_0ext(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0246_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r0
+; CHECK-NEXT:    vmullb.u32 q1, q0, q2
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry: ; zext(lanes 0,2 of %src1) * zext(i32 splat of %src2); expected output is a single vmullb.u32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = zext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out1, %out2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_0ext_0246(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0ext_0246:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r0
+; CHECK-NEXT:    vmullb.u32 q1, q2, q0
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry: ; zext(i32 splat of %src2) * zext(lanes 0,2 of %src1); expected output is a single vmullb.u32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = zext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_0246_ext0(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0246_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r1, r2, r1, r0
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    bx lr
+entry: ; zext(lanes 0,2 of %src1) * splat(zext %src2); expected output is two scalar umull, no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ext = zext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %out1, %shuf2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_ext0_0246(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_ext0_0246:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r1, r2, r0, r1
+; CHECK-NEXT:    umull r0, r3, r0, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    bx lr
+entry: ; splat(zext %src2) * zext(lanes 0,2 of %src1); expected output is two scalar umull, no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; pick lanes 0 and 2
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ext = zext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %shuf2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_1357_0ext(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_1357_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r0
+; CHECK-NEXT:    vrev64.32 q2, q0
+; CHECK-NEXT:    vmullb.u32 q0, q2, q1
+; CHECK-NEXT:    bx lr
+entry: ; zext(lanes 1,3 of %src1) * zext(i32 splat of %src2); expected output is vrev64.32 followed by vmullb.u32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = zext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out1, %out2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_0ext_1357(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0ext_1357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov q2[2], q2[0], r0, r0
+; CHECK-NEXT:    vmullb.u32 q0, q2, q1
+; CHECK-NEXT:    bx lr
+entry: ; zext(i32 splat of %src2) * zext(lanes 1,3 of %src1); expected output is vrev64.32 followed by vmullb.u32
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ins = insertelement <4 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = zext <2 x i32> %shuf2 to <2 x i64> ; the extension is applied after the splat
+  %out = mul <2 x i64> %out2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_1357_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull r1, r2, r1, r0
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    bx lr
+entry: ; zext(lanes 1,3 of %src1) * splat(zext %src2); expected output is vrev64.32 plus two scalar umull, no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ext = zext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %out1, %shuf2 ; splat is the second operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <2 x i64> @zext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_ext0_1357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    umull r1, r2, r0, r1
+; CHECK-NEXT:    umull r0, r3, r0, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
+; CHECK-NEXT:    bx lr
+entry: ; splat(zext %src2) * zext(lanes 1,3 of %src1); expected output is vrev64.32 plus two scalar umull, no vmull
+  %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; pick lanes 1 and 3
+  %out1 = zext <2 x i32> %shuf1 to <2 x i64>
+  %ext = zext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <2 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <2 x i64> %ins, <2 x i64> undef, <2 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <2 x i64> %shuf2, %out1 ; splat is the first operand
+  ret <2 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @zext32_0213_0ext(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0213_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmullb.u32 q0, q2, q3
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmullb.u32 q1, q2, q3
+; CHECK-NEXT:    bx lr
+entry: ; four-lane result: zext(lanes 0,2,1,3 of %src1) * zext(i32 splat of %src2); expected output uses two vmullb.u32 after vmov.f32 lane moves
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = zext <4 x i32> %shuf1 to <4 x i64>
+  %ins = insertelement <8 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = zext <4 x i32> %shuf2 to <4 x i64> ; the extension is applied after the splat
+  %out = mul <4 x i64> %out1, %out2 ; splat is the second operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @zext32_0ext_0213(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0ext_0213:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov q3[2], q3[0], r0, r0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmullb.u32 q0, q3, q2
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmullb.u32 q1, q3, q2
+; CHECK-NEXT:    bx lr
+entry: ; four-lane result: zext(i32 splat of %src2) * zext(lanes 0,2,1,3 of %src1); expected output uses two vmullb.u32 after vmov.f32 lane moves
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = zext <4 x i32> %shuf1 to <4 x i64>
+  %ins = insertelement <8 x i32> poison, i32 %src2, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i32
+  %out2 = zext <4 x i32> %shuf2 to <4 x i64> ; the extension is applied after the splat
+  %out = mul <4 x i64> %out2, %out1 ; splat is the first operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @zext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_0213_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    umull r3, r2, r3, r0
+; CHECK-NEXT:    umull r1, r12, r1, r0
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    umull r1, r2, r1, r0
+; CHECK-NEXT:    umull r0, r3, r3, r0
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    bx lr
+entry: ; four-lane result: zext(lanes 0,2,1,3 of %src1) * splat(zext %src2); expected output is four scalar umull, no vmull
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = zext <4 x i32> %shuf1 to <4 x i64>
+  %ext = zext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <4 x i64> %out1, %shuf2 ; splat is the second operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i64> @zext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
+; CHECK-LABEL: zext32_ext0_0213:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s4, s0
+; CHECK-NEXT:    vmov.f32 s5, s2
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s3
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    umull r3, r2, r0, r3
+; CHECK-NEXT:    umull r1, r12, r0, r1
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT:    umull r1, r2, r0, r1
+; CHECK-NEXT:    umull r0, r3, r0, r3
+; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
+; CHECK-NEXT:    bx lr
+entry: ; four-lane result: splat(zext %src2) * zext(lanes 0,2,1,3 of %src1); expected output is four scalar umull, no vmull
+  %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> ; lanes 0,2 then 1,3
+  %out1 = zext <4 x i32> %shuf1 to <4 x i64>
+  %ext = zext i32 %src2 to i64 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i64> poison, i64 %ext, i32 0
+  %shuf2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer ; splat of the already-extended i64
+  %out = mul <4 x i64> %shuf2, %out1 ; splat is the first operand
+  ret <4 x i64> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_02468101214_0ext(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_02468101214_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmullb.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry: ; sext(lanes 0,2,4,6 of %src1) * sext(i16 splat of %src2); expected output is vdup.32 followed by vmullb.s16
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; pick the even lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i16
+  %out2 = sext <4 x i16> %shuf2 to <4 x i32> ; the extension is applied after the splat
+  %out = mul <4 x i32> %out1, %out2 ; splat is the second operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_0ext_02468101214(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_0ext_02468101214:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmullb.s16 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry: ; sext(i16 splat of %src2) * sext(lanes 0,2,4,6 of %src1); expected output is vdup.32 followed by vmullb.s16
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; pick the even lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i16
+  %out2 = sext <4 x i16> %shuf2 to <4 x i32> ; the extension is applied after the splat
+  %out = mul <4 x i32> %out2, %out1 ; splat is the first operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_02468101214_ext0(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_02468101214_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry: ; sext(lanes 0,2,4,6 of %src1) * splat(sext %src2); expected output is vmovlb.s16 + sxth + vmul.i32, not vmullb.s16
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; pick the even lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ext = sext i16 %src2 to i32 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the already-extended i32
+  %out = mul <4 x i32> %out1, %shuf2 ; splat is the second operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_ext0_02468101214(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_ext0_02468101214:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry: ; splat(sext %src2) * sext(lanes 0,2,4,6 of %src1); expected output is vmovlb.s16 + sxth + vmul.i32, not vmullb.s16
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; pick the even lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ext = sext i16 %src2 to i32 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the already-extended i32
+  %out = mul <4 x i32> %shuf2, %out1 ; splat is the first operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_13579111315_0ext(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_13579111315_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    vmullb.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry: ; sext(lanes 1,3,5,7 of %src1) * sext(i16 splat of %src2); expected output is vdup.32, vrev32.16, then vmullb.s16
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; pick the odd lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i16
+  %out2 = sext <4 x i16> %shuf2 to <4 x i32> ; the extension is applied after the splat
+  %out = mul <4 x i32> %out1, %out2 ; splat is the second operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_0ext_13579111315(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_0ext_13579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmullb.s16 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry: ; sext(i16 splat of %src2) * sext(lanes 1,3,5,7 of %src1); expected output is vrev32.16, vdup.32, then vmullb.s16
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; pick the odd lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i16
+  %out2 = sext <4 x i16> %shuf2 to <4 x i32> ; the extension is applied after the splat
+  %out = mul <4 x i32> %out2, %out1 ; splat is the first operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_13579111315_ext0(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_13579111315_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.s16 q0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry: ; sext(lanes 1,3,5,7 of %src1) * splat(sext %src2); expected output is vmovlt.s16 + sxth + vmul.i32, no vmull
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; pick the odd lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ext = sext i16 %src2 to i32 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the already-extended i32
+  %out = mul <4 x i32> %out1, %shuf2 ; splat is the second operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sext16_ext0_13579111315(<8 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_ext0_13579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.s16 q0, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry: ; splat(sext %src2) * sext(lanes 1,3,5,7 of %src1); expected output is vmovlt.s16 + sxth + vmul.i32, no vmull
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; pick the odd lanes
+  %out1 = sext <4 x i16> %shuf1 to <4 x i32>
+  %ext = sext i16 %src2 to i32 ; the scalar is extended before it is splatted
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of the already-extended i32
+  %out = mul <4 x i32> %shuf2, %out1 ; splat is the first operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @sext16_02461357_0ext(<16 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_02461357_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmullb.s16 q1, q1, q2
+; CHECK-NEXT:    vmullb.s16 q0, q0, q2
+; CHECK-NEXT:    bx lr
+entry: ; eight-lane result: sext(even lanes then odd lanes of the low half of %src1) * sext(i16 splat of %src2); expected output uses two vmullb.s16
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> ; lanes 0,2,4,6 then 1,3,5,7
+  %out1 = sext <8 x i16> %shuf1 to <8 x i32>
+  %ins = insertelement <16 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i16
+  %out2 = sext <8 x i16> %shuf2 to <8 x i32> ; the extension is applied after the splat
+  %out = mul <8 x i32> %out1, %out2 ; splat is the second operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @sext16_0ext_02461357(<16 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_0ext_02461357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vmullb.s16 q1, q2, q1
+; CHECK-NEXT:    vmullb.s16 q0, q2, q0
+; CHECK-NEXT:    bx lr
+entry: ; eight-lane result: sext(i16 splat of %src2) * sext(even lanes then odd lanes of the low half of %src1); expected output uses two vmullb.s16
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> ; lanes 0,2,4,6 then 1,3,5,7
+  %out1 = sext <8 x i16> %shuf1 to <8 x i32>
+  %ins = insertelement <16 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask broadcasts lane 0, i.e. splats %src2 as i16
+  %out2 = sext <8 x i16> %shuf2 to <8 x i32> ; the extension is applied after the splat
+  %out = mul <8 x i32> %out2, %out1 ; splat is the first operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @sext16_02461357_ext0(<16 x i16> %src1, i16 %src2) {
+; CHECK-LABEL: sext16_02461357_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s16 q1, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmul.i32 q2, q1, r0
+; CHECK-NEXT:    vmovlt.s16 q0, q0
+; CHECK-NEXT:    vmul.i32 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry: ; eight-lane result: sext(even then odd lanes of the low half of %src1) * splat(sext %src2); expected output is vmovlb/vmovlt + sxth + vmul.i32, no vmull
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7> ; lanes 0,2,4,6 then 1,3,5,7
+  %out1 = sext <8 x i16> %shuf1 to <8 x i32>
+  %ext = sext i16 %src2 to i32 ; the scalar is extended before it is splatted
+  %ins = insertelement <8 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer ; splat of the already-extended i32
+  %out = mul <8 x i32> %out1, %shuf2 ; splat is the second operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @sext16_ext0_02461357(<16 x i16> %src1, i16 %src2) { ; computes splat(sext(%src2)) * sext(%src1 lanes 0,2,4,6,1,3,5,7)
+; CHECK-LABEL: sext16_ext0_02461357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s16 q1, q0
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmul.i32 q2, q1, r0
+; CHECK-NEXT:    vmovlt.s16 q0, q0
+; CHECK-NEXT:    vmul.i32 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  %out1 = sext <8 x i16> %shuf1 to <8 x i32>
+  %ext = sext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i32> %shuf2, %out1 ; splat is the first mul operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_02468101214_0ext(<8 x i16> %src1, i16 %src2) { ; computes zext(%src1 lanes 0,2,4,6) * zext(splat(%src2))
+; CHECK-LABEL: zext16_02468101214_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmullb.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; NOTE(review): the function name lists lanes up to 14, but this mask only selects 0,2,4,6
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <4 x i16> %shuf2 to <4 x i32> ; the splat is widened as a vector, after the broadcast
+  %out = mul <4 x i32> %out1, %out2 ; splat is the second mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_0ext_02468101214(<8 x i16> %src1, i16 %src2) { ; computes zext(splat(%src2)) * zext(%src1 lanes 0,2,4,6)
+; CHECK-LABEL: zext16_0ext_02468101214:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmullb.u16 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; NOTE(review): the function name lists lanes up to 14, but this mask only selects 0,2,4,6
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <4 x i16> %shuf2 to <4 x i32> ; the splat is widened as a vector, after the broadcast
+  %out = mul <4 x i32> %out2, %out1 ; splat is the first mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_02468101214_ext0(<8 x i16> %src1, i16 %src2) { ; computes zext(%src1 lanes 0,2,4,6) * splat(zext(%src2))
+; CHECK-LABEL: zext16_02468101214_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; NOTE(review): the function name lists lanes up to 14, but this mask only selects 0,2,4,6
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ext = zext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <4 x i32> %out1, %shuf2 ; splat is the second mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_ext0_02468101214(<8 x i16> %src1, i16 %src2) { ; computes splat(zext(%src2)) * zext(%src1 lanes 0,2,4,6)
+; CHECK-LABEL: zext16_ext0_02468101214:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; NOTE(review): the function name lists lanes up to 14, but this mask only selects 0,2,4,6
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ext = zext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <4 x i32> %shuf2, %out1 ; splat is the first mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_13579111315_0ext(<8 x i16> %src1, i16 %src2) { ; computes zext(%src1 lanes 1,3,5,7) * zext(splat(%src2))
+; CHECK-LABEL: zext16_13579111315_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    vmullb.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; NOTE(review): the function name lists lanes up to 15, but this mask only selects 1,3,5,7
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <4 x i16> %shuf2 to <4 x i32> ; the splat is widened as a vector, after the broadcast
+  %out = mul <4 x i32> %out1, %out2 ; splat is the second mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_0ext_13579111315(<8 x i16> %src1, i16 %src2) { ; computes zext(splat(%src2)) * zext(%src1 lanes 1,3,5,7)
+; CHECK-LABEL: zext16_0ext_13579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vmullb.u16 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; NOTE(review): the function name lists lanes up to 15, but this mask only selects 1,3,5,7
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ins = insertelement <8 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <4 x i16> %shuf2 to <4 x i32> ; the splat is widened as a vector, after the broadcast
+  %out = mul <4 x i32> %out2, %out1 ; splat is the first mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_13579111315_ext0(<8 x i16> %src1, i16 %src2) { ; computes zext(%src1 lanes 1,3,5,7) * splat(zext(%src2))
+; CHECK-LABEL: zext16_13579111315_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.u16 q0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; NOTE(review): the function name lists lanes up to 15, but this mask only selects 1,3,5,7
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ext = zext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <4 x i32> %out1, %shuf2 ; splat is the second mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @zext16_ext0_13579111315(<8 x i16> %src1, i16 %src2) { ; computes splat(zext(%src2)) * zext(%src1 lanes 1,3,5,7)
+; CHECK-LABEL: zext16_ext0_13579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.u16 q0, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; NOTE(review): the function name lists lanes up to 15, but this mask only selects 1,3,5,7
+  %out1 = zext <4 x i16> %shuf1 to <4 x i32>
+  %ext = zext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <4 x i32> %shuf2, %out1 ; splat is the first mul operand
+  ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @zext16_02461357_0ext(<16 x i16> %src1, i16 %src2) { ; computes zext(%src1 lanes 0,2,4,6,1,3,5,7) * zext(splat(%src2))
+; CHECK-LABEL: zext16_02461357_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmullb.u16 q1, q1, q2
+; CHECK-NEXT:    vmullb.u16 q0, q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  %out1 = zext <8 x i16> %shuf1 to <8 x i32>
+  %ins = insertelement <16 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <8 x i16> %shuf2 to <8 x i32> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i32> %out1, %out2 ; splat is the second mul operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @zext16_0ext_02461357(<16 x i16> %src1, i16 %src2) { ; computes zext(splat(%src2)) * zext(%src1 lanes 0,2,4,6,1,3,5,7)
+; CHECK-LABEL: zext16_0ext_02461357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vmullb.u16 q1, q2, q1
+; CHECK-NEXT:    vmullb.u16 q0, q2, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  %out1 = zext <8 x i16> %shuf1 to <8 x i32>
+  %ins = insertelement <16 x i16> poison, i16 %src2, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <8 x i16> %shuf2 to <8 x i32> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i32> %out2, %out1 ; splat is the first mul operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @zext16_02461357_ext0(<16 x i16> %src1, i16 %src2) { ; computes zext(%src1 lanes 0,2,4,6,1,3,5,7) * splat(zext(%src2))
+; CHECK-LABEL: zext16_02461357_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u16 q1, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    vmul.i32 q2, q1, r0
+; CHECK-NEXT:    vmovlt.u16 q0, q0
+; CHECK-NEXT:    vmul.i32 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  %out1 = zext <8 x i16> %shuf1 to <8 x i32>
+  %ext = zext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i32> %out1, %shuf2 ; splat is the second mul operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i32> @zext16_ext0_02461357(<16 x i16> %src1, i16 %src2) { ; computes splat(zext(%src2)) * zext(%src1 lanes 0,2,4,6,1,3,5,7)
+; CHECK-LABEL: zext16_ext0_02461357:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u16 q1, q0
+; CHECK-NEXT:    uxth r0, r0
+; CHECK-NEXT:    vmul.i32 q2, q1, r0
+; CHECK-NEXT:    vmovlt.u16 q0, q0
+; CHECK-NEXT:    vmul.i32 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i16> %src1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  %out1 = zext <8 x i16> %shuf1 to <8 x i32>
+  %ext = zext i16 %src2 to i32 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i32> poison, i32 %ext, i32 0
+  %shuf2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i32> %shuf2, %out1 ; splat is the first mul operand
+  ret <8 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_024681012141618202224262830_0ext(<16 x i8> %src1, i8 %src2) { ; computes sext(%src1 even lanes 0..14) * sext(splat(%src2))
+; CHECK-LABEL: sext8_024681012141618202224262830_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vmullb.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = sext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out1, %out2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_0ext_024681012141618202224262830(<16 x i8> %src1, i8 %src2) { ; computes sext(splat(%src2)) * sext(%src1 even lanes 0..14)
+; CHECK-LABEL: sext8_0ext_024681012141618202224262830:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vmullb.s8 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = sext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_024681012141618202224262830_ext0(<16 x i8> %src1, i8 %src2) { ; computes sext(%src1 even lanes 0..14) * splat(sext(%src2))
+; CHECK-LABEL: sext8_024681012141618202224262830_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ext = sext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %out1, %shuf2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_ext0_024681012141618202224262830(<16 x i8> %src1, i8 %src2) { ; computes splat(sext(%src2)) * sext(%src1 even lanes 0..14)
+; CHECK-LABEL: sext8_ext0_024681012141618202224262830:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ext = sext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %shuf2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_135791113151719212325272931_0ext(<16 x i8> %src1, i8 %src2) { ; computes sext(%src1 odd lanes 1..15) * sext(splat(%src2))
+; CHECK-LABEL: sext8_135791113151719212325272931_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vrev16.8 q0, q0
+; CHECK-NEXT:    vmullb.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = sext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out1, %out2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_0ext_135791113151719212325272931(<16 x i8> %src1, i8 %src2) { ; computes sext(splat(%src2)) * sext(%src1 odd lanes 1..15)
+; CHECK-LABEL: sext8_0ext_135791113151719212325272931:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev16.8 q0, q0
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vmullb.s8 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = sext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_135791113151719212325272931_ext0(<16 x i8> %src1, i8 %src2) { ; computes sext(%src1 odd lanes 1..15) * splat(sext(%src2))
+; CHECK-LABEL: sext8_135791113151719212325272931_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.s8 q0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ext = sext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %out1, %shuf2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sext8_ext0_135791113151719212325272931(<16 x i8> %src1, i8 %src2) { ; computes splat(sext(%src2)) * sext(%src1 odd lanes 1..15)
+; CHECK-LABEL: sext8_ext0_135791113151719212325272931:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.s8 q0, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = sext <8 x i8> %shuf1 to <8 x i16>
+  %ext = sext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %shuf2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @sext8_0246810121413579111315_0ext(<32 x i8> %src1, i8 %src2) { ; computes sext(%src1 lanes 0,2,..,14 then 1,3,..,15) * sext(splat(%src2))
+; CHECK-LABEL: sext8_0246810121413579111315_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vmullb.s8 q1, q1, q2
+; CHECK-NEXT:    vmullb.s8 q0, q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = sext <16 x i8> %shuf1 to <16 x i16>
+  %ins = insertelement <32 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = sext <16 x i8> %shuf2 to <16 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <16 x i16> %out1, %out2 ; splat is the second mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @sext8_0ext_0246810121413579111315(<32 x i8> %src1, i8 %src2) { ; computes sext(splat(%src2)) * sext(%src1 lanes 0,2,..,14 then 1,3,..,15)
+; CHECK-LABEL: sext8_0ext_0246810121413579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vmullb.s8 q1, q2, q1
+; CHECK-NEXT:    vmullb.s8 q0, q2, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = sext <16 x i8> %shuf1 to <16 x i16>
+  %ins = insertelement <32 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = sext <16 x i8> %shuf2 to <16 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <16 x i16> %out2, %out1 ; splat is the first mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @sext8_0246810121413579111315_ext0(<32 x i8> %src1, i8 %src2) { ; computes sext(%src1 lanes 0,2,..,14 then 1,3,..,15) * splat(sext(%src2))
+; CHECK-LABEL: sext8_0246810121413579111315_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s8 q1, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q2, q1, r0
+; CHECK-NEXT:    vmovlt.s8 q0, q0
+; CHECK-NEXT:    vmul.i16 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = sext <16 x i8> %shuf1 to <16 x i16>
+  %ext = sext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <16 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <16 x i16> %out1, %shuf2 ; splat is the second mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @sext8_ext0_0246810121413579111315(<32 x i8> %src1, i8 %src2) { ; computes splat(sext(%src2)) * sext(%src1 lanes 0,2,..,14 then 1,3,..,15)
+; CHECK-LABEL: sext8_ext0_0246810121413579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s8 q1, q0
+; CHECK-NEXT:    sxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q2, q1, r0
+; CHECK-NEXT:    vmovlt.s8 q0, q0
+; CHECK-NEXT:    vmul.i16 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = sext <16 x i8> %shuf1 to <16 x i16>
+  %ext = sext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <16 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <16 x i16> %shuf2, %out1 ; splat is the first mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_024681012141618202224262830_0ext(<16 x i8> %src1, i8 %src2) { ; computes zext(%src1 even lanes 0..14) * zext(splat(%src2))
+; CHECK-LABEL: zext8_024681012141618202224262830_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out1, %out2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_0ext_024681012141618202224262830(<16 x i8> %src1, i8 %src2) { ; computes zext(splat(%src2)) * zext(%src1 even lanes 0..14)
+; CHECK-LABEL: zext8_0ext_024681012141618202224262830:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vmullb.u8 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_024681012141618202224262830_ext0(<16 x i8> %src1, i8 %src2) { ; computes zext(%src1 even lanes 0..14) * splat(zext(%src2))
+; CHECK-LABEL: zext8_024681012141618202224262830_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ext = zext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %out1, %shuf2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_ext0_024681012141618202224262830(<16 x i8> %src1, i8 %src2) { ; computes splat(zext(%src2)) * zext(%src1 even lanes 0..14)
+; CHECK-LABEL: zext8_ext0_024681012141618202224262830:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; NOTE(review): the function name lists lanes up to 30, but this mask only selects 0,2,..,14
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ext = zext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %shuf2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_135791113151719212325272931_0ext(<16 x i8> %src1, i8 %src2) { ; computes zext(%src1 odd lanes 1..15) * zext(splat(%src2))
+; CHECK-LABEL: zext8_135791113151719212325272931_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vrev16.8 q0, q0
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out1, %out2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_0ext_135791113151719212325272931(<16 x i8> %src1, i8 %src2) { ; computes zext(splat(%src2)) * zext(%src1 odd lanes 1..15)
+; CHECK-LABEL: zext8_0ext_135791113151719212325272931:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev16.8 q0, q0
+; CHECK-NEXT:    vdup.16 q1, r0
+; CHECK-NEXT:    vmullb.u8 q0, q1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ins = insertelement <16 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <16 x i8> %ins, <16 x i8> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <8 x i8> %shuf2 to <8 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <8 x i16> %out2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_135791113151719212325272931_ext0(<16 x i8> %src1, i8 %src2) { ; computes zext(%src1 odd lanes 1..15) * splat(zext(%src2))
+; CHECK-LABEL: zext8_135791113151719212325272931_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.u8 q0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ext = zext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %out1, %shuf2 ; splat is the second mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @zext8_ext0_135791113151719212325272931(<16 x i8> %src1, i8 %src2) { ; computes splat(zext(%src2)) * zext(%src1 odd lanes 1..15)
+; CHECK-LABEL: zext8_ext0_135791113151719212325272931:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlt.u8 q0, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <16 x i8> %src1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; NOTE(review): the function name lists lanes up to 31, but this mask only selects 1,3,..,15
+  %out1 = zext <8 x i8> %shuf1 to <8 x i16>
+  %ext = zext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <8 x i16> %shuf2, %out1 ; splat is the first mul operand
+  ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @zext8_0246810121413579111315_0ext(<32 x i8> %src1, i8 %src2) { ; computes zext(%src1 lanes 0,2,..,14 then 1,3,..,15) * zext(splat(%src2))
+; CHECK-LABEL: zext8_0246810121413579111315_0ext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vmullb.u8 q1, q1, q2
+; CHECK-NEXT:    vmullb.u8 q0, q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = zext <16 x i8> %shuf1 to <16 x i16>
+  %ins = insertelement <32 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <16 x i8> %shuf2 to <16 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <16 x i16> %out1, %out2 ; splat is the second mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @zext8_0ext_0246810121413579111315(<32 x i8> %src1, i8 %src2) { ; computes zext(splat(%src2)) * zext(%src1 lanes 0,2,..,14 then 1,3,..,15)
+; CHECK-LABEL: zext8_0ext_0246810121413579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vmullb.u8 q1, q2, q1
+; CHECK-NEXT:    vmullb.u8 q0, q2, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = zext <16 x i8> %shuf1 to <16 x i16>
+  %ins = insertelement <32 x i8> poison, i8 %src2, i32 0
+  %shuf2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%src2)
+  %out2 = zext <16 x i8> %shuf2 to <16 x i16> ; the splat is widened as a vector, after the broadcast
+  %out = mul <16 x i16> %out2, %out1 ; splat is the first mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @zext8_0246810121413579111315_ext0(<32 x i8> %src1, i8 %src2) { ; computes zext(%src1 lanes 0,2,..,14 then 1,3,..,15) * splat(zext(%src2))
+; CHECK-LABEL: zext8_0246810121413579111315_ext0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u8 q1, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q2, q1, r0
+; CHECK-NEXT:    vmovlt.u8 q0, q0
+; CHECK-NEXT:    vmul.i16 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = zext <16 x i8> %shuf1 to <16 x i16>
+  %ext = zext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <16 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <16 x i16> %out1, %shuf2 ; splat is the second mul operand
+  ret <16 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i16> @zext8_ext0_0246810121413579111315(<32 x i8> %src1, i8 %src2) { ; computes splat(zext(%src2)) * zext(%src1 lanes 0,2,..,14 then 1,3,..,15)
+; CHECK-LABEL: zext8_ext0_0246810121413579111315:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.u8 q1, q0
+; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmul.i16 q2, q1, r0
+; CHECK-NEXT:    vmovlt.u8 q0, q0
+; CHECK-NEXT:    vmul.i16 q1, q0, r0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %shuf1 = shufflevector <32 x i8> %src1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %out1 = zext <16 x i8> %shuf1 to <16 x i16>
+  %ext = zext i8 %src2 to i16 ; the scalar is widened first, then broadcast at the wide type
+  %ins = insertelement <16 x i16> poison, i16 %ext, i32 0
+  %shuf2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer ; all-zero mask: every lane is a copy of lane 0 (%ext)
+  %out = mul <16 x i16> %shuf2, %out1 ; splat is the first mul operand
+  ret <16 x i16> %out
+}


        


More information about the llvm-commits mailing list