[llvm] 3a68c6d - [ARM] Extend MVE lane interleaving to handle other non-instruction leaves

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 29 01:06:01 PDT 2021


Author: David Green
Date: 2021-03-29T09:05:45+01:00
New Revision: 3a68c6d26c9438eff1dc4483082076d3b1e117b5

URL: https://github.com/llvm/llvm-project/commit/3a68c6d26c9438eff1dc4483082076d3b1e117b5
DIFF: https://github.com/llvm/llvm-project/commit/3a68c6d26c9438eff1dc4483082076d3b1e117b5.diff

LOG: [ARM] Extend MVE lane interleaving to handle other non-instruction leaves

This extends the recent MVE lane interleaving pass to handle other
non-instruction leaves, for which a new shuffle is added. This helps
especially for constants and potentially for arguments.

Differential Revision: https://reviews.llvm.org/D97289

Added: 
    

Modified: 
    llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
    llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vabdus.ll
    llvm/test/CodeGen/Thumb2/mve-vmulh.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
index 1ccd64acb410..ce01245fbd14 100644
--- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
+++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -162,6 +162,7 @@ static bool tryInterleave(Instruction *Start,
 
   SmallSetVector<Instruction *, 4> Truncs;
   SmallSetVector<Instruction *, 4> Exts;
+  SmallSetVector<Use *, 4> OtherLeafs;
   SmallSetVector<Instruction *, 4> Ops;
 
   while (!Worklist.empty()) {
@@ -204,7 +205,7 @@ static bool tryInterleave(Instruction *Start,
         if (isa<Instruction>(Op))
           Worklist.push_back(cast<Instruction>(&Op));
         else
-          return false;
+          OtherLeafs.insert(&Op);
       }
 
       for (auto *Use : I->users())
@@ -217,6 +218,9 @@ static bool tryInterleave(Instruction *Start,
     }
   }
 
+  if (Exts.empty() && OtherLeafs.empty())
+    return false;
+
   LLVM_DEBUG({
     dbgs() << "Found group:\n  Exts:";
     for (auto *I : Exts)
@@ -224,13 +228,15 @@ static bool tryInterleave(Instruction *Start,
     dbgs() << "  Ops:";
     for (auto *I : Ops)
       dbgs() << "  " << *I << "\n";
+    dbgs() << "  OtherLeafs:";
+    for (auto *I : OtherLeafs)
+      dbgs() << "  " << *I << "\n";
     dbgs() << "Truncs:";
     for (auto *I : Truncs)
       dbgs() << "  " << *I << "\n";
   });
 
   assert(!Truncs.empty() && "Expected some truncs");
-  assert(!Exts.empty() && "Expected some leaves");
 
   // Check types
   unsigned NumElts = VT->getNumElements();
@@ -292,6 +298,14 @@ static bool tryInterleave(Instruction *Start,
     LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
   }
 
+  for (Use *I : OtherLeafs) {
+    LLVM_DEBUG(dbgs() << "Replacing leaf " << *I << "\n");
+    Builder.SetInsertPoint(cast<Instruction>(I->getUser()));
+    Value *Shuffle = Builder.CreateShuffleVector(I->get(), LeafMask);
+    I->getUser()->setOperand(I->getOperandNo(), Shuffle);
+    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
+  }
+
   for (Instruction *I : Truncs) {
     LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");
 

diff  --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index c8f0a7a85873..ed5b86b9ecb5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -238,55 +238,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @ext_add_ashr_trunc_i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u16 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmovlb.s16 q3, q3
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vshr.u32 q3, q2, #1
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q1, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmovlb.s16 q0, q3
+; CHECK-NEXT:    vmovlb.u16 q2, q1
+; CHECK-NEXT:    vmovlb.s16 q3, q0
+; CHECK-NEXT:    vmovlt.u16 q1, q1
+; CHECK-NEXT:    vmovlt.s16 q0, q0
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vshr.u32 q0, q0, #1
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vadd.i32 q2, q3, q2
+; CHECK-NEXT:    vshr.u32 q1, q0, #1
+; CHECK-NEXT:    vshr.u32 q0, q2, #1
+; CHECK-NEXT:    vmovnt.i32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %sa = sext <8 x i16> %a to <8 x i32>
@@ -300,111 +260,15 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmovlb.u8 q2, q2
-; CHECK-NEXT:    vmovlb.s8 q3, q3
-; CHECK-NEXT:    vadd.i16 q2, q3, q2
-; CHECK-NEXT:    vshr.u16 q3, q2, #1
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.8 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.8 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.8 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.8 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.8 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.8 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.8 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.8 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.u8 q1, q3
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmovlb.s8 q0, q3
+; CHECK-NEXT:    vmovlb.u8 q2, q1
+; CHECK-NEXT:    vmovlb.s8 q3, q0
+; CHECK-NEXT:    vmovlt.u8 q1, q1
+; CHECK-NEXT:    vmovlt.s8 q0, q0
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vshr.u16 q0, q0, #1
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q2[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q2[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.8 q2[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q2[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q2[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.8 q2[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q2[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vadd.i16 q2, q3, q2
+; CHECK-NEXT:    vshr.u16 q1, q0, #1
+; CHECK-NEXT:    vshr.u16 q0, q2, #1
+; CHECK-NEXT:    vmovnt.i16 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %sa = sext <16 x i8> %a to <16 x i16>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index f69eeb773a9f..180fc78db665 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -1186,8 +1186,7 @@ define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB6_8
+; CHECK-NEXT:    cbz r3, .LBB6_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    cmp r3, #7
 ; CHECK-NEXT:    bhi .LBB6_3
@@ -1207,16 +1206,13 @@ define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
 ; CHECK-NEXT:  .LBB6_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r1], #16
-; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
-; CHECK-NEXT:    vstrh.32 q0, [r2, #8]
-; CHECK-NEXT:    vldrh.s32 q0, [r0], #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
+; CHECK-NEXT:    vmullt.s16 q2, q1, q0
+; CHECK-NEXT:    vmullb.s16 q0, q1, q0
 ; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
-; CHECK-NEXT:    vstrh.32 q0, [r2], #16
+; CHECK-NEXT:    vqshrnt.s32 q0, q2, #15
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB6_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r5, r3
@@ -1540,14 +1536,12 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
 define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
 ; CHECK-LABEL: ssatmul_8t_q15:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq.w .LBB9_3
+; CHECK-NEXT:    beq .LBB9_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    add.w r12, r3, #7
 ; CHECK-NEXT:    adr r4, .LCPI9_0
@@ -1556,19 +1550,17 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    sub.w r12, r12, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    adr r4, .LCPI9_1
-; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    add.w lr, lr, r12, lsr #3
 ; CHECK-NEXT:    sub.w r12, r3, #1
 ; CHECK-NEXT:    vldrw.u32 q4, [r4]
+; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
-; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vdup.32 q6, r5
-; CHECK-NEXT:    adds r5, #8
+; CHECK-NEXT:    vdup.32 q6, r3
+; CHECK-NEXT:    adds r3, #8
 ; CHECK-NEXT:    vorr q5, q6, q0
 ; CHECK-NEXT:    vorr q6, q6, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q5
@@ -1591,63 +1583,19 @@ define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    vmov.16 q5[6], r4
 ; CHECK-NEXT:    vmov r4, s27
 ; CHECK-NEXT:    vmov.16 q5[7], r4
-; CHECK-NEXT:    vpt.i16 ne, q5, zr
-; CHECK-NEXT:    vldrht.u16 q6, [r0], #16
-; CHECK-NEXT:    vmov.u16 r4, q6[2]
-; CHECK-NEXT:    vmov.u16 r3, q6[0]
-; CHECK-NEXT:    vmov q5[2], q5[0], r3, r4
-; CHECK-NEXT:    vmov.u16 r3, q6[3]
-; CHECK-NEXT:    vmov.u16 r4, q6[1]
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrht.u16 q7, [r1], #16
-; CHECK-NEXT:    vmov q5[3], q5[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[2]
-; CHECK-NEXT:    vmov.u16 r4, q7[0]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[3]
-; CHECK-NEXT:    vmov.u16 r4, q7[1]
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r4, q6[4]
-; CHECK-NEXT:    vmullb.s16 q0, q0, q5
-; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q5[0], r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov.16 q5[1], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.16 q5[2], r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.16 q5[3], r3
-; CHECK-NEXT:    vmov.u16 r3, q6[6]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q6[7]
-; CHECK-NEXT:    vmov.u16 r4, q6[5]
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[6]
-; CHECK-NEXT:    vmov.u16 r4, q7[4]
-; CHECK-NEXT:    vmov q6[2], q6[0], r4, r3
-; CHECK-NEXT:    vmov.u16 r3, q7[7]
-; CHECK-NEXT:    vmov.u16 r4, q7[5]
-; CHECK-NEXT:    vmov q6[3], q6[1], r4, r3
-; CHECK-NEXT:    vmullb.s16 q0, q6, q0
-; CHECK-NEXT:    vqshrnb.s32 q0, q0, #15
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q5[4], r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov.16 q5[5], r3
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov.16 q5[6], r3
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov.16 q5[7], r3
+; CHECK-NEXT:    vptt.i16 ne, q5, zr
+; CHECK-NEXT:    vldrht.u16 q5, [r0], #16
+; CHECK-NEXT:    vldrht.u16 q6, [r1], #16
+; CHECK-NEXT:    vmullt.s16 q7, q6, q5
+; CHECK-NEXT:    vmullb.s16 q5, q6, q5
+; CHECK-NEXT:    vqshrnb.s32 q5, q5, #15
+; CHECK-NEXT:    vqshrnt.s32 q5, q7, #15
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.16 q5, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB9_2
 ; CHECK-NEXT:  .LBB9_3: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:  .LCPI9_0:
@@ -1989,16 +1937,16 @@ define arm_aapcs_vfpcc void @usatmul_8_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
 ; CHECK-NEXT:  .LBB12_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
-; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.u32 q1, [r1], #16
-; CHECK-NEXT:    vqshrnb.u32 q0, q0, #15
-; CHECK-NEXT:    vstrh.32 q0, [r2, #8]
-; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
+; CHECK-NEXT:    vmullt.u16 q2, q1, q0
+; CHECK-NEXT:    vmullb.u16 q0, q1, q0
+; CHECK-NEXT:    vqshrnb.u32 q2, q2, #15
 ; CHECK-NEXT:    vqshrnb.u32 q0, q0, #15
-; CHECK-NEXT:    vstrh.32 q0, [r2], #16
+; CHECK-NEXT:    vmovlb.u16 q2, q2
+; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vmovnt.i32 q0, q2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB12_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r5, r3
@@ -2358,8 +2306,7 @@ define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB15_8
+; CHECK-NEXT:    cbz r3, .LBB15_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    cmp r3, #15
 ; CHECK-NEXT:    bhi .LBB15_3
@@ -2379,16 +2326,13 @@ define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK-NEXT:    adds r6, r1, r5
 ; CHECK-NEXT:  .LBB15_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.s16 q0, [r0, #8]
-; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.s16 q1, [r1], #16
-; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
-; CHECK-NEXT:    vstrb.16 q0, [r2, #8]
-; CHECK-NEXT:    vldrb.s16 q0, [r0], #16
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
+; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
+; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
+; CHECK-NEXT:    vmullt.s8 q2, q1, q0
+; CHECK-NEXT:    vmullb.s8 q0, q1, q0
 ; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
-; CHECK-NEXT:    vstrb.16 q0, [r2], #16
+; CHECK-NEXT:    vqshrnt.s16 q0, q2, #7
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB15_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r5, r3
@@ -2776,181 +2720,75 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB18_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vdup.32 q4, r3
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vdup.32 q0, r3
 ; CHECK-NEXT:    adds r3, #16
-; CHECK-NEXT:    vorr q0, q4, q0
-; CHECK-NEXT:    vcmp.u32 cs, q1, q0
-; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    vmov r4, s21
-; CHECK-NEXT:    vmov.16 q0[1], r4
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov.16 q0[2], r4
-; CHECK-NEXT:    vmov r4, s23
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q0[3], r4
-; CHECK-NEXT:    vorr q5, q4, q5
-; CHECK-NEXT:    vcmp.u32 cs, q1, q5
-; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov.16 q0[4], r4
-; CHECK-NEXT:    vmov r4, s21
-; CHECK-NEXT:    vmov.16 q0[5], r4
-; CHECK-NEXT:    vmov r4, s22
-; CHECK-NEXT:    vmov.16 q0[6], r4
-; CHECK-NEXT:    vmov r4, s23
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vcmp.i16 ne, q0, zr
-; CHECK-NEXT:    vpsel q5, q3, q2
-; CHECK-NEXT:    vmov.u16 r4, q5[0]
-; CHECK-NEXT:    vmov.8 q0[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[1]
-; CHECK-NEXT:    vmov.8 q0[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[2]
-; CHECK-NEXT:    vmov.8 q0[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[3]
-; CHECK-NEXT:    vmov.8 q0[3], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[4]
-; CHECK-NEXT:    vmov.8 q0[4], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[5]
-; CHECK-NEXT:    vmov.8 q0[5], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[6]
-; CHECK-NEXT:    vmov.8 q0[6], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[7]
-; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.8 q0[7], r4
-; CHECK-NEXT:    vorr q5, q4, q5
-; CHECK-NEXT:    vorr q4, q4, q6
-; CHECK-NEXT:    vcmp.u32 cs, q1, q5
-; CHECK-NEXT:    vpsel q7, q3, q2
+; CHECK-NEXT:    vorr q4, q0, q4
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
-; CHECK-NEXT:    vmov r4, s28
 ; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov r4, s29
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s30
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s31
-; CHECK-NEXT:    vmov.16 q5[3], r4
 ; CHECK-NEXT:    vmov r4, s16
-; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov r4, s18
-; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov r4, s19
-; CHECK-NEXT:    vmov.16 q5[7], r4
-; CHECK-NEXT:    vcmp.i16 ne, q5, zr
-; CHECK-NEXT:    vpsel q4, q3, q2
-; CHECK-NEXT:    vmov.u16 r4, q4[0]
-; CHECK-NEXT:    vmov.8 q0[8], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[1]
-; CHECK-NEXT:    vmov.8 q0[9], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[2]
-; CHECK-NEXT:    vmov.8 q0[10], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[3]
-; CHECK-NEXT:    vmov.8 q0[11], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[4]
-; CHECK-NEXT:    vmov.8 q0[12], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[5]
-; CHECK-NEXT:    vmov.8 q0[13], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[6]
-; CHECK-NEXT:    vmov.8 q0[14], r4
-; CHECK-NEXT:    vmov.u16 r4, q4[7]
-; CHECK-NEXT:    vmov.8 q0[15], r4
-; CHECK-NEXT:    vpt.i8 ne, q0, zr
-; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
-; CHECK-NEXT:    vmov.u8 r4, q0[0]
 ; CHECK-NEXT:    vmov.16 q7[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[1]
+; CHECK-NEXT:    vmov r4, s17
 ; CHECK-NEXT:    vmov.16 q7[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[2]
+; CHECK-NEXT:    vmov r4, s18
 ; CHECK-NEXT:    vmov.16 q7[2], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[3]
+; CHECK-NEXT:    vmov r4, s19
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.16 q7[3], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[4]
+; CHECK-NEXT:    vorr q4, q0, q4
+; CHECK-NEXT:    vcmp.u32 cs, q1, q4
+; CHECK-NEXT:    vpsel q4, q3, q2
+; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    vmov.16 q7[4], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[5]
+; CHECK-NEXT:    vmov r4, s17
 ; CHECK-NEXT:    vmov.16 q7[5], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[6]
+; CHECK-NEXT:    vmov r4, s18
 ; CHECK-NEXT:    vmov.16 q7[6], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[7]
+; CHECK-NEXT:    vmov r4, s19
 ; CHECK-NEXT:    vmov.16 q7[7], r4
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrbt.u8 q4, [r1], #16
-; CHECK-NEXT:    vmov.u8 r4, q4[0]
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[1]
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[2]
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[3]
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[4]
-; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[5]
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[6]
-; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[7]
-; CHECK-NEXT:    vmov.16 q5[7], r4
-; CHECK-NEXT:    vmullb.s8 q5, q5, q7
-; CHECK-NEXT:    vqshrnb.s16 q5, q5, #7
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmov.u16 r4, q5[0]
+; CHECK-NEXT:    vcmp.i16 ne, q7, zr
+; CHECK-NEXT:    vpsel q4, q3, q2
+; CHECK-NEXT:    vmov.u16 r4, q4[0]
 ; CHECK-NEXT:    vmov.8 q7[0], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[1]
+; CHECK-NEXT:    vmov.u16 r4, q4[1]
 ; CHECK-NEXT:    vmov.8 q7[1], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[2]
+; CHECK-NEXT:    vmov.u16 r4, q4[2]
 ; CHECK-NEXT:    vmov.8 q7[2], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[3]
+; CHECK-NEXT:    vmov.u16 r4, q4[3]
 ; CHECK-NEXT:    vmov.8 q7[3], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[4]
+; CHECK-NEXT:    vmov.u16 r4, q4[4]
 ; CHECK-NEXT:    vmov.8 q7[4], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[5]
+; CHECK-NEXT:    vmov.u16 r4, q4[5]
 ; CHECK-NEXT:    vmov.8 q7[5], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[6]
+; CHECK-NEXT:    vmov.u16 r4, q4[6]
 ; CHECK-NEXT:    vmov.8 q7[6], r4
-; CHECK-NEXT:    vmov.u16 r4, q5[7]
+; CHECK-NEXT:    vmov.u16 r4, q4[7]
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.8 q7[7], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[8]
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[9]
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[10]
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[11]
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[12]
-; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[13]
-; CHECK-NEXT:    vmov.16 q5[5], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[14]
-; CHECK-NEXT:    vmov.16 q5[6], r4
-; CHECK-NEXT:    vmov.u8 r4, q0[15]
-; CHECK-NEXT:    vmov.16 q5[7], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[8]
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[9]
-; CHECK-NEXT:    vmov.16 q0[1], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[10]
-; CHECK-NEXT:    vmov.16 q0[2], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[11]
-; CHECK-NEXT:    vmov.16 q0[3], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[12]
-; CHECK-NEXT:    vmov.16 q0[4], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[13]
-; CHECK-NEXT:    vmov.16 q0[5], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[14]
-; CHECK-NEXT:    vmov.16 q0[6], r4
-; CHECK-NEXT:    vmov.u8 r4, q4[15]
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vmullb.s8 q0, q0, q5
-; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
-; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vorr q4, q0, q4
+; CHECK-NEXT:    vorr q0, q0, q6
+; CHECK-NEXT:    vcmp.u32 cs, q1, q4
+; CHECK-NEXT:    vpsel q5, q3, q2
+; CHECK-NEXT:    vcmp.u32 cs, q1, q0
+; CHECK-NEXT:    vmov r4, s20
+; CHECK-NEXT:    vpsel q0, q3, q2
+; CHECK-NEXT:    vmov.16 q4[0], r4
+; CHECK-NEXT:    vmov r4, s21
+; CHECK-NEXT:    vmov.16 q4[1], r4
+; CHECK-NEXT:    vmov r4, s22
+; CHECK-NEXT:    vmov.16 q4[2], r4
+; CHECK-NEXT:    vmov r4, s23
+; CHECK-NEXT:    vmov.16 q4[3], r4
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    vmov.16 q4[4], r4
+; CHECK-NEXT:    vmov r4, s1
+; CHECK-NEXT:    vmov.16 q4[5], r4
+; CHECK-NEXT:    vmov r4, s2
+; CHECK-NEXT:    vmov.16 q4[6], r4
+; CHECK-NEXT:    vmov r4, s3
+; CHECK-NEXT:    vmov.16 q4[7], r4
+; CHECK-NEXT:    vcmp.i16 ne, q4, zr
+; CHECK-NEXT:    vpsel q0, q3, q2
 ; CHECK-NEXT:    vmov.u16 r4, q0[0]
 ; CHECK-NEXT:    vmov.8 q7[8], r4
 ; CHECK-NEXT:    vmov.u16 r4, q0[1]
@@ -2967,8 +2805,15 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n
 ; CHECK-NEXT:    vmov.8 q7[14], r4
 ; CHECK-NEXT:    vmov.u16 r4, q0[7]
 ; CHECK-NEXT:    vmov.8 q7[15], r4
+; CHECK-NEXT:    vptt.i8 ne, q7, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #16
+; CHECK-NEXT:    vldrbt.u8 q4, [r1], #16
+; CHECK-NEXT:    vmullt.s8 q5, q4, q0
+; CHECK-NEXT:    vmullb.s8 q0, q4, q0
+; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
+; CHECK-NEXT:    vqshrnt.s16 q0, q5, #7
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vstrbt.8 q7, [r2], #16
+; CHECK-NEXT:    vstrbt.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB18_2
 ; CHECK-NEXT:  .LBB18_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #56
@@ -3404,18 +3249,16 @@ define arm_aapcs_vfpcc void @usatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no
 ; CHECK-NEXT:    adds r6, r1, r5
 ; CHECK-NEXT:  .LBB21_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
-; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.u16 q1, [r1], #16
-; CHECK-NEXT:    vqshrnb.u16 q0, q0, #7
-; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vstrb.16 q0, [r2, #8]
-; CHECK-NEXT:    vldrb.u16 q0, [r0], #16
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
+; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
+; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
+; CHECK-NEXT:    vmullt.u8 q2, q1, q0
+; CHECK-NEXT:    vmullb.u8 q0, q1, q0
+; CHECK-NEXT:    vqshrnb.u16 q2, q2, #7
 ; CHECK-NEXT:    vqshrnb.u16 q0, q0, #7
+; CHECK-NEXT:    vmovlb.u8 q2, q2
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vstrb.16 q0, [r2], #16
+; CHECK-NEXT:    vmovnt.i16 q0, q2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB21_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r5, r3

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index cb5093787788..55ec20bb6097 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -4,111 +4,15 @@
 define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: vabd_s8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s8 q3, q3
+; CHECK-NEXT:    vmovlt.s8 q2, q1
+; CHECK-NEXT:    vmovlt.s8 q3, q0
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vsub.i16 q2, q3, q2
-; CHECK-NEXT:    vabs.s16 q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.8 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.8 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.8 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.8 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.8 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.8 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.8 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.8 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.s8 q1, q3
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmovlb.s8 q0, q3
 ; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vabs.s16 q2, q2
 ; CHECK-NEXT:    vabs.s16 q0, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q2[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q2[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.8 q2[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q2[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q2[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.8 q2[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q2[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i16 q0, q2
 ; CHECK-NEXT:    bx lr
   %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
   %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
@@ -123,55 +27,15 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
 define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vabd_s16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmovlb.s16 q3, q3
+; CHECK-NEXT:    vmovlt.s16 q2, q1
+; CHECK-NEXT:    vmovlt.s16 q3, q0
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vsub.i32 q2, q3, q2
-; CHECK-NEXT:    vabs.s32 q3, q2
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q1, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmovlb.s16 q0, q3
 ; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vabs.s32 q2, q2
 ; CHECK-NEXT:    vabs.s32 q0, q0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i32 q0, q2
 ; CHECK-NEXT:    bx lr
   %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
@@ -240,111 +104,15 @@ define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
 define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
 ; CHECK-LABEL: vabd_u8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmovlb.u8 q2, q2
-; CHECK-NEXT:    vmovlb.u8 q3, q3
+; CHECK-NEXT:    vmovlt.u8 q2, q1
+; CHECK-NEXT:    vmovlt.u8 q3, q0
+; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vmovlb.u8 q0, q0
 ; CHECK-NEXT:    vsub.i16 q2, q3, q2
-; CHECK-NEXT:    vabs.s16 q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.8 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.8 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.8 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.8 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.8 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.8 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.8 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.8 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.u8 q1, q3
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmovlb.u8 q0, q3
 ; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vabs.s16 q2, q2
 ; CHECK-NEXT:    vabs.s16 q0, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q2[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q2[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.8 q2[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q2[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q2[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.8 q2[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q2[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i16 q0, q2
 ; CHECK-NEXT:    bx lr
   %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
   %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
@@ -359,55 +127,15 @@ define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
 define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vabd_u16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u16 q2, q2
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmovlb.u16 q3, q3
+; CHECK-NEXT:    vmovlt.u16 q2, q1
+; CHECK-NEXT:    vmovlt.u16 q3, q0
+; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    vmovlb.u16 q0, q0
 ; CHECK-NEXT:    vsub.i32 q2, q3, q2
-; CHECK-NEXT:    vabs.s32 q3, q2
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q1, q3
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmovlb.u16 q0, q3
 ; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vabs.s32 q2, q2
 ; CHECK-NEXT:    vabs.s32 q0, q0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i32 q0, q2
 ; CHECK-NEXT:    bx lr
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
index 6f7cef66809b..057f3f24d0ea 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@@ -139,51 +139,11 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vmulhs_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmullb.s16 q2, q3, q2
-; CHECK-NEXT:    vshr.u32 q3, q2, #16
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmullb.s16 q0, q1, q3
+; CHECK-NEXT:    vmullt.s16 q2, q0, q1
+; CHECK-NEXT:    vmullb.s16 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q2, q2, #16
 ; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i32 q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
@@ -197,51 +157,11 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vmulhu_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmullb.u16 q2, q3, q2
-; CHECK-NEXT:    vshr.u32 q3, q2, #16
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s15
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov q1[3], q1[1], r1, r0
-; CHECK-NEXT:    vmullb.u16 q0, q1, q3
+; CHECK-NEXT:    vmullt.u16 q2, q0, q1
+; CHECK-NEXT:    vmullb.u16 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q2, q2, #16
 ; CHECK-NEXT:    vshr.u32 q0, q0, #16
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i32 q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>
@@ -285,107 +205,11 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: vmulhs_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmullb.s8 q2, q3, q2
-; CHECK-NEXT:    vshr.u16 q3, q2, #8
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.8 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.8 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.8 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.8 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.8 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.8 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.8 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.8 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmullb.s8 q0, q1, q3
+; CHECK-NEXT:    vmullt.s8 q2, q0, q1
+; CHECK-NEXT:    vmullb.s8 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q2, q2, #8
 ; CHECK-NEXT:    vshr.u16 q0, q0, #8
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q2[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q2[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.8 q2[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q2[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q2[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.8 q2[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q2[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i16 q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i16>
@@ -399,107 +223,11 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: vmulhu_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmullb.u8 q2, q3, q2
-; CHECK-NEXT:    vshr.u16 q3, q2, #8
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.8 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.8 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.8 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.8 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.8 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.8 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.8 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.8 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmullb.u8 q0, q1, q3
+; CHECK-NEXT:    vmullt.u8 q2, q0, q1
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q2, q2, #8
 ; CHECK-NEXT:    vshr.u16 q0, q0, #8
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.8 q2[8], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.8 q2[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.8 q2[10], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.8 q2[11], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.8 q2[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.8 q2[13], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.8 q2[14], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.8 q2[15], r0
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vmovnt.i16 q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>


        


More information about the llvm-commits mailing list