[llvm] [CodeGen] Combine two loops in SlotIndexes.cpp file (PR #127631)

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 25 08:21:19 PST 2025


https://github.com/Rifet-c updated https://github.com/llvm/llvm-project/pull/127631

>From 3a8618b22ce3faab05b1faf956c5d537ac64f208 Mon Sep 17 00:00:00 2001
From: Aleksandr Levin <aleksandr.levin at codasip.com>
Date: Tue, 18 Feb 2025 14:06:48 +0100
Subject: [PATCH 1/6] Merged two loops that were iterating over the same
 machine basic block into one, also did some minor readability improvements
 (variable renaming, commenting and absorbing if condition into a variable)

---
 llvm/lib/CodeGen/SlotIndexes.cpp | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index 1b92a5aa59d18..7810a813a24a0 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -212,40 +212,42 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
   IndexList::iterator ListI = endIdx.listEntry()->getIterator();
   MachineBasicBlock::iterator MBBI = End;
   bool pastStart = false;
+  bool oldIndexesRemoved = false;
   while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) {
     assert(ListI->getIndex() >= startIdx.getIndex() &&
            (includeStart || !pastStart) &&
            "Decremented past the beginning of region to repair.");
 
-    MachineInstr *SlotMI = ListI->getInstr();
+    MachineInstr *indexedMI = ListI->getInstr();
     MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
     bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
+    bool MIIndexNotFound = MI && mi2iMap.find(MI) == mi2iMap.end();
 
-    if (SlotMI == MI && !MBBIAtBegin) {
+    if (indexedMI == MI && !MBBIAtBegin) {
       --ListI;
       if (MBBI != Begin)
         --MBBI;
       else
         pastStart = true;
-    } else if (MI && !mi2iMap.contains(MI)) {
+    } else if (MIIndexNotFound || oldIndexesRemoved) {
       if (MBBI != Begin)
         --MBBI;
       else
         pastStart = true;
     } else {
-      --ListI;
-      if (SlotMI)
-        removeMachineInstrFromMaps(*SlotMI);
+      // We ran through all the indexes on the interval
+      //   -> The only thing left is to go through all the
+      //   remaining MBB instructions and update their indexes
+      if (ListI == ListB)
+        oldIndexesRemoved = true;
+      else
+        --ListI;
+      if (indexedMI)
+        removeMachineInstrFromMaps(*indexedMI);
     }
-  }
-
-  // In theory this could be combined with the previous loop, but it is tricky
-  // to update the IndexList while we are iterating it.
-  for (MachineBasicBlock::iterator I = End; I != Begin;) {
-    --I;
-    MachineInstr &MI = *I;
-    if (!MI.isDebugOrPseudoInstr() && !mi2iMap.contains(&MI))
-      insertMachineInstrInMaps(MI);
+    // Insert isntruction back into the maps after passing it/removing the index
+    if (MIIndexNotFound && !MI->isDebugOrPseudoInstr())
+      insertMachineInstrInMaps(*MI);
   }
 }
 

>From 51cb4ca79ecfbcdde509b9144036b94a7978c25f Mon Sep 17 00:00:00 2001
From: Aleksandr Levin <aleksandr.levin at codasip.com>
Date: Thu, 20 Feb 2025 16:42:45 +0100
Subject: [PATCH 2/6] Fixed case when an instruction is inserted and then
 mistakenly removed afterwards

---
 llvm/lib/CodeGen/SlotIndexes.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index 7810a813a24a0..520bf0d4e137e 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -222,6 +222,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
     MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
     bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
     bool MIIndexNotFound = MI && mi2iMap.find(MI) == mi2iMap.end();
+    bool indexedMIRemoved = false;
 
     if (indexedMI == MI && !MBBIAtBegin) {
       --ListI;
@@ -242,12 +243,17 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
         oldIndexesRemoved = true;
       else
         --ListI;
-      if (indexedMI)
+      if (indexedMI) {
         removeMachineInstrFromMaps(*indexedMI);
+        indexedMIRemoved = true;
+      }
     }
+
+    MachineInstr *instrToInsert = indexedMIRemoved ? indexedMI : MI;
+
     // Insert isntruction back into the maps after passing it/removing the index
-    if (MIIndexNotFound && !MI->isDebugOrPseudoInstr())
-      insertMachineInstrInMaps(*MI);
+    if ((MIIndexNotFound || indexedMIRemoved) && instrToInsert->getParent() != nullptr && !instrToInsert->isDebugOrPseudoInstr())
+      insertMachineInstrInMaps(*instrToInsert);
   }
 }
 

>From 616103e7a64468ef2d672f30911d21a21b7296ff Mon Sep 17 00:00:00 2001
From: Aleksandr Levin <aleksandr.levin at codasip.com>
Date: Fri, 21 Feb 2025 14:54:00 +0100
Subject: [PATCH 3/6] Reverted the variable renaming

---
 llvm/lib/CodeGen/SlotIndexes.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index 520bf0d4e137e..1d6164132f9ab 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -218,13 +218,13 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
            (includeStart || !pastStart) &&
            "Decremented past the beginning of region to repair.");
 
-    MachineInstr *indexedMI = ListI->getInstr();
+    MachineInstr *slotMI = ListI->getInstr();
     MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
     bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
     bool MIIndexNotFound = MI && mi2iMap.find(MI) == mi2iMap.end();
-    bool indexedMIRemoved = false;
+    bool slotMIRemoved = false;
 
-    if (indexedMI == MI && !MBBIAtBegin) {
+    if (slotMI == MI && !MBBIAtBegin) {
       --ListI;
       if (MBBI != Begin)
         --MBBI;
@@ -243,16 +243,16 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
         oldIndexesRemoved = true;
       else
         --ListI;
-      if (indexedMI) {
-        removeMachineInstrFromMaps(*indexedMI);
-        indexedMIRemoved = true;
+      if (slotMI) {
+        removeMachineInstrFromMaps(*slotMI);
+        slotMIRemoved = true;
       }
     }
 
-    MachineInstr *instrToInsert = indexedMIRemoved ? indexedMI : MI;
+    MachineInstr *instrToInsert = slotMIRemoved ? slotMI : MI;
 
     // Insert isntruction back into the maps after passing it/removing the index
-    if ((MIIndexNotFound || indexedMIRemoved) && instrToInsert->getParent() != nullptr && !instrToInsert->isDebugOrPseudoInstr())
+    if ((MIIndexNotFound || slotMIRemoved) && instrToInsert->getParent() != nullptr && !instrToInsert->isDebugOrPseudoInstr())
       insertMachineInstrInMaps(*instrToInsert);
   }
 }

>From dbb5a75ff78b08be1e59ef9db13e0dfafb8ad17f Mon Sep 17 00:00:00 2001
From: Aleksandr Levin <aleksandr.levin at codasip.com>
Date: Tue, 25 Feb 2025 12:25:38 +0100
Subject: [PATCH 4/6] Changed tests to match new behavior (it seems like the
 changes just rearranged some instructions and registers with no difference in
 the result that they provide, but I might've overlooked something, so this
 needs a bit closer inspection by the reviewer).

---
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll |   74 +-
 llvm/test/CodeGen/Thumb2/mve-vld3.ll    | 1262 +++++++++++++++--------
 2 files changed, 885 insertions(+), 451 deletions(-)

diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 33816fec69bac..4f0295d7b2760 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -223,18 +223,31 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
-; CHECK-LABEL: shuffle3_i16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s5, s3
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s1, s0
-; CHECK-NEXT:    vins.f16 s6, s4
-; CHECK-NEXT:    vins.f16 s5, s3
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: shuffle3_i16:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    vmovx.f16 s5, s3
+; CHECK-LV-NEXT:    vmovx.f16 s6, s1
+; CHECK-LV-NEXT:    vmovx.f16 s4, s0
+; CHECK-LV-NEXT:    vins.f16 s1, s0
+; CHECK-LV-NEXT:    vins.f16 s6, s4
+; CHECK-LV-NEXT:    vins.f16 s5, s3
+; CHECK-LV-NEXT:    vmov.f32 s4, s2
+; CHECK-LV-NEXT:    vmov.f32 s7, s1
+; CHECK-LV-NEXT:    vmov q0, q1
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: shuffle3_i16:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    vmov q1, q0
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s5
+; CHECK-LIS-NEXT:    vmovx.f16 s0, s4
+; CHECK-LIS-NEXT:    vins.f16 s5, s4
+; CHECK-LIS-NEXT:    vins.f16 s2, s0
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vmovx.f16 s1, s7
+; CHECK-LIS-NEXT:    vmov.f32 s3, s5
+; CHECK-LIS-NEXT:    vins.f16 s1, s7
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
   ret <8 x i16> %out
@@ -1145,18 +1158,31 @@ entry:
 }
 
 define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
-; CHECK-LABEL: shuffle3_f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s5, s3
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s1, s0
-; CHECK-NEXT:    vins.f16 s6, s4
-; CHECK-NEXT:    vins.f16 s5, s3
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: shuffle3_f16:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    vmovx.f16 s5, s3
+; CHECK-LV-NEXT:    vmovx.f16 s6, s1
+; CHECK-LV-NEXT:    vmovx.f16 s4, s0
+; CHECK-LV-NEXT:    vins.f16 s1, s0
+; CHECK-LV-NEXT:    vins.f16 s6, s4
+; CHECK-LV-NEXT:    vins.f16 s5, s3
+; CHECK-LV-NEXT:    vmov.f32 s4, s2
+; CHECK-LV-NEXT:    vmov.f32 s7, s1
+; CHECK-LV-NEXT:    vmov q0, q1
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: shuffle3_f16:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    vmov q1, q0
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s5
+; CHECK-LIS-NEXT:    vmovx.f16 s0, s4
+; CHECK-LIS-NEXT:    vins.f16 s5, s4
+; CHECK-LIS-NEXT:    vins.f16 s2, s0
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vmovx.f16 s1, s7
+; CHECK-LIS-NEXT:    vmov.f32 s3, s5
+; CHECK-LIS-NEXT:    vins.f16 s1, s7
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
   ret <8 x half> %out
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index b207ce7bdefd1..cdf3f81154d4a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -33,29 +33,53 @@ entry:
 }
 
 define void @vld3_v4i32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v4i32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s15, s18
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vadd.i32 q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vadd.i32 q0, q2, q0
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v4i32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9}
+; CHECK-LV-NEXT:    vpush {d8, d9}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-LV-NEXT:    vmov.f32 s10, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s3
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s9, s7
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vmov.f32 s15, s18
+; CHECK-LV-NEXT:    vmov.f32 s11, s17
+; CHECK-LV-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-LV-NEXT:    vmov.f32 s0, s6
+; CHECK-LV-NEXT:    vmov.f32 s2, s16
+; CHECK-LV-NEXT:    vmov.f32 s3, s19
+; CHECK-LV-NEXT:    vadd.i32 q0, q2, q0
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LV-NEXT:    vpop {d8, d9}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v4i32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9}
+; CHECK-LIS-NEXT:    vpush {d8, d9}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vmov.f32 s10, s2
+; CHECK-LIS-NEXT:    vmov.f32 s17, s0
+; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s9, s7
+; CHECK-LIS-NEXT:    vmov.f32 s16, s5
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vadd.i32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vadd.i32 q0, q2, q0
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LIS-NEXT:    vpop {d8, d9}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <12 x i32>, ptr %src, align 4
   %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -68,46 +92,87 @@ entry:
 }
 
 define void @vld3_v8i32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v8i32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s15, s18
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vadd.i32 q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q0, q2, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s20, s8
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vadd.i32 q4, q5, q4
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s7, s15
-; CHECK-NEXT:    vadd.i32 q1, q4, q1
-; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v8i32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-LV-NEXT:    vmov.f32 s10, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s3
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s9, s7
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vmov.f32 s15, s18
+; CHECK-LV-NEXT:    vmov.f32 s11, s17
+; CHECK-LV-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-LV-NEXT:    vmov.f32 s0, s6
+; CHECK-LV-NEXT:    vmov.f32 s2, s16
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LV-NEXT:    vmov.f32 s3, s19
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LV-NEXT:    vadd.i32 q0, q2, q0
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LV-NEXT:    vmov.f32 s17, s4
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LV-NEXT:    vmov.f32 s18, s7
+; CHECK-LV-NEXT:    vmov.f32 s22, s6
+; CHECK-LV-NEXT:    vmov.f32 s16, s9
+; CHECK-LV-NEXT:    vmov.f32 s19, s14
+; CHECK-LV-NEXT:    vmov.f32 s20, s8
+; CHECK-LV-NEXT:    vmov.f32 s21, s11
+; CHECK-LV-NEXT:    vmov.f32 s23, s13
+; CHECK-LV-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-LV-NEXT:    vmov.f32 s4, s10
+; CHECK-LV-NEXT:    vmov.f32 s6, s12
+; CHECK-LV-NEXT:    vmov.f32 s7, s15
+; CHECK-LV-NEXT:    vadd.i32 q1, q4, q1
+; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v8i32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vmov.f32 s10, s2
+; CHECK-LIS-NEXT:    vmov.f32 s17, s0
+; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s9, s7
+; CHECK-LIS-NEXT:    vmov.f32 s16, s5
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vadd.i32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vadd.i32 q0, q2, q0
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LIS-NEXT:    vmov.f32 s17, s4
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s18, s7
+; CHECK-LIS-NEXT:    vmov.f32 s22, s6
+; CHECK-LIS-NEXT:    vmov.f32 s16, s9
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s20, s8
+; CHECK-LIS-NEXT:    vmov.f32 s21, s11
+; CHECK-LIS-NEXT:    vmov.f32 s23, s13
+; CHECK-LIS-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-LIS-NEXT:    vmov.f32 s4, s10
+; CHECK-LIS-NEXT:    vmov.f32 s6, s12
+; CHECK-LIS-NEXT:    vmov.f32 s7, s15
+; CHECK-LIS-NEXT:    vadd.i32 q1, q4, q1
+; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <24 x i32>, ptr %src, align 4
   %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -120,80 +185,155 @@ entry:
 }
 
 define void @vld3_v16i32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v16i32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s15, s18
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vadd.i32 q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q0, q2, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s20, s8
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vadd.i32 q4, q5, q4
-; CHECK-NEXT:    vmov.f32 s7, s15
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT:    vadd.i32 q1, q4, q1
-; CHECK-NEXT:    vmov.f32 s18, s10
-; CHECK-NEXT:    vmov.f32 s21, s8
-; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vmov.f32 s16, s12
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s20, s13
-; CHECK-NEXT:    vmov.f32 s23, s26
-; CHECK-NEXT:    vmov.f32 s19, s25
-; CHECK-NEXT:    vadd.i32 q4, q4, q5
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vmov.f32 s10, s24
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-NEXT:    vmov.f32 s11, s27
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
-; CHECK-NEXT:    vadd.i32 q2, q4, q2
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
-; CHECK-NEXT:    vmov.f32 s25, s12
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s26, s15
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s30, s14
-; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vmov.f32 s24, s17
-; CHECK-NEXT:    vmov.f32 s27, s22
-; CHECK-NEXT:    vmov.f32 s28, s16
-; CHECK-NEXT:    vmov.f32 s29, s19
-; CHECK-NEXT:    vmov.f32 s31, s21
-; CHECK-NEXT:    vadd.i32 q6, q7, q6
-; CHECK-NEXT:    vmov.f32 s12, s18
-; CHECK-NEXT:    vmov.f32 s14, s20
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vadd.i32 q3, q6, q3
-; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v16i32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-LV-NEXT:    vldrw.u32 q6, [r0, #176]
+; CHECK-LV-NEXT:    vmov.f32 s10, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s3
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s9, s7
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vmov.f32 s15, s18
+; CHECK-LV-NEXT:    vmov.f32 s11, s17
+; CHECK-LV-NEXT:    vadd.i32 q2, q2, q3
+; CHECK-LV-NEXT:    vmov.f32 s0, s6
+; CHECK-LV-NEXT:    vmov.f32 s2, s16
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LV-NEXT:    vmov.f32 s3, s19
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LV-NEXT:    vadd.i32 q0, q2, q0
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LV-NEXT:    vmov.f32 s17, s4
+; CHECK-LV-NEXT:    vmov.f32 s18, s7
+; CHECK-LV-NEXT:    vmov.f32 s22, s6
+; CHECK-LV-NEXT:    vmov.f32 s16, s9
+; CHECK-LV-NEXT:    vmov.f32 s19, s14
+; CHECK-LV-NEXT:    vmov.f32 s20, s8
+; CHECK-LV-NEXT:    vmov.f32 s21, s11
+; CHECK-LV-NEXT:    vmov.f32 s23, s13
+; CHECK-LV-NEXT:    vmov.f32 s4, s10
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-LV-NEXT:    vmov.f32 s6, s12
+; CHECK-LV-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-LV-NEXT:    vmov.f32 s7, s15
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-LV-NEXT:    vadd.i32 q1, q4, q1
+; CHECK-LV-NEXT:    vmov.f32 s18, s10
+; CHECK-LV-NEXT:    vmov.f32 s21, s8
+; CHECK-LV-NEXT:    vmov.f32 s22, s11
+; CHECK-LV-NEXT:    vmov.f32 s16, s12
+; CHECK-LV-NEXT:    vmov.f32 s17, s15
+; CHECK-LV-NEXT:    vmov.f32 s20, s13
+; CHECK-LV-NEXT:    vmov.f32 s23, s26
+; CHECK-LV-NEXT:    vmov.f32 s19, s25
+; CHECK-LV-NEXT:    vadd.i32 q4, q4, q5
+; CHECK-LV-NEXT:    vmov.f32 s8, s14
+; CHECK-LV-NEXT:    vmov.f32 s10, s24
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-LV-NEXT:    vmov.f32 s11, s27
+; CHECK-LV-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-LV-NEXT:    vadd.i32 q2, q4, q2
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-LV-NEXT:    vmov.f32 s25, s12
+; CHECK-LV-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-LV-NEXT:    vmov.f32 s26, s15
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LV-NEXT:    vmov.f32 s30, s14
+; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LV-NEXT:    vmov.f32 s24, s17
+; CHECK-LV-NEXT:    vmov.f32 s27, s22
+; CHECK-LV-NEXT:    vmov.f32 s28, s16
+; CHECK-LV-NEXT:    vmov.f32 s29, s19
+; CHECK-LV-NEXT:    vmov.f32 s31, s21
+; CHECK-LV-NEXT:    vadd.i32 q6, q7, q6
+; CHECK-LV-NEXT:    vmov.f32 s12, s18
+; CHECK-LV-NEXT:    vmov.f32 s14, s20
+; CHECK-LV-NEXT:    vmov.f32 s15, s23
+; CHECK-LV-NEXT:    vadd.i32 q3, q6, q3
+; CHECK-LV-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v16i32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vmov.f32 s10, s2
+; CHECK-LIS-NEXT:    vmov.f32 s17, s0
+; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s9, s7
+; CHECK-LIS-NEXT:    vmov.f32 s16, s5
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vadd.i32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vadd.i32 q0, q2, q0
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LIS-NEXT:    vmov.f32 s17, s4
+; CHECK-LIS-NEXT:    vmov.f32 s18, s7
+; CHECK-LIS-NEXT:    vmov.f32 s22, s6
+; CHECK-LIS-NEXT:    vmov.f32 s16, s9
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s20, s8
+; CHECK-LIS-NEXT:    vmov.f32 s21, s11
+; CHECK-LIS-NEXT:    vmov.f32 s23, s13
+; CHECK-LIS-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-LIS-NEXT:    vmov.f32 s4, s10
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #176]
+; CHECK-LIS-NEXT:    vmov.f32 s6, s12
+; CHECK-LIS-NEXT:    vmov.f32 s7, s15
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-LIS-NEXT:    vadd.i32 q1, q4, q1
+; CHECK-LIS-NEXT:    vmov.f32 s18, s10
+; CHECK-LIS-NEXT:    vmov.f32 s25, s8
+; CHECK-LIS-NEXT:    vmov.f32 s26, s11
+; CHECK-LIS-NEXT:    vmov.f32 s16, s12
+; CHECK-LIS-NEXT:    vmov.f32 s17, s15
+; CHECK-LIS-NEXT:    vmov.f32 s24, s13
+; CHECK-LIS-NEXT:    vmov.f32 s27, s22
+; CHECK-LIS-NEXT:    vmov.f32 s19, s21
+; CHECK-LIS-NEXT:    vmov.f32 s8, s14
+; CHECK-LIS-NEXT:    vadd.i32 q4, q4, q6
+; CHECK-LIS-NEXT:    vmov.f32 s10, s20
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-LIS-NEXT:    vmov.f32 s11, s23
+; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-LIS-NEXT:    vadd.i32 q2, q4, q2
+; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-LIS-NEXT:    vmov.f32 s25, s12
+; CHECK-LIS-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-LIS-NEXT:    vmov.f32 s26, s15
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s30, s14
+; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LIS-NEXT:    vmov.f32 s24, s17
+; CHECK-LIS-NEXT:    vmov.f32 s27, s22
+; CHECK-LIS-NEXT:    vmov.f32 s28, s16
+; CHECK-LIS-NEXT:    vmov.f32 s29, s19
+; CHECK-LIS-NEXT:    vmov.f32 s31, s21
+; CHECK-LIS-NEXT:    vadd.i32 q6, q7, q6
+; CHECK-LIS-NEXT:    vmov.f32 s12, s18
+; CHECK-LIS-NEXT:    vmov.f32 s14, s20
+; CHECK-LIS-NEXT:    vmov.f32 s15, s23
+; CHECK-LIS-NEXT:    vadd.i32 q3, q6, q3
+; CHECK-LIS-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <48 x i32>, ptr %src, align 4
   %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
@@ -792,35 +932,65 @@ entry:
 ; i64
 
 define void @vld3_v2i64(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v2i64:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s12, s2
-; CHECK-NEXT:    vmov.f32 s13, s3
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov r0, r3, d5
-; CHECK-NEXT:    vmov r2, r4, d3
-; CHECK-NEXT:    vmov r6, r7, d0
-; CHECK-NEXT:    vmov r5, r8, d6
-; CHECK-NEXT:    vmov lr, r12, d1
-; CHECK-NEXT:    adds.w r0, r0, lr
-; CHECK-NEXT:    adc.w r3, r3, r12
-; CHECK-NEXT:    adds r0, r0, r2
-; CHECK-NEXT:    adc.w r2, r3, r4
-; CHECK-NEXT:    vmov r3, r4, d4
-; CHECK-NEXT:    adds r6, r6, r5
-; CHECK-NEXT:    adc.w r7, r7, r8
-; CHECK-NEXT:    adds r3, r3, r6
-; CHECK-NEXT:    adcs r7, r4
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r7, r2
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-LV-LABEL: vld3_v2i64:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-LV-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-LV-NEXT:    vmov.f32 s12, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s3
+; CHECK-LV-NEXT:    vmov.f32 s2, s4
+; CHECK-LV-NEXT:    vmov.f32 s3, s5
+; CHECK-LV-NEXT:    vmov r0, r3, d5
+; CHECK-LV-NEXT:    vmov r2, r4, d3
+; CHECK-LV-NEXT:    vmov r6, r7, d0
+; CHECK-LV-NEXT:    vmov r5, r8, d6
+; CHECK-LV-NEXT:    vmov lr, r12, d1
+; CHECK-LV-NEXT:    adds.w r0, r0, lr
+; CHECK-LV-NEXT:    adc.w r3, r3, r12
+; CHECK-LV-NEXT:    adds r0, r0, r2
+; CHECK-LV-NEXT:    adc.w r2, r3, r4
+; CHECK-LV-NEXT:    vmov r3, r4, d4
+; CHECK-LV-NEXT:    adds r6, r6, r5
+; CHECK-LV-NEXT:    adc.w r7, r7, r8
+; CHECK-LV-NEXT:    adds r3, r3, r6
+; CHECK-LV-NEXT:    adcs r7, r4
+; CHECK-LV-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-LV-NEXT:    vmov q0[3], q0[1], r7, r2
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LV-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+
+; CHECK-LIS-LABEL: vld3_v2i64:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-LIS-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s8, s2
+; CHECK-LIS-NEXT:    vmov.f32 s9, s3
+; CHECK-LIS-NEXT:    vmov.f32 s2, s4
+; CHECK-LIS-NEXT:    vmov.f32 s3, s5
+; CHECK-LIS-NEXT:    vmov r0, r2, d7
+; CHECK-LIS-NEXT:    vmov r3, r4, d3
+; CHECK-LIS-NEXT:    vmov r6, r7, d0
+; CHECK-LIS-NEXT:    vmov r5, r8, d4
+; CHECK-LIS-NEXT:    vmov lr, r12, d1
+; CHECK-LIS-NEXT:    adds.w r0, r0, lr
+; CHECK-LIS-NEXT:    adc.w r2, r2, r12
+; CHECK-LIS-NEXT:    adds r0, r0, r3
+; CHECK-LIS-NEXT:    adcs r2, r4
+; CHECK-LIS-NEXT:    vmov r3, r4, d6
+; CHECK-LIS-NEXT:    adds r6, r6, r5
+; CHECK-LIS-NEXT:    adc.w r7, r7, r8
+; CHECK-LIS-NEXT:    adds r3, r3, r6
+; CHECK-LIS-NEXT:    adcs r7, r4
+; CHECK-LIS-NEXT:    vmov q0[2], q0[0], r3, r0
+; CHECK-LIS-NEXT:    vmov q0[3], q0[1], r7, r2
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LIS-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <6 x i64>, ptr %src, align 4
   %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
@@ -892,8 +1062,10 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
 ;
 ; CHECK-LIS-LABEL: vld3_v4i64:
 ; CHECK-LIS:       @ %bb.0: @ %entry
-; CHECK-LIS-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-LIS-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-LIS-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-LIS-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-LIS-NEXT:    .pad #4
+; CHECK-LIS-NEXT:    sub sp, #4
 ; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11, d12}
 ; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11, d12}
 ; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0]
@@ -906,46 +1078,47 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vmov.f32 s2, s12
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s13
 ; CHECK-LIS-NEXT:    vmov r5, r4, d5
-; CHECK-LIS-NEXT:    vmov r3, r8, d7
+; CHECK-LIS-NEXT:    vmov r2, r8, d7
 ; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
 ; CHECK-LIS-NEXT:    vmov.f32 s24, s22
 ; CHECK-LIS-NEXT:    vmov.f32 s25, s23
+; CHECK-LIS-NEXT:    vmov r3, r7, d10
 ; CHECK-LIS-NEXT:    vmov lr, r12, d1
 ; CHECK-LIS-NEXT:    vmov.f32 s2, s12
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s13
-; CHECK-LIS-NEXT:    vmov r7, r6, d12
-; CHECK-LIS-NEXT:    adds.w r0, r5, lr
-; CHECK-LIS-NEXT:    adc.w r5, r4, r12
-; CHECK-LIS-NEXT:    adds.w lr, r0, r3
-; CHECK-LIS-NEXT:    vmov r4, r2, d10
-; CHECK-LIS-NEXT:    adc.w r12, r5, r8
-; CHECK-LIS-NEXT:    vmov r5, r0, d8
-; CHECK-LIS-NEXT:    adds r7, r7, r4
-; CHECK-LIS-NEXT:    adcs r2, r6
-; CHECK-LIS-NEXT:    adds r7, r7, r5
-; CHECK-LIS-NEXT:    adc.w r8, r2, r0
-; CHECK-LIS-NEXT:    vmov r6, r4, d1
-; CHECK-LIS-NEXT:    vmov r2, r5, d9
-; CHECK-LIS-NEXT:    vmov r3, r0, d0
-; CHECK-LIS-NEXT:    adds r2, r2, r6
-; CHECK-LIS-NEXT:    adc.w r6, r5, r4
-; CHECK-LIS-NEXT:    vmov r5, r4, d7
-; CHECK-LIS-NEXT:    adds r2, r2, r5
-; CHECK-LIS-NEXT:    adcs r6, r4
-; CHECK-LIS-NEXT:    vmov r5, r4, d2
-; CHECK-LIS-NEXT:    vmov q1[2], q1[0], r7, r2
-; CHECK-LIS-NEXT:    vmov q1[3], q1[1], r8, r6
+; CHECK-LIS-NEXT:    vmov r6, r9, d12
+; CHECK-LIS-NEXT:    adds.w r5, r5, lr
+; CHECK-LIS-NEXT:    adc.w r4, r4, r12
+; CHECK-LIS-NEXT:    adds.w lr, r5, r2
+; CHECK-LIS-NEXT:    adc.w r12, r4, r8
+; CHECK-LIS-NEXT:    vmov r4, r0, d8
+; CHECK-LIS-NEXT:    adds r3, r3, r6
+; CHECK-LIS-NEXT:    vmov r6, r2, d1
+; CHECK-LIS-NEXT:    adc.w r7, r7, r9
+; CHECK-LIS-NEXT:    adds r3, r3, r4
+; CHECK-LIS-NEXT:    adc.w r8, r7, r0
+; CHECK-LIS-NEXT:    vmov r7, r4, d9
+; CHECK-LIS-NEXT:    vmov r5, r0, d0
+; CHECK-LIS-NEXT:    adds r7, r7, r6
+; CHECK-LIS-NEXT:    adcs r2, r4
+; CHECK-LIS-NEXT:    vmov r6, r4, d7
+; CHECK-LIS-NEXT:    adds r7, r7, r6
+; CHECK-LIS-NEXT:    adcs r2, r4
+; CHECK-LIS-NEXT:    vmov r6, r4, d2
+; CHECK-LIS-NEXT:    vmov q1[2], q1[0], r3, r7
+; CHECK-LIS-NEXT:    vmov q1[3], q1[1], r8, r2
 ; CHECK-LIS-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-LIS-NEXT:    adds r3, r3, r5
+; CHECK-LIS-NEXT:    adds r5, r5, r6
 ; CHECK-LIS-NEXT:    adcs r0, r4
-; CHECK-LIS-NEXT:    vmov r4, r5, d4
-; CHECK-LIS-NEXT:    adds r3, r3, r4
-; CHECK-LIS-NEXT:    vmov q0[2], q0[0], r3, lr
-; CHECK-LIS-NEXT:    adcs r0, r5
+; CHECK-LIS-NEXT:    vmov r4, r6, d4
+; CHECK-LIS-NEXT:    adds r4, r4, r5
+; CHECK-LIS-NEXT:    vmov q0[2], q0[0], r4, lr
+; CHECK-LIS-NEXT:    adcs r0, r6
 ; CHECK-LIS-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11, d12}
-; CHECK-LIS-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-LIS-NEXT:    add sp, #4
+; CHECK-LIS-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %l1 = load <12 x i64>, ptr %src, align 4
   %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -960,19 +1133,33 @@ entry:
 ; f32
 
 define void @vld3_v2f32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v2f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldr s1, [r0, #16]
-; CHECK-NEXT:    vldr s5, [r0, #20]
-; CHECK-NEXT:    vmov.f32 s12, s8
-; CHECK-NEXT:    vmov.f32 s13, s11
-; CHECK-NEXT:    vmov.f32 s0, s9
-; CHECK-NEXT:    vadd.f32 q0, q3, q0
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vstmia r1, {s0, s1}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v2f32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LV-NEXT:    vldr s1, [r0, #16]
+; CHECK-LV-NEXT:    vldr s5, [r0, #20]
+; CHECK-LV-NEXT:    vmov.f32 s12, s8
+; CHECK-LV-NEXT:    vmov.f32 s13, s11
+; CHECK-LV-NEXT:    vmov.f32 s0, s9
+; CHECK-LV-NEXT:    vadd.f32 q0, q3, q0
+; CHECK-LV-NEXT:    vmov.f32 s4, s10
+; CHECK-LV-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-LV-NEXT:    vstmia r1, {s0, s1}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v2f32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LIS-NEXT:    vldr s5, [r0, #16]
+; CHECK-LIS-NEXT:    vldr s1, [r0, #20]
+; CHECK-LIS-NEXT:    vmov.f32 s12, s8
+; CHECK-LIS-NEXT:    vmov.f32 s13, s11
+; CHECK-LIS-NEXT:    vmov.f32 s4, s9
+; CHECK-LIS-NEXT:    vadd.f32 q1, q3, q1
+; CHECK-LIS-NEXT:    vmov.f32 s0, s10
+; CHECK-LIS-NEXT:    vadd.f32 q0, q1, q0
+; CHECK-LIS-NEXT:    vstmia r1, {s0, s1}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <6 x float>, ptr %src, align 4
   %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3>
@@ -985,29 +1172,53 @@ entry:
 }
 
 define void @vld3_v4f32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v4f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s15, s18
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v4f32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9}
+; CHECK-LV-NEXT:    vpush {d8, d9}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-LV-NEXT:    vmov.f32 s10, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s3
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s9, s7
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vmov.f32 s15, s18
+; CHECK-LV-NEXT:    vmov.f32 s11, s17
+; CHECK-LV-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-LV-NEXT:    vmov.f32 s0, s6
+; CHECK-LV-NEXT:    vmov.f32 s2, s16
+; CHECK-LV-NEXT:    vmov.f32 s3, s19
+; CHECK-LV-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LV-NEXT:    vpop {d8, d9}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v4f32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9}
+; CHECK-LIS-NEXT:    vpush {d8, d9}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vmov.f32 s10, s2
+; CHECK-LIS-NEXT:    vmov.f32 s17, s0
+; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s9, s7
+; CHECK-LIS-NEXT:    vmov.f32 s16, s5
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LIS-NEXT:    vpop {d8, d9}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <12 x float>, ptr %src, align 4
   %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1020,46 +1231,87 @@ entry:
 }
 
 define void @vld3_v8f32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v8f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s15, s18
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s20, s8
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vmov.f32 s7, s15
-; CHECK-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v8f32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-LV-NEXT:    vmov.f32 s10, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s3
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s9, s7
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vmov.f32 s15, s18
+; CHECK-LV-NEXT:    vmov.f32 s11, s17
+; CHECK-LV-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-LV-NEXT:    vmov.f32 s0, s6
+; CHECK-LV-NEXT:    vmov.f32 s2, s16
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LV-NEXT:    vmov.f32 s3, s19
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LV-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LV-NEXT:    vmov.f32 s17, s4
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LV-NEXT:    vmov.f32 s18, s7
+; CHECK-LV-NEXT:    vmov.f32 s22, s6
+; CHECK-LV-NEXT:    vmov.f32 s16, s9
+; CHECK-LV-NEXT:    vmov.f32 s19, s14
+; CHECK-LV-NEXT:    vmov.f32 s20, s8
+; CHECK-LV-NEXT:    vmov.f32 s21, s11
+; CHECK-LV-NEXT:    vmov.f32 s23, s13
+; CHECK-LV-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-LV-NEXT:    vmov.f32 s4, s10
+; CHECK-LV-NEXT:    vmov.f32 s6, s12
+; CHECK-LV-NEXT:    vmov.f32 s7, s15
+; CHECK-LV-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v8f32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vmov.f32 s10, s2
+; CHECK-LIS-NEXT:    vmov.f32 s17, s0
+; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s9, s7
+; CHECK-LIS-NEXT:    vmov.f32 s16, s5
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LIS-NEXT:    vmov.f32 s17, s4
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s18, s7
+; CHECK-LIS-NEXT:    vmov.f32 s22, s6
+; CHECK-LIS-NEXT:    vmov.f32 s16, s9
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s20, s8
+; CHECK-LIS-NEXT:    vmov.f32 s21, s11
+; CHECK-LIS-NEXT:    vmov.f32 s23, s13
+; CHECK-LIS-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-LIS-NEXT:    vmov.f32 s4, s10
+; CHECK-LIS-NEXT:    vmov.f32 s6, s12
+; CHECK-LIS-NEXT:    vmov.f32 s7, s15
+; CHECK-LIS-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <24 x float>, ptr %src, align 4
   %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1072,80 +1324,155 @@ entry:
 }
 
 define void @vld3_v16f32(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v16f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s15, s18
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vadd.f32 q2, q2, q3
-; CHECK-NEXT:    vmov.f32 s0, s6
-; CHECK-NEXT:    vmov.f32 s2, s16
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vadd.f32 q0, q2, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s20, s8
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
-; CHECK-NEXT:    vmov.f32 s6, s12
-; CHECK-NEXT:    vadd.f32 q4, q5, q4
-; CHECK-NEXT:    vmov.f32 s7, s15
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
-; CHECK-NEXT:    vadd.f32 q1, q4, q1
-; CHECK-NEXT:    vmov.f32 s18, s10
-; CHECK-NEXT:    vmov.f32 s21, s8
-; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vmov.f32 s16, s12
-; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.f32 s20, s13
-; CHECK-NEXT:    vmov.f32 s23, s26
-; CHECK-NEXT:    vmov.f32 s19, s25
-; CHECK-NEXT:    vadd.f32 q4, q4, q5
-; CHECK-NEXT:    vmov.f32 s8, s14
-; CHECK-NEXT:    vmov.f32 s10, s24
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-NEXT:    vmov.f32 s11, s27
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
-; CHECK-NEXT:    vadd.f32 q2, q4, q2
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
-; CHECK-NEXT:    vmov.f32 s25, s12
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s26, s15
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s30, s14
-; CHECK-NEXT:    vstrw.32 q1, [r1]
-; CHECK-NEXT:    vmov.f32 s24, s17
-; CHECK-NEXT:    vmov.f32 s27, s22
-; CHECK-NEXT:    vmov.f32 s28, s16
-; CHECK-NEXT:    vmov.f32 s29, s19
-; CHECK-NEXT:    vmov.f32 s31, s21
-; CHECK-NEXT:    vadd.f32 q6, q7, q6
-; CHECK-NEXT:    vmov.f32 s12, s18
-; CHECK-NEXT:    vmov.f32 s14, s20
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vadd.f32 q3, q6, q3
-; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v16f32:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-LV-NEXT:    vldrw.u32 q6, [r0, #176]
+; CHECK-LV-NEXT:    vmov.f32 s10, s2
+; CHECK-LV-NEXT:    vmov.f32 s13, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s3
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s9, s7
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vmov.f32 s15, s18
+; CHECK-LV-NEXT:    vmov.f32 s11, s17
+; CHECK-LV-NEXT:    vadd.f32 q2, q2, q3
+; CHECK-LV-NEXT:    vmov.f32 s0, s6
+; CHECK-LV-NEXT:    vmov.f32 s2, s16
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LV-NEXT:    vmov.f32 s3, s19
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LV-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LV-NEXT:    vmov.f32 s17, s4
+; CHECK-LV-NEXT:    vmov.f32 s18, s7
+; CHECK-LV-NEXT:    vmov.f32 s22, s6
+; CHECK-LV-NEXT:    vmov.f32 s16, s9
+; CHECK-LV-NEXT:    vmov.f32 s19, s14
+; CHECK-LV-NEXT:    vmov.f32 s20, s8
+; CHECK-LV-NEXT:    vmov.f32 s21, s11
+; CHECK-LV-NEXT:    vmov.f32 s23, s13
+; CHECK-LV-NEXT:    vmov.f32 s4, s10
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-LV-NEXT:    vmov.f32 s6, s12
+; CHECK-LV-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-LV-NEXT:    vmov.f32 s7, s15
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-LV-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-LV-NEXT:    vmov.f32 s18, s10
+; CHECK-LV-NEXT:    vmov.f32 s21, s8
+; CHECK-LV-NEXT:    vmov.f32 s22, s11
+; CHECK-LV-NEXT:    vmov.f32 s16, s12
+; CHECK-LV-NEXT:    vmov.f32 s17, s15
+; CHECK-LV-NEXT:    vmov.f32 s20, s13
+; CHECK-LV-NEXT:    vmov.f32 s23, s26
+; CHECK-LV-NEXT:    vmov.f32 s19, s25
+; CHECK-LV-NEXT:    vadd.f32 q4, q4, q5
+; CHECK-LV-NEXT:    vmov.f32 s8, s14
+; CHECK-LV-NEXT:    vmov.f32 s10, s24
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-LV-NEXT:    vmov.f32 s11, s27
+; CHECK-LV-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-LV-NEXT:    vadd.f32 q2, q4, q2
+; CHECK-LV-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-LV-NEXT:    vmov.f32 s25, s12
+; CHECK-LV-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-LV-NEXT:    vmov.f32 s26, s15
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LV-NEXT:    vmov.f32 s30, s14
+; CHECK-LV-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LV-NEXT:    vmov.f32 s24, s17
+; CHECK-LV-NEXT:    vmov.f32 s27, s22
+; CHECK-LV-NEXT:    vmov.f32 s28, s16
+; CHECK-LV-NEXT:    vmov.f32 s29, s19
+; CHECK-LV-NEXT:    vmov.f32 s31, s21
+; CHECK-LV-NEXT:    vadd.f32 q6, q7, q6
+; CHECK-LV-NEXT:    vmov.f32 s12, s18
+; CHECK-LV-NEXT:    vmov.f32 s14, s20
+; CHECK-LV-NEXT:    vmov.f32 s15, s23
+; CHECK-LV-NEXT:    vadd.f32 q3, q6, q3
+; CHECK-LV-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v16f32:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vmov.f32 s10, s2
+; CHECK-LIS-NEXT:    vmov.f32 s17, s0
+; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s9, s7
+; CHECK-LIS-NEXT:    vmov.f32 s16, s5
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s0, s6
+; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-LIS-NEXT:    vmov.f32 s17, s4
+; CHECK-LIS-NEXT:    vmov.f32 s18, s7
+; CHECK-LIS-NEXT:    vmov.f32 s22, s6
+; CHECK-LIS-NEXT:    vmov.f32 s16, s9
+; CHECK-LIS-NEXT:    vmov.f32 s19, s14
+; CHECK-LIS-NEXT:    vmov.f32 s20, s8
+; CHECK-LIS-NEXT:    vmov.f32 s21, s11
+; CHECK-LIS-NEXT:    vmov.f32 s23, s13
+; CHECK-LIS-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-LIS-NEXT:    vmov.f32 s4, s10
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #176]
+; CHECK-LIS-NEXT:    vmov.f32 s6, s12
+; CHECK-LIS-NEXT:    vmov.f32 s7, s15
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-LIS-NEXT:    vadd.f32 q1, q4, q1
+; CHECK-LIS-NEXT:    vmov.f32 s18, s10
+; CHECK-LIS-NEXT:    vmov.f32 s25, s8
+; CHECK-LIS-NEXT:    vmov.f32 s26, s11
+; CHECK-LIS-NEXT:    vmov.f32 s16, s12
+; CHECK-LIS-NEXT:    vmov.f32 s17, s15
+; CHECK-LIS-NEXT:    vmov.f32 s24, s13
+; CHECK-LIS-NEXT:    vmov.f32 s27, s22
+; CHECK-LIS-NEXT:    vmov.f32 s19, s21
+; CHECK-LIS-NEXT:    vmov.f32 s8, s14
+; CHECK-LIS-NEXT:    vadd.f32 q4, q4, q6
+; CHECK-LIS-NEXT:    vmov.f32 s10, s20
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #112]
+; CHECK-LIS-NEXT:    vmov.f32 s11, s23
+; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #128]
+; CHECK-LIS-NEXT:    vadd.f32 q2, q4, q2
+; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #96]
+; CHECK-LIS-NEXT:    vmov.f32 s25, s12
+; CHECK-LIS-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-LIS-NEXT:    vmov.f32 s26, s15
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-LIS-NEXT:    vmov.f32 s30, s14
+; CHECK-LIS-NEXT:    vstrw.32 q1, [r1]
+; CHECK-LIS-NEXT:    vmov.f32 s24, s17
+; CHECK-LIS-NEXT:    vmov.f32 s27, s22
+; CHECK-LIS-NEXT:    vmov.f32 s28, s16
+; CHECK-LIS-NEXT:    vmov.f32 s29, s19
+; CHECK-LIS-NEXT:    vmov.f32 s31, s21
+; CHECK-LIS-NEXT:    vadd.f32 q6, q7, q6
+; CHECK-LIS-NEXT:    vmov.f32 s12, s18
+; CHECK-LIS-NEXT:    vmov.f32 s14, s20
+; CHECK-LIS-NEXT:    vmov.f32 s15, s23
+; CHECK-LIS-NEXT:    vadd.f32 q3, q6, q3
+; CHECK-LIS-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <48 x float>, ptr %src, align 4
   %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
@@ -1284,86 +1611,167 @@ entry:
 }
 
 define void @vld3_v16f16(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v16f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s9
-; CHECK-NEXT:    vmov.f32 s5, s8
-; CHECK-NEXT:    vmovx.f16 s7, s12
-; CHECK-NEXT:    vins.f16 s5, s6
-; CHECK-NEXT:    vmov.f32 s6, s11
-; CHECK-NEXT:    vins.f16 s6, s7
-; CHECK-NEXT:    vmovx.f16 s16, s15
-; CHECK-NEXT:    vmov.f32 s7, s14
-; CHECK-NEXT:    vmovx.f16 s17, s3
-; CHECK-NEXT:    vins.f16 s7, s16
-; CHECK-NEXT:    vmovx.f16 s16, s0
-; CHECK-NEXT:    vins.f16 s16, s2
-; CHECK-NEXT:    vmovx.f16 s2, s1
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmovx.f16 s2, s8
-; CHECK-NEXT:    vins.f16 s3, s2
-; CHECK-NEXT:    vmovx.f16 s2, s11
-; CHECK-NEXT:    vmovx.f16 s18, s10
-; CHECK-NEXT:    vins.f16 s10, s2
-; CHECK-NEXT:    vmovx.f16 s2, s14
-; CHECK-NEXT:    vmovx.f16 s19, s13
-; CHECK-NEXT:    vins.f16 s13, s2
-; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vins.f16 s18, s12
-; CHECK-NEXT:    vins.f16 s19, s15
-; CHECK-NEXT:    vins.f16 s17, s9
-; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vmov.f32 s3, s13
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vadd.f16 q0, q0, q4
-; CHECK-NEXT:    vadd.f16 q3, q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s14, s6
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vmovx.f16 s14, s9
-; CHECK-NEXT:    vmov.f32 s13, s8
-; CHECK-NEXT:    vmovx.f16 s18, s10
-; CHECK-NEXT:    vins.f16 s13, s14
-; CHECK-NEXT:    vmovx.f16 s15, s0
-; CHECK-NEXT:    vmov.f32 s14, s11
-; CHECK-NEXT:    vins.f16 s18, s0
-; CHECK-NEXT:    vins.f16 s14, s15
-; CHECK-NEXT:    vmovx.f16 s16, s3
-; CHECK-NEXT:    vmov.f32 s15, s2
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vins.f16 s15, s16
-; CHECK-NEXT:    vmovx.f16 s16, s4
-; CHECK-NEXT:    vins.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s0, s8
-; CHECK-NEXT:    vmovx.f16 s17, s7
-; CHECK-NEXT:    vins.f16 s7, s0
-; CHECK-NEXT:    vmovx.f16 s0, s11
-; CHECK-NEXT:    vmovx.f16 s19, s1
-; CHECK-NEXT:    vins.f16 s10, s0
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vins.f16 s1, s0
-; CHECK-NEXT:    vins.f16 s16, s6
-; CHECK-NEXT:    vmov.f32 s5, s7
-; CHECK-NEXT:    vins.f16 s19, s3
-; CHECK-NEXT:    vins.f16 s17, s9
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vadd.f16 q0, q1, q4
-; CHECK-NEXT:    vadd.f16 q0, q0, q3
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: vld3_v16f16:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9}
+; CHECK-LV-NEXT:    vpush {d8, d9}
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-LV-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LV-NEXT:    vmovx.f16 s6, s2
+; CHECK-LV-NEXT:    vmov.f32 s4, s1
+; CHECK-LV-NEXT:    vins.f16 s4, s6
+; CHECK-LV-NEXT:    vmovx.f16 s6, s9
+; CHECK-LV-NEXT:    vmov.f32 s5, s8
+; CHECK-LV-NEXT:    vmovx.f16 s7, s12
+; CHECK-LV-NEXT:    vins.f16 s5, s6
+; CHECK-LV-NEXT:    vmov.f32 s6, s11
+; CHECK-LV-NEXT:    vins.f16 s6, s7
+; CHECK-LV-NEXT:    vmovx.f16 s16, s15
+; CHECK-LV-NEXT:    vmov.f32 s7, s14
+; CHECK-LV-NEXT:    vmovx.f16 s17, s3
+; CHECK-LV-NEXT:    vins.f16 s7, s16
+; CHECK-LV-NEXT:    vmovx.f16 s16, s0
+; CHECK-LV-NEXT:    vins.f16 s16, s2
+; CHECK-LV-NEXT:    vmovx.f16 s2, s1
+; CHECK-LV-NEXT:    vins.f16 s0, s2
+; CHECK-LV-NEXT:    vmovx.f16 s2, s8
+; CHECK-LV-NEXT:    vins.f16 s3, s2
+; CHECK-LV-NEXT:    vmovx.f16 s2, s11
+; CHECK-LV-NEXT:    vmovx.f16 s18, s10
+; CHECK-LV-NEXT:    vins.f16 s10, s2
+; CHECK-LV-NEXT:    vmovx.f16 s2, s14
+; CHECK-LV-NEXT:    vmovx.f16 s19, s13
+; CHECK-LV-NEXT:    vins.f16 s13, s2
+; CHECK-LV-NEXT:    vmov.f32 s1, s3
+; CHECK-LV-NEXT:    vins.f16 s18, s12
+; CHECK-LV-NEXT:    vins.f16 s19, s15
+; CHECK-LV-NEXT:    vins.f16 s17, s9
+; CHECK-LV-NEXT:    vmov.f32 s2, s10
+; CHECK-LV-NEXT:    vmov.f32 s3, s13
+; CHECK-LV-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-LV-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-LV-NEXT:    vadd.f16 q3, q0, q1
+; CHECK-LV-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-LV-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-LV-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-LV-NEXT:    vmovx.f16 s14, s6
+; CHECK-LV-NEXT:    vmov.f32 s12, s5
+; CHECK-LV-NEXT:    vins.f16 s12, s14
+; CHECK-LV-NEXT:    vmovx.f16 s14, s9
+; CHECK-LV-NEXT:    vmov.f32 s13, s8
+; CHECK-LV-NEXT:    vmovx.f16 s18, s10
+; CHECK-LV-NEXT:    vins.f16 s13, s14
+; CHECK-LV-NEXT:    vmovx.f16 s15, s0
+; CHECK-LV-NEXT:    vmov.f32 s14, s11
+; CHECK-LV-NEXT:    vins.f16 s18, s0
+; CHECK-LV-NEXT:    vins.f16 s14, s15
+; CHECK-LV-NEXT:    vmovx.f16 s16, s3
+; CHECK-LV-NEXT:    vmov.f32 s15, s2
+; CHECK-LV-NEXT:    vmovx.f16 s0, s5
+; CHECK-LV-NEXT:    vins.f16 s15, s16
+; CHECK-LV-NEXT:    vmovx.f16 s16, s4
+; CHECK-LV-NEXT:    vins.f16 s4, s0
+; CHECK-LV-NEXT:    vmovx.f16 s0, s8
+; CHECK-LV-NEXT:    vmovx.f16 s17, s7
+; CHECK-LV-NEXT:    vins.f16 s7, s0
+; CHECK-LV-NEXT:    vmovx.f16 s0, s11
+; CHECK-LV-NEXT:    vmovx.f16 s19, s1
+; CHECK-LV-NEXT:    vins.f16 s10, s0
+; CHECK-LV-NEXT:    vmovx.f16 s0, s2
+; CHECK-LV-NEXT:    vins.f16 s1, s0
+; CHECK-LV-NEXT:    vins.f16 s16, s6
+; CHECK-LV-NEXT:    vmov.f32 s5, s7
+; CHECK-LV-NEXT:    vins.f16 s19, s3
+; CHECK-LV-NEXT:    vins.f16 s17, s9
+; CHECK-LV-NEXT:    vmov.f32 s6, s10
+; CHECK-LV-NEXT:    vmov.f32 s7, s1
+; CHECK-LV-NEXT:    vadd.f16 q0, q1, q4
+; CHECK-LV-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-LV-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LV-NEXT:    vpop {d8, d9}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: vld3_v16f16:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9}
+; CHECK-LIS-NEXT:    vpush {d8, d9}
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vmovx.f16 s6, s2
+; CHECK-LIS-NEXT:    vmov.f32 s4, s1
+; CHECK-LIS-NEXT:    vins.f16 s4, s6
+; CHECK-LIS-NEXT:    vmovx.f16 s6, s9
+; CHECK-LIS-NEXT:    vmov.f32 s5, s8
+; CHECK-LIS-NEXT:    vmovx.f16 s7, s12
+; CHECK-LIS-NEXT:    vins.f16 s5, s6
+; CHECK-LIS-NEXT:    vmov.f32 s6, s11
+; CHECK-LIS-NEXT:    vins.f16 s6, s7
+; CHECK-LIS-NEXT:    vmovx.f16 s16, s15
+; CHECK-LIS-NEXT:    vmov.f32 s7, s14
+; CHECK-LIS-NEXT:    vmovx.f16 s17, s3
+; CHECK-LIS-NEXT:    vins.f16 s7, s16
+; CHECK-LIS-NEXT:    vmovx.f16 s16, s0
+; CHECK-LIS-NEXT:    vins.f16 s16, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s1
+; CHECK-LIS-NEXT:    vins.f16 s0, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s8
+; CHECK-LIS-NEXT:    vins.f16 s3, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s11
+; CHECK-LIS-NEXT:    vmovx.f16 s18, s10
+; CHECK-LIS-NEXT:    vins.f16 s10, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s14
+; CHECK-LIS-NEXT:    vmovx.f16 s19, s13
+; CHECK-LIS-NEXT:    vins.f16 s13, s2
+; CHECK-LIS-NEXT:    vmov.f32 s1, s3
+; CHECK-LIS-NEXT:    vins.f16 s18, s12
+; CHECK-LIS-NEXT:    vins.f16 s19, s15
+; CHECK-LIS-NEXT:    vins.f16 s17, s9
+; CHECK-LIS-NEXT:    vmov.f32 s2, s10
+; CHECK-LIS-NEXT:    vmov.f32 s3, s13
+; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-LIS-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-LIS-NEXT:    vadd.f16 q3, q0, q1
+; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-LIS-NEXT:    vstrw.32 q3, [r1, #16]
+; CHECK-LIS-NEXT:    vmovx.f16 s14, s2
+; CHECK-LIS-NEXT:    vmov.f32 s12, s1
+; CHECK-LIS-NEXT:    vins.f16 s12, s14
+; CHECK-LIS-NEXT:    vmovx.f16 s14, s9
+; CHECK-LIS-NEXT:    vmov.f32 s13, s8
+; CHECK-LIS-NEXT:    vmovx.f16 s15, s4
+; CHECK-LIS-NEXT:    vins.f16 s13, s14
+; CHECK-LIS-NEXT:    vmov.f32 s14, s11
+; CHECK-LIS-NEXT:    vins.f16 s14, s15
+; CHECK-LIS-NEXT:    vmovx.f16 s16, s7
+; CHECK-LIS-NEXT:    vmov.f32 s15, s6
+; CHECK-LIS-NEXT:    vmovx.f16 s17, s3
+; CHECK-LIS-NEXT:    vins.f16 s15, s16
+; CHECK-LIS-NEXT:    vmovx.f16 s16, s0
+; CHECK-LIS-NEXT:    vins.f16 s16, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s1
+; CHECK-LIS-NEXT:    vins.f16 s0, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s8
+; CHECK-LIS-NEXT:    vins.f16 s3, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s11
+; CHECK-LIS-NEXT:    vmovx.f16 s18, s10
+; CHECK-LIS-NEXT:    vins.f16 s10, s2
+; CHECK-LIS-NEXT:    vmovx.f16 s2, s6
+; CHECK-LIS-NEXT:    vmovx.f16 s19, s5
+; CHECK-LIS-NEXT:    vins.f16 s5, s2
+; CHECK-LIS-NEXT:    vmov.f32 s1, s3
+; CHECK-LIS-NEXT:    vins.f16 s18, s4
+; CHECK-LIS-NEXT:    vins.f16 s19, s7
+; CHECK-LIS-NEXT:    vins.f16 s17, s9
+; CHECK-LIS-NEXT:    vmov.f32 s2, s10
+; CHECK-LIS-NEXT:    vmov.f32 s3, s5
+; CHECK-LIS-NEXT:    vadd.f16 q0, q0, q4
+; CHECK-LIS-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
+; CHECK-LIS-NEXT:    vpop {d8, d9}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %l1 = load <48 x half>, ptr %src, align 4
   %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>

>From abcf0999590d9d08d05b5e04808628996ee941f9 Mon Sep 17 00:00:00 2001
From: Aleksandr Levin <aleksandr.levin at codasip.com>
Date: Tue, 25 Feb 2025 16:25:42 +0100
Subject: [PATCH 5/6] Adjusted tests to newly pulled and merged upstream
 changes

---
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll | 124 +++++++++++++++--------
 llvm/test/CodeGen/Thumb2/mve-vld3.ll    | 125 ++++++++++++------------
 2 files changed, 143 insertions(+), 106 deletions(-)

diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index d5dec9d529238..82c8d50e518b0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -1493,27 +1493,47 @@ entry:
   ret <2 x double> %out
 }
 define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) {
-; CHECK-LABEL: shuffle9_f64:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:    vmov.f32 s16, s0
-; CHECK-NEXT:    vmov.f32 s18, s20
-; CHECK-NEXT:    vmov.f32 s20, s2
-; CHECK-NEXT:    vmov.f32 s10, s12
-; CHECK-NEXT:    vmov.f32 s19, s21
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vmov.f32 s21, s3
-; CHECK-NEXT:    vmov q0, q4
-; CHECK-NEXT:    vmov.f32 s12, s6
-; CHECK-NEXT:    vmov.f32 s11, s13
-; CHECK-NEXT:    vmov.f32 s9, s5
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov q1, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: shuffle9_f64:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vmov q5, q2
+; CHECK-LV-NEXT:    vmov.f32 s16, s0
+; CHECK-LV-NEXT:    vmov.f32 s18, s20
+; CHECK-LV-NEXT:    vmov.f32 s20, s2
+; CHECK-LV-NEXT:    vmov.f32 s10, s12
+; CHECK-LV-NEXT:    vmov.f32 s19, s21
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s17, s1
+; CHECK-LV-NEXT:    vmov.f32 s21, s3
+; CHECK-LV-NEXT:    vmov q0, q4
+; CHECK-LV-NEXT:    vmov.f32 s12, s6
+; CHECK-LV-NEXT:    vmov.f32 s11, s13
+; CHECK-LV-NEXT:    vmov.f32 s9, s5
+; CHECK-LV-NEXT:    vmov.f32 s13, s7
+; CHECK-LV-NEXT:    vmov q1, q5
+; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: shuffle9_f64:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vmov q5, q2
+; CHECK-LIS-NEXT:    vmov q4, q0
+; CHECK-LIS-NEXT:    vmov.f32 s2, s20
+; CHECK-LIS-NEXT:    vmov.f32 s20, s18
+; CHECK-LIS-NEXT:    vmov.f32 s10, s12
+; CHECK-LIS-NEXT:    vmov.f32 s3, s21
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s21, s19
+; CHECK-LIS-NEXT:    vmov.f32 s12, s6
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s9, s5
+; CHECK-LIS-NEXT:    vmov.f32 s13, s7
+; CHECK-LIS-NEXT:    vmov q1, q5
+; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x double> %out
@@ -1586,27 +1606,47 @@ entry:
   ret <2 x i64> %out
 }
 define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) {
-; CHECK-LABEL: shuffle9_i64:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:    vmov.f32 s16, s0
-; CHECK-NEXT:    vmov.f32 s18, s20
-; CHECK-NEXT:    vmov.f32 s20, s2
-; CHECK-NEXT:    vmov.f32 s10, s12
-; CHECK-NEXT:    vmov.f32 s19, s21
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vmov.f32 s21, s3
-; CHECK-NEXT:    vmov q0, q4
-; CHECK-NEXT:    vmov.f32 s12, s6
-; CHECK-NEXT:    vmov.f32 s11, s13
-; CHECK-NEXT:    vmov.f32 s9, s5
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov q1, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
-; CHECK-NEXT:    bx lr
+; CHECK-LV-LABEL: shuffle9_i64:
+; CHECK-LV:       @ %bb.0: @ %entry
+; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    vmov q5, q2
+; CHECK-LV-NEXT:    vmov.f32 s16, s0
+; CHECK-LV-NEXT:    vmov.f32 s18, s20
+; CHECK-LV-NEXT:    vmov.f32 s20, s2
+; CHECK-LV-NEXT:    vmov.f32 s10, s12
+; CHECK-LV-NEXT:    vmov.f32 s19, s21
+; CHECK-LV-NEXT:    vmov.f32 s8, s4
+; CHECK-LV-NEXT:    vmov.f32 s17, s1
+; CHECK-LV-NEXT:    vmov.f32 s21, s3
+; CHECK-LV-NEXT:    vmov q0, q4
+; CHECK-LV-NEXT:    vmov.f32 s12, s6
+; CHECK-LV-NEXT:    vmov.f32 s11, s13
+; CHECK-LV-NEXT:    vmov.f32 s9, s5
+; CHECK-LV-NEXT:    vmov.f32 s13, s7
+; CHECK-LV-NEXT:    vmov q1, q5
+; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LV-NEXT:    bx lr
+
+; CHECK-LIS-LABEL: shuffle9_i64:
+; CHECK-LIS:       @ %bb.0: @ %entry
+; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    vmov q5, q2
+; CHECK-LIS-NEXT:    vmov q4, q0
+; CHECK-LIS-NEXT:    vmov.f32 s2, s20
+; CHECK-LIS-NEXT:    vmov.f32 s20, s18
+; CHECK-LIS-NEXT:    vmov.f32 s10, s12
+; CHECK-LIS-NEXT:    vmov.f32 s3, s21
+; CHECK-LIS-NEXT:    vmov.f32 s8, s4
+; CHECK-LIS-NEXT:    vmov.f32 s21, s19
+; CHECK-LIS-NEXT:    vmov.f32 s12, s6
+; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s9, s5
+; CHECK-LIS-NEXT:    vmov.f32 s13, s7
+; CHECK-LIS-NEXT:    vmov q1, q5
+; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-LIS-NEXT:    bx lr
 entry:
   %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x i64> %out
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index edde3383d5c91..903382baac865 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -973,15 +973,15 @@ define void @vld3_v2i64(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vmov.f32 s9, s3
 ; CHECK-LIS-NEXT:    vmov.f32 s2, s4
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s5
-; CHECK-LIS-NEXT:    vmov r0, r2, d7
-; CHECK-LIS-NEXT:    vmov r3, r4, d3
+; CHECK-LIS-NEXT:    vmov r0, r3, d7
+; CHECK-LIS-NEXT:    vmov r2, r4, d3
 ; CHECK-LIS-NEXT:    vmov r6, r7, d0
 ; CHECK-LIS-NEXT:    vmov r5, r8, d4
 ; CHECK-LIS-NEXT:    vmov lr, r12, d1
 ; CHECK-LIS-NEXT:    adds.w r0, r0, lr
-; CHECK-LIS-NEXT:    adc.w r2, r2, r12
-; CHECK-LIS-NEXT:    adds r0, r0, r3
-; CHECK-LIS-NEXT:    adcs r2, r4
+; CHECK-LIS-NEXT:    adc.w r3, r3, r12
+; CHECK-LIS-NEXT:    adds r0, r0, r2
+; CHECK-LIS-NEXT:    adc.w r2, r3, r4
 ; CHECK-LIS-NEXT:    vmov r3, r4, d6
 ; CHECK-LIS-NEXT:    adds r6, r6, r5
 ; CHECK-LIS-NEXT:    adc.w r7, r7, r8
@@ -1064,10 +1064,8 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
 ;
 ; CHECK-LIS-LABEL: vld3_v4i64:
 ; CHECK-LIS:       @ %bb.0: @ %entry
-; CHECK-LIS-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-LIS-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-LIS-NEXT:    .pad #4
-; CHECK-LIS-NEXT:    sub sp, #4
+; CHECK-LIS-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-LIS-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11, d12}
 ; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11, d12}
 ; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0]
@@ -1080,7 +1078,7 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vmov.f32 s2, s12
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s13
 ; CHECK-LIS-NEXT:    vmov r5, r4, d5
-; CHECK-LIS-NEXT:    vmov r2, r8, d7
+; CHECK-LIS-NEXT:    vmov r3, r8, d7
 ; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
 ; CHECK-LIS-NEXT:    vmov.f32 s24, s18
 ; CHECK-LIS-NEXT:    vmov.f32 s25, s19
@@ -1089,40 +1087,39 @@ define void @vld3_v4i64(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vmov lr, r12, d1
 ; CHECK-LIS-NEXT:    vmov.f32 s2, s12
 ; CHECK-LIS-NEXT:    vmov.f32 s3, s13
-; CHECK-LIS-NEXT:    vmov r6, r7, d12
+; CHECK-LIS-NEXT:    vmov r7, r6, d12
 ; CHECK-LIS-NEXT:    adds.w r0, r5, lr
 ; CHECK-LIS-NEXT:    adc.w r5, r4, r12
 ; CHECK-LIS-NEXT:    adds.w lr, r0, r3
 ; CHECK-LIS-NEXT:    vmov r4, r2, d8
 ; CHECK-LIS-NEXT:    adc.w r12, r5, r8
 ; CHECK-LIS-NEXT:    vmov r5, r0, d10
-; CHECK-LIS-NEXT:    adds r6, r6, r4
-; CHECK-LIS-NEXT:    adcs r2, r7
-; CHECK-LIS-NEXT:    adds r6, r6, r5
+; CHECK-LIS-NEXT:    adds r7, r7, r4
+; CHECK-LIS-NEXT:    adcs r2, r6
+; CHECK-LIS-NEXT:    adds r7, r7, r5
 ; CHECK-LIS-NEXT:    adc.w r8, r2, r0
-; CHECK-LIS-NEXT:    vmov r7, r4, d1
+; CHECK-LIS-NEXT:    vmov r6, r4, d1
 ; CHECK-LIS-NEXT:    vmov r2, r5, d3
 ; CHECK-LIS-NEXT:    vmov r3, r0, d0
-; CHECK-LIS-NEXT:    adds r2, r2, r7
-; CHECK-LIS-NEXT:    adc.w r7, r5, r4
+; CHECK-LIS-NEXT:    adds r2, r2, r6
+; CHECK-LIS-NEXT:    adc.w r6, r5, r4
 ; CHECK-LIS-NEXT:    vmov r5, r4, d7
 ; CHECK-LIS-NEXT:    adds r2, r2, r5
-; CHECK-LIS-NEXT:    adcs r7, r4
+; CHECK-LIS-NEXT:    adcs r6, r4
 ; CHECK-LIS-NEXT:    vmov r5, r4, d2
-; CHECK-LIS-NEXT:    vmov q1[2], q1[0], r6, r2
-; CHECK-LIS-NEXT:    vmov q1[3], q1[1], r8, r7
+; CHECK-LIS-NEXT:    vmov q1[2], q1[0], r7, r2
+; CHECK-LIS-NEXT:    vmov q1[3], q1[1], r8, r6
 ; CHECK-LIS-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-LIS-NEXT:    adds r5, r5, r6
+; CHECK-LIS-NEXT:    adds r3, r3, r5
 ; CHECK-LIS-NEXT:    adcs r0, r4
-; CHECK-LIS-NEXT:    vmov r4, r6, d4
-; CHECK-LIS-NEXT:    adds r4, r4, r5
-; CHECK-LIS-NEXT:    vmov q0[2], q0[0], r4, lr
-; CHECK-LIS-NEXT:    adcs r0, r6
+; CHECK-LIS-NEXT:    vmov r4, r5, d4
+; CHECK-LIS-NEXT:    adds r3, r3, r4
+; CHECK-LIS-NEXT:    vmov q0[2], q0[0], r3, lr
+; CHECK-LIS-NEXT:    adcs r0, r5
 ; CHECK-LIS-NEXT:    vmov q0[3], q0[1], r0, r12
 ; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11, d12}
-; CHECK-LIS-NEXT:    add sp, #4
-; CHECK-LIS-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-LIS-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:
   %l1 = load <12 x i64>, ptr %src, align 4
   %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1206,19 +1203,19 @@ define void @vld3_v4f32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vpush {d8, d9}
 ; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-LIS-NEXT:    vmov.f32 s10, s2
-; CHECK-LIS-NEXT:    vmov.f32 s17, s0
-; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s13, s0
+; CHECK-LIS-NEXT:    vmov.f32 s14, s3
 ; CHECK-LIS-NEXT:    vmov.f32 s8, s4
 ; CHECK-LIS-NEXT:    vmov.f32 s9, s7
-; CHECK-LIS-NEXT:    vmov.f32 s16, s5
-; CHECK-LIS-NEXT:    vmov.f32 s19, s14
-; CHECK-LIS-NEXT:    vmov.f32 s11, s13
-; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q4
+; CHECK-LIS-NEXT:    vmov.f32 s12, s5
+; CHECK-LIS-NEXT:    vmov.f32 s15, s18
+; CHECK-LIS-NEXT:    vmov.f32 s11, s17
+; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q3
 ; CHECK-LIS-NEXT:    vmov.f32 s0, s6
-; CHECK-LIS-NEXT:    vmov.f32 s2, s12
-; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vmov.f32 s2, s16
+; CHECK-LIS-NEXT:    vmov.f32 s3, s19
 ; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
 ; CHECK-LIS-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LIS-NEXT:    vpop {d8, d9}
@@ -1282,20 +1279,20 @@ define void @vld3_v8f32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #80]
 ; CHECK-LIS-NEXT:    vmov.f32 s10, s2
-; CHECK-LIS-NEXT:    vmov.f32 s17, s0
-; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s13, s0
+; CHECK-LIS-NEXT:    vmov.f32 s14, s3
 ; CHECK-LIS-NEXT:    vmov.f32 s8, s4
 ; CHECK-LIS-NEXT:    vmov.f32 s9, s7
-; CHECK-LIS-NEXT:    vmov.f32 s16, s5
-; CHECK-LIS-NEXT:    vmov.f32 s19, s14
-; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s12, s5
+; CHECK-LIS-NEXT:    vmov.f32 s15, s18
+; CHECK-LIS-NEXT:    vmov.f32 s11, s17
+; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q3
 ; CHECK-LIS-NEXT:    vmov.f32 s0, s6
-; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q4
-; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vmov.f32 s2, s16
 ; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vmov.f32 s3, s19
 ; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
 ; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
@@ -1409,20 +1406,21 @@ define void @vld3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-LIS-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-LIS-NEXT:    vldrw.u32 q6, [r0, #176]
 ; CHECK-LIS-NEXT:    vmov.f32 s10, s2
-; CHECK-LIS-NEXT:    vmov.f32 s17, s0
-; CHECK-LIS-NEXT:    vmov.f32 s18, s3
+; CHECK-LIS-NEXT:    vmov.f32 s13, s0
+; CHECK-LIS-NEXT:    vmov.f32 s14, s3
 ; CHECK-LIS-NEXT:    vmov.f32 s8, s4
 ; CHECK-LIS-NEXT:    vmov.f32 s9, s7
-; CHECK-LIS-NEXT:    vmov.f32 s16, s5
-; CHECK-LIS-NEXT:    vmov.f32 s19, s14
-; CHECK-LIS-NEXT:    vmov.f32 s11, s13
+; CHECK-LIS-NEXT:    vmov.f32 s12, s5
+; CHECK-LIS-NEXT:    vmov.f32 s15, s18
+; CHECK-LIS-NEXT:    vmov.f32 s11, s17
+; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q3
 ; CHECK-LIS-NEXT:    vmov.f32 s0, s6
-; CHECK-LIS-NEXT:    vadd.f32 q2, q2, q4
-; CHECK-LIS-NEXT:    vmov.f32 s2, s12
+; CHECK-LIS-NEXT:    vmov.f32 s2, s16
 ; CHECK-LIS-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-LIS-NEXT:    vmov.f32 s3, s15
+; CHECK-LIS-NEXT:    vmov.f32 s3, s19
 ; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-LIS-NEXT:    vadd.f32 q0, q2, q0
 ; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0]
@@ -1434,27 +1432,26 @@ define void @vld3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-LIS-NEXT:    vmov.f32 s20, s8
 ; CHECK-LIS-NEXT:    vmov.f32 s21, s11
 ; CHECK-LIS-NEXT:    vmov.f32 s23, s13
-; CHECK-LIS-NEXT:    vadd.f32 q4, q5, q4
 ; CHECK-LIS-NEXT:    vmov.f32 s4, s10
 ; CHECK-LIS-NEXT:    vldrw.u32 q2, [r0, #160]
-; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #176]
 ; CHECK-LIS-NEXT:    vmov.f32 s6, s12
+; CHECK-LIS-NEXT:    vadd.f32 q4, q5, q4
 ; CHECK-LIS-NEXT:    vmov.f32 s7, s15
 ; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #144]
 ; CHECK-LIS-NEXT:    vadd.f32 q1, q4, q1
 ; CHECK-LIS-NEXT:    vmov.f32 s18, s10
-; CHECK-LIS-NEXT:    vmov.f32 s25, s8
-; CHECK-LIS-NEXT:    vmov.f32 s26, s11
+; CHECK-LIS-NEXT:    vmov.f32 s21, s8
+; CHECK-LIS-NEXT:    vmov.f32 s22, s11
 ; CHECK-LIS-NEXT:    vmov.f32 s16, s12
 ; CHECK-LIS-NEXT:    vmov.f32 s17, s15
-; CHECK-LIS-NEXT:    vmov.f32 s24, s13
-; CHECK-LIS-NEXT:    vmov.f32 s27, s22
-; CHECK-LIS-NEXT:    vmov.f32 s19, s21
+; CHECK-LIS-NEXT:    vmov.f32 s20, s13
+; CHECK-LIS-NEXT:    vmov.f32 s23, s26
+; CHECK-LIS-NEXT:    vmov.f32 s19, s25
+; CHECK-LIS-NEXT:    vadd.f32 q4, q4, q5
 ; CHECK-LIS-NEXT:    vmov.f32 s8, s14
-; CHECK-LIS-NEXT:    vadd.f32 q4, q4, q6
-; CHECK-LIS-NEXT:    vmov.f32 s10, s20
+; CHECK-LIS-NEXT:    vmov.f32 s10, s24
 ; CHECK-LIS-NEXT:    vldrw.u32 q3, [r0, #112]
-; CHECK-LIS-NEXT:    vmov.f32 s11, s23
+; CHECK-LIS-NEXT:    vmov.f32 s11, s27
 ; CHECK-LIS-NEXT:    vldrw.u32 q5, [r0, #128]
 ; CHECK-LIS-NEXT:    vadd.f32 q2, q4, q2
 ; CHECK-LIS-NEXT:    vldrw.u32 q4, [r0, #96]

>From cd24a10664068e78cd482a2f723f91cf867f10f8 Mon Sep 17 00:00:00 2001
From: Aleksandr Levin <aleksandr.levin at codasip.com>
Date: Tue, 25 Feb 2025 17:21:06 +0100
Subject: [PATCH 6/6] Typo fix

---
 llvm/lib/CodeGen/SlotIndexes.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index 1d6164132f9ab..98973b4911ec7 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -218,13 +218,13 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
            (includeStart || !pastStart) &&
            "Decremented past the beginning of region to repair.");
 
-    MachineInstr *slotMI = ListI->getInstr();
+    MachineInstr *SlotMI = ListI->getInstr();
     MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
     bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
     bool MIIndexNotFound = MI && mi2iMap.find(MI) == mi2iMap.end();
-    bool slotMIRemoved = false;
+    bool SlotMIRemoved = false;
 
-    if (slotMI == MI && !MBBIAtBegin) {
+    if (SlotMI == MI && !MBBIAtBegin) {
       --ListI;
       if (MBBI != Begin)
         --MBBI;
@@ -243,16 +243,16 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
         oldIndexesRemoved = true;
       else
         --ListI;
-      if (slotMI) {
-        removeMachineInstrFromMaps(*slotMI);
-        slotMIRemoved = true;
+      if (SlotMI) {
+        removeMachineInstrFromMaps(*SlotMI);
+        SlotMIRemoved = true;
       }
     }
 
-    MachineInstr *instrToInsert = slotMIRemoved ? slotMI : MI;
+    MachineInstr *instrToInsert = SlotMIRemoved ? SlotMI : MI;
 
     // Insert instruction back into the maps after passing it/removing the index
-    if ((MIIndexNotFound || slotMIRemoved) && instrToInsert->getParent() != nullptr && !instrToInsert->isDebugOrPseudoInstr())
+    if ((MIIndexNotFound || SlotMIRemoved) && instrToInsert->getParent() != nullptr && !instrToInsert->isDebugOrPseudoInstr())
       insertMachineInstrInMaps(*instrToInsert);
   }
 }



More information about the llvm-commits mailing list