[llvm] [IA]: Construct (de)interleave4 out of (de)interleave2 (PR #89276)
Hassnaa Hamdi via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 21 14:26:07 PDT 2024
https://github.com/hassnaaHamdi updated https://github.com/llvm/llvm-project/pull/89276
>From 611cbb4f2ba7d35de53953939c0031c4beb9ea55 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 15 May 2024 23:44:44 +0000
Subject: [PATCH 1/7] [AArch64][Interleave]: Add test precommit
Change-Id: I5e2613156a482dcadae3e4cfa1bacdf7f3293fe2
---
.../AArch64/sve-interleave_accesses4-load.ll | 106 ++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
new file mode 100644
index 0000000000000..dcade71ccb684
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: interleave:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld2w { z1.s, z2.s }, p0/z, [x1]
+; CHECK-NEXT: ld2w { z3.s, z4.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: uzp2 z5.s, z1.s, z3.s
+; CHECK-NEXT: uzp1 z6.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z7.s, z2.s, z4.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z4.s
+; CHECK-NEXT: add z2.s, z0.s, z6.s
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: lsl z3.s, p0/m, z3.s, z0.s
+; CHECK-NEXT: sub z1.s, z1.s, z0.s
+; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z7.s
+; CHECK-NEXT: zip1 z4.s, z2.s, z3.s
+; CHECK-NEXT: zip2 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z5.s, z1.s, z0.s
+; CHECK-NEXT: zip2 z3.s, z1.s, z0.s
+; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
+; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %wide.vec = load <vscale x 16 x i32>, ptr %a, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = add nsw <vscale x 4 x i32> %x, %5
+ %10 = sub nsw <vscale x 4 x i32> %7, %x
+ %11 = shl <vscale x 4 x i32> %6, %x
+ %12 = ashr <vscale x 4 x i32> %8, %x
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %9, <vscale x 4 x i32> %11)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %10, <vscale x 4 x i32> %12)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %dst, align 4
+ ret void
+}
+
+define void @wide_interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 8 x i32> %x) {
+; CHECK-LABEL: wide_interleave:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld2w { z2.s, z3.s }, p0/z, [x1]
+; CHECK-NEXT: ld2w { z4.s, z5.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: ld2w { z6.s, z7.s }, p0/z, [x1, #4, mul vl]
+; CHECK-NEXT: ld2w { z24.s, z25.s }, p0/z, [x1, #6, mul vl]
+; CHECK-NEXT: uzp2 z26.s, z2.s, z4.s
+; CHECK-NEXT: uzp1 z27.s, z2.s, z4.s
+; CHECK-NEXT: uzp2 z28.s, z3.s, z5.s
+; CHECK-NEXT: uzp1 z2.s, z3.s, z5.s
+; CHECK-NEXT: add z3.s, z0.s, z27.s
+; CHECK-NEXT: movprfx z4, z26
+; CHECK-NEXT: lsl z4.s, p0/m, z4.s, z0.s
+; CHECK-NEXT: sub z2.s, z2.s, z0.s
+; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z28.s
+; CHECK-NEXT: zip1 z26.s, z3.s, z4.s
+; CHECK-NEXT: zip2 z3.s, z3.s, z4.s
+; CHECK-NEXT: zip1 z27.s, z2.s, z0.s
+; CHECK-NEXT: zip2 z4.s, z2.s, z0.s
+; CHECK-NEXT: uzp2 z0.s, z6.s, z24.s
+; CHECK-NEXT: uzp1 z2.s, z6.s, z24.s
+; CHECK-NEXT: st2w { z26.s, z27.s }, p0, [x0]
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: add z2.s, z1.s, z2.s
+; CHECK-NEXT: st2w { z3.s, z4.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: uzp2 z3.s, z7.s, z25.s
+; CHECK-NEXT: uzp1 z4.s, z7.s, z25.s
+; CHECK-NEXT: zip1 z5.s, z2.s, z0.s
+; CHECK-NEXT: sub z4.s, z4.s, z1.s
+; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: zip2 z2.s, z2.s, z0.s
+; CHECK-NEXT: zip1 z6.s, z4.s, z1.s
+; CHECK-NEXT: zip2 z3.s, z4.s, z1.s
+; CHECK-NEXT: st2w { z5.s, z6.s }, p0, [x0, #4, mul vl]
+; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #6, mul vl]
+; CHECK-NEXT: ret
+ %wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
+ %root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 1
+ %9 = add nsw <vscale x 8 x i32> %x, %5
+ %10 = sub nsw <vscale x 8 x i32> %7, %x
+ %11 = shl <vscale x 8 x i32> %6, %x
+ %12 = ashr <vscale x 8 x i32> %8, %x
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %9, <vscale x 8 x i32> %11)
+ %interleaved.vec61 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %10, <vscale x 8 x i32> %12)
+ %interleaved.vec62 = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.vec, <vscale x 16 x i32> %interleaved.vec61)
+ store <vscale x 32 x i32> %interleaved.vec62, ptr %dst, align 4
+ ret void
+}
>From 06510cd473757f765c3f1b9759eb89af79a56268 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 18 Apr 2024 17:30:51 +0000
Subject: [PATCH 2/7] [IA]: Construct (de)interleave4 out of (de)interleave2
- The InterleavedAccess pass is updated to spot load/store (de)interleave4-like sequences
and emit the equivalent sve.ld4 or sve.st4 intrinsics on targets that support scalable
vectors (see the IR sketch below).
- Tests are added for targets that support scalable vectors.
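As a minimal sketch of the load side (value names here are illustrative; the intrinsic
signatures are the same ones used in the tests below), the pass now recognises a tree of
deinterleave2 calls whose four leaves correspond to the fields of a factor-4 interleaved
access:

  %wide = load <vscale x 16 x i32>, ptr %p, align 4
  ; Outer deinterleave splits even/odd lanes.
  %root = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
  %ev = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 0
  %od = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 1
  ; Inner deinterleaves produce the four leaves; the pass rewrites their uses to the
  ; results of a single target load intrinsic (e.g. sve.ld4 on AArch64).
  %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %ev)
  %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %od)

The store side mirrors this: a tree of interleave2 calls with four leaf operands is
lowered to a single sve.st4 (or vsseg4 on RISC-V), as exercised by the added tests.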
Change-Id: I76ef31080ddd72b182c1a3b1752a6178dc78ea84
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 83 +++++++++++++++++--
.../Target/AArch64/AArch64ISelLowering.cpp | 44 ++++++----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +++++++--
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 +++++++++++++++++
.../RISCV/rvv/sve-deinterleave-load.ll | 74 +++++++++++++++++
8 files changed, 296 insertions(+), 30 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f4920a2..2f6db5b0ac3fa 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -57,6 +57,8 @@
#include <cstdint>
#include <iterator>
#include <map>
+#include <queue>
+#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3156,6 +3158,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3167,6 +3170,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8c9065aec7faa..1c42ac6016969 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,6 +70,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
+#include <queue>
#include <utility>
using namespace llvm;
@@ -488,12 +489,57 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
+ SmallVector<Value *> TempLeafNodes, LeafNodes;
+ std::map<IntrinsicInst *, bool> mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ while (!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ // iterate over extract users of deinterleave
+ for (auto UserExtract : CurrentDI->users()) {
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ // iterate over deinterleave users of extract
+ for (auto UserDI : UserExtract->users()) {
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI || Child_DI->getIntrinsicID() !=
+ Intrinsic::vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ TempLeafNodes.push_back(UserExtract);
+ TempDeadInsts.push_back(Extract);
+ } else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ }
+ // sort the deinterleaved nodes in the order that
+ // they will be extracted from the target-specific intrinsic.
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
+ TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -509,14 +555,38 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst *> IeinterleaveTreeQueue;
+ SmallVector<Value *> TempLeafNodes, LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while (!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for (unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() !=
+ Intrinsic::vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ TempLeafNodes.push_back(op);
+ }
+ }
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -537,7 +607,8 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+
+ else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 84de1ee8f8923..e704095d8f9d0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16882,16 +16882,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- const DataLayout &DL = DI->getDataLayout();
+ VectorType *VTy = (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
+ const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -16946,9 +16947,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- else
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+ llvm::Value *CurrentExtract = LeafNodes[I];
+ Value *Newextrct = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(Newextrct);
+ }
+ return true;
+ } else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16957,16 +16968,16 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
- const DataLayout &DL = II->getDataLayout();
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
+ const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -17010,9 +17021,12 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
- else
+ if (UseScalable) {
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ } else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fcdd47541be82..c4da785db8265 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -704,9 +704,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e938454b8e642..07409272b9632 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21775,8 +21775,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21784,10 +21784,13 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy =
+ (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21815,6 +21818,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+ auto CurrentExtract = LeafNodes[I];
+ Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21825,8 +21841,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21834,10 +21850,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21863,6 +21879,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b0ad9229f0b3..9f8fa2529934d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -876,10 +876,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 0000000000000..606bb93e309e1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x10
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w9, #1024 // =0x400
+; CHECK-NEXT: neg x10, x10
+; CHECK-NEXT: rdvl x11, #4
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x12, x1, x8
+; CHECK-NEXT: adds x9, x9, x10
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x2, x8
+; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x0, x8
+; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: add z16.s, z4.s, z0.s
+; CHECK-NEXT: sub z17.s, z1.s, z5.s
+; CHECK-NEXT: movprfx z18, z2
+; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT: movprfx z19, z3
+; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 0000000000000..2ea14b13265c6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: srli a3, a4, 1
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: li a5, 1024
+; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
+; CHECK-NEXT: .LBB0_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vlseg4e32.v v8, (a1)
+; CHECK-NEXT: vlseg4e32.v v16, (a2)
+; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: vsub.vv v10, v10, v18
+; CHECK-NEXT: vsll.vv v12, v12, v20
+; CHECK-NEXT: vsra.vv v14, v14, v22
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: sub a5, a5, a3
+; CHECK-NEXT: add a0, a0, a4
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a5, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
>From 5c345f9b72ecdc29d783c765257552b5cdbdf830 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Mon, 29 Apr 2024 05:03:36 +0000
Subject: [PATCH 3/7] [PatternMatch]: Add m_Interleave2 and m_Deinterleave2
matchers.
Change-Id: Id94189e601ed70c5ea922f9adbee63cf8b80829a
---
llvm/include/llvm/IR/PatternMatch.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index d9e27e087e705..65ea948f16a32 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2924,6 +2924,17 @@ inline VScaleVal_match m_VScale() {
return VScaleVal_match();
}
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_Interleave2(const Opnd0 &Op0, const Opnd1 &Op1) {
+ return m_Intrinsic<Intrinsic::vector_interleave2>(Op0, Op1);
+}
+
+template <typename Opnd>
+inline typename m_Intrinsic_Ty<Opnd>::Ty m_Deinterleave2(const Opnd &Op) {
+ return m_Intrinsic<Intrinsic::vector_deinterleave2>(Op);
+}
+
template <typename LHS, typename RHS, unsigned Opcode, bool Commutable = false>
struct LogicalOp_match {
LHS L;
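A rough usage sketch of the new matchers (mirroring how the next patch in this series
consumes them; II is the root interleave2 intrinsic and the variable names are
illustrative):

  using namespace llvm::PatternMatch;
  Value *A, *B, *C, *D;
  // An interleave tree with a factor of 4:
  // interleave2(interleave2(A, C), interleave2(B, D)).
  if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
                              m_Interleave2(m_Value(B), m_Value(D))))) {
    // A, B, C, D are the leaf operands in the order the factor-4 store lowering expects.
  }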
>From 0b4ffab948ce7c02b2ae8a0287f66b504457bcd5 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 15 May 2024 17:25:10 +0000
Subject: [PATCH 4/7] [AArch64]: Use PatternMatch to spot (de)interleave
accesses
Change-Id: Id7639dcb125a2f642b2fea78ea884b74be1c6b74
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 -
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 82 +-------
.../Target/AArch64/AArch64ISelLowering.cpp | 187 +++++++++++++-----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 -
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +---
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 --------
.../AArch64/sve-interleave_accesses4-load.ll | 106 ----------
.../RISCV/rvv/sve-deinterleave-load.ll | 74 -------
.../AArch64/sve-deinterleave4.ll | 105 ++++++++++
.../AArch64/sve-interleave4.ll | 63 ++++++
11 files changed, 315 insertions(+), 427 deletions(-)
delete mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
delete mode 100644 llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
delete mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
create mode 100644 llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
create mode 100644 llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2f6db5b0ac3fa..9d9886f4920a2 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -57,8 +57,6 @@
#include <cstdint>
#include <iterator>
#include <map>
-#include <queue>
-#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3158,7 +3156,6 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3170,7 +3167,6 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 1c42ac6016969..86f856139ff30 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,7 +70,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
-#include <queue>
#include <utility>
using namespace llvm;
@@ -489,57 +488,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
- std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
- SmallVector<Value *> TempLeafNodes, LeafNodes;
- std::map<IntrinsicInst *, bool> mp;
- SmallVector<Instruction *> TempDeadInsts;
-
- DeinterleaveTreeQueue.push(DI);
- while (!DeinterleaveTreeQueue.empty()) {
- auto CurrentDI = DeinterleaveTreeQueue.top();
- DeinterleaveTreeQueue.pop();
- TempDeadInsts.push_back(CurrentDI);
- // iterate over extract users of deinterleave
- for (auto UserExtract : CurrentDI->users()) {
- Instruction *Extract = dyn_cast<Instruction>(UserExtract);
- if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
- continue;
- bool IsLeaf = true;
- // iterate over deinterleave users of extract
- for (auto UserDI : UserExtract->users()) {
- IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
- if (!Child_DI || Child_DI->getIntrinsicID() !=
- Intrinsic::vector_deinterleave2)
- continue;
- IsLeaf = false;
- if (mp.count(Child_DI) == 0) {
- DeinterleaveTreeQueue.push(Child_DI);
- }
- continue;
- }
- if (IsLeaf) {
- TempLeafNodes.push_back(UserExtract);
- TempDeadInsts.push_back(Extract);
- } else {
- TempDeadInsts.push_back(Extract);
- }
- }
- }
- // sort the deinterleaved nodes in the order that
- // they will be extracted from the target-specific intrinsic.
- for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
-
- for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
-
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
- TempDeadInsts.rend());
+ DeadInsts.push_back(DI);
DeadInsts.push_back(LI);
return true;
}
@@ -555,38 +509,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
- std::queue<IntrinsicInst *> IeinterleaveTreeQueue;
- SmallVector<Value *> TempLeafNodes, LeafNodes;
- SmallVector<Instruction *> TempDeadInsts;
-
- IeinterleaveTreeQueue.push(II);
- while (!IeinterleaveTreeQueue.empty()) {
- auto node = IeinterleaveTreeQueue.front();
- TempDeadInsts.push_back(node);
- IeinterleaveTreeQueue.pop();
- for (unsigned i = 0; i < 2; i++) {
- auto op = node->getOperand(i);
- if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
- if (CurrentII->getIntrinsicID() !=
- Intrinsic::vector_interleave2)
- continue;
- IeinterleaveTreeQueue.push(CurrentII);
- continue;
- }
- TempLeafNodes.push_back(op);
- }
- }
- for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
- for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
+ DeadInsts.push_back(II);
return true;
}
@@ -607,8 +536,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-
- else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e704095d8f9d0..642759b606a72 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16881,18 +16881,74 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
+bool getDeinterleavedValues(Value *DI,
+ SmallVectorImpl<Value *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInsts) {
+ if (!DI->hasNUses(2))
+ return false;
+
+ // make sure that the users of DI are extractValue instructions
+ auto *Extr0 = *(++DI->user_begin());
+ if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
+ return false;
+ auto *Extr1 = *(DI->user_begin());
+ if (!match(Extr1, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
+ return false;
+
+ // each extractValue instruction is expected to have a single user,
+ // which should be another DI
+ if (!Extr0->hasOneUser() || !Extr1->hasOneUser())
+ return false;
+ auto *DI1 = *(Extr0->user_begin());
+ if (!match(DI1, m_Deinterleave2(m_Value())))
+ return false;
+ auto *DI2 = *(Extr1->user_begin());
+ if (!match(DI2, m_Deinterleave2(m_Value())))
+ return false;
+
+ if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ return false;
+
+ // Leaf nodes of the deinterleave tree
+ auto *A = *(++DI1->user_begin());
+ auto *C = *(DI1->user_begin());
+ auto *B = *(++DI2->user_begin());
+ auto *D = *(DI2->user_begin());
+
+ DeinterleavedValues.push_back(A);
+ DeinterleavedValues.push_back(B);
+ DeinterleavedValues.push_back(C);
+ DeinterleavedValues.push_back(D);
+
+ // These values will not be used anymore;
+ // DI4 will be created instead of the nested DI1 and DI2
+ DeadInsts.push_back(cast<Instruction>(DI1));
+ DeadInsts.push_back(cast<Instruction>(Extr0));
+ DeadInsts.push_back(cast<Instruction>(DI2));
+ DeadInsts.push_back(cast<Instruction>(Extr1));
+
+ return true;
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+ IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- const unsigned Factor = std::max(2, (int)LeafNodes.size());
-
- VectorType *VTy = (LeafNodes.size() > 0)
- ? cast<VectorType>(LeafNodes.front()->getType())
- : cast<VectorType>(DI->getType()->getContainedType(0));
+ SmallVector<Value *, 4> DeinterleavedValues;
+ SmallVector<Instruction *, 10> DeadInsts;
const DataLayout &DL = DI->getModule()->getDataLayout();
+ unsigned Factor = 2;
+ VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+
+ if (getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ Factor = DeinterleavedValues.size();
+ VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+ }
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factors are 2 or 4");
+
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -16903,7 +16959,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
-
VectorType *LdTy =
VectorType::get(VTy->getElementType(),
VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -16913,7 +16968,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
UseScalable, LdTy, PtrTy);
IRBuilder<> Builder(LI);
-
Value *Pred = nullptr;
if (UseScalable)
Pred =
@@ -16922,9 +16976,8 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Value *BaseAddr = LI->getPointerOperand();
Value *Result;
if (NumLoads > 1) {
- Value *Left = PoisonValue::get(VTy);
- Value *Right = PoisonValue::get(VTy);
-
+ // Create multiple legal small ldN instead of a wide one.
+ SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
for (unsigned I = 0; I < NumLoads; ++I) {
Value *Offset = Builder.getInt64(I * Factor);
@@ -16934,50 +16987,79 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
else
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
-
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- Left = Builder.CreateInsertVector(
- VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
- Right = Builder.CreateInsertVector(
- VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+ for (int J = 0; J < Factor; ++J) {
+ WideValues[J] = Builder.CreateInsertVector(
+ VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+ }
+ }
+ if (Factor == 2)
+ Result = PoisonValue::get(StructType::get(VTy, VTy));
+ else
+ Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
+ // Construct the wide result out of the small results.
+ for (int J = 0; J < Factor; ++J) {
+ Result = Builder.CreateInsertValue(Result, WideValues[J], J);
}
-
- Result = PoisonValue::get(DI->getType());
- Result = Builder.CreateInsertValue(Result, Left, 0);
- Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable) {
+ if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- if (Factor == 2) {
- DI->replaceAllUsesWith(Result);
- return true;
- }
- for (unsigned I = 0; I < LeafNodes.size(); I++) {
- llvm::Value *CurrentExtract = LeafNodes[I];
- Value *Newextrct = Builder.CreateExtractValue(Result, I);
- CurrentExtract->replaceAllUsesWith(Newextrct);
- }
- return true;
- } else
+ else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
+ if (Factor > 2) {
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+ llvm::Value *CurrentExtract = DeinterleavedValues[I];
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ cast<Instruction>(CurrentExtract)->eraseFromParent();
+ }
+ for (auto &dead : DeadInsts)
+ dead->eraseFromParent();
+ return true;
+ }
DI->replaceAllUsesWith(Result);
return true;
}
-bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
- // Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
- return false;
+bool getValuesToInterleaved(Value *II,
+ SmallVectorImpl<Value *> &ValuesToInterleave) {
+ Value *A, *B, *C, *D;
+ // Try to match interleave of Factor 4
+ if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
+ m_Interleave2(m_Value(B), m_Value(D))))) {
+ ValuesToInterleave.push_back(A);
+ ValuesToInterleave.push_back(B);
+ ValuesToInterleave.push_back(C);
+ ValuesToInterleave.push_back(D);
+ return true;
+ }
- // leaf nodes are the nodes that will be interleaved
- const unsigned Factor = LeafNodes.size();
+ // Try to match interleave of Factor 2
+ if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
+ ValuesToInterleave.push_back(A);
+ ValuesToInterleave.push_back(B);
+ return true;
+ }
+
+ return false;
+}
- VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI) const {
+ LLVM_DEBUG(dbgs() << "lowerInterleaveIntrinsicToStore\n");
+
+ SmallVector<Value *, 4> ValuesToInterleave;
+ if (!getValuesToInterleaved(II, ValuesToInterleave))
+ return false;
+ unsigned Factor = ValuesToInterleave.size();
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factors are 2 or 4");
+ VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
+
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -17006,28 +17088,25 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Pred =
Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
- Value *L = II->getOperand(0);
- Value *R = II->getOperand(1);
-
+ auto WideValues = ValuesToInterleave;
+ if (UseScalable)
+ ValuesToInterleave.push_back(Pred);
+ ValuesToInterleave.push_back(BaseAddr);
for (unsigned I = 0; I < NumStores; ++I) {
Value *Address = BaseAddr;
if (NumStores > 1) {
Value *Offset = Builder.getInt64(I * Factor);
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
-
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
- R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+ for (int J = 0; J < Factor; J++) {
+ ValuesToInterleave[J] =
+ Builder.CreateExtractVector(StTy, WideValues[J], Idx);
+ }
+ // update the address
+ ValuesToInterleave[ValuesToInterleave.size() - 1] = Address;
}
-
- if (UseScalable) {
- SmallVector<Value *> Args(LeafNodes);
- Args.push_back(Pred);
- Args.push_back(Address);
- Builder.CreateCall(StNFunc, Args);
- } else
- Builder.CreateCall(StNFunc, {L, R, Address});
+ Builder.CreateCall(StNFunc, ValuesToInterleave);
}
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c4da785db8265..fcdd47541be82 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -704,11 +704,9 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 07409272b9632..e938454b8e642 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21775,8 +21775,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21784,13 +21784,10 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- unsigned Factor = std::max(2, (int)LeafNodes.size());
+ unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy =
- (LeafNodes.size() > 0)
- ? cast<VectorType>(LeafNodes.front()->getType())
- : cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21818,19 +21815,6 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
- Ops.append({LI->getPointerOperand(), VL});
- Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
- //-----------
- if (Factor == 2) {
- DI->replaceAllUsesWith(Vlseg);
- return true;
- }
- for (unsigned I = 0; I < LeafNodes.size(); I++) {
- auto CurrentExtract = LeafNodes[I];
- Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
- CurrentExtract->replaceAllUsesWith(NewExtract);
- }
- return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21841,8 +21825,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21850,10 +21834,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- unsigned Factor = LeafNodes.size();
+ unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
+ VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21879,11 +21863,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
- SmallVector<Value *> Args(LeafNodes);
- Args.push_back(SI->getPointerOperand());
- Args.push_back(VL);
- Builder.CreateCall(VssegNFunc, Args);
- return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9f8fa2529934d..0b0ad9229f0b3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -876,12 +876,10 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
deleted file mode 100644
index 606bb93e309e1..0000000000000
--- a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
-; CHECK-LABEL: loop_xyzt:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: cntw x10
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: mov w9, #1024 // =0x400
-; CHECK-NEXT: neg x10, x10
-; CHECK-NEXT: rdvl x11, #4
-; CHECK-NEXT: .LBB0_1: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x12, x1, x8
-; CHECK-NEXT: adds x9, x9, x10
-; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
-; CHECK-NEXT: add x12, x2, x8
-; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
-; CHECK-NEXT: add x12, x0, x8
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: add z16.s, z4.s, z0.s
-; CHECK-NEXT: sub z17.s, z1.s, z5.s
-; CHECK-NEXT: movprfx z18, z2
-; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
-; CHECK-NEXT: movprfx z19, z3
-; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
-; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
-; CHECK-NEXT: b.ne .LBB0_1
-; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT: ret
-entry:
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
- %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
- %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
- %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
- %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
- %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
- %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
- %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
- %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
- %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
- %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
- %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
- %16 = add nsw <vscale x 4 x i32> %12, %5
- %17 = sub nsw <vscale x 4 x i32> %7, %14
- %18 = shl <vscale x 4 x i32> %6, %13
- %19 = ashr <vscale x 4 x i32> %8, %15
- %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
- %index.next = add nuw i64 %index, %1
- %21 = icmp eq i64 %index.next, 1024
- br i1 %21, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
diff --git a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
deleted file mode 100644
index dcade71ccb684..0000000000000
--- a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
+++ /dev/null
@@ -1,106 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
-; CHECK-LABEL: interleave:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld2w { z1.s, z2.s }, p0/z, [x1]
-; CHECK-NEXT: ld2w { z3.s, z4.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT: uzp2 z5.s, z1.s, z3.s
-; CHECK-NEXT: uzp1 z6.s, z1.s, z3.s
-; CHECK-NEXT: uzp2 z7.s, z2.s, z4.s
-; CHECK-NEXT: uzp1 z1.s, z2.s, z4.s
-; CHECK-NEXT: add z2.s, z0.s, z6.s
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: lsl z3.s, p0/m, z3.s, z0.s
-; CHECK-NEXT: sub z1.s, z1.s, z0.s
-; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z7.s
-; CHECK-NEXT: zip1 z4.s, z2.s, z3.s
-; CHECK-NEXT: zip2 z2.s, z2.s, z3.s
-; CHECK-NEXT: zip1 z5.s, z1.s, z0.s
-; CHECK-NEXT: zip2 z3.s, z1.s, z0.s
-; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
-; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #2, mul vl]
-; CHECK-NEXT: ret
- %wide.vec = load <vscale x 16 x i32>, ptr %a, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = add nsw <vscale x 4 x i32> %x, %5
- %10 = sub nsw <vscale x 4 x i32> %7, %x
- %11 = shl <vscale x 4 x i32> %6, %x
- %12 = ashr <vscale x 4 x i32> %8, %x
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %9, <vscale x 4 x i32> %11)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %10, <vscale x 4 x i32> %12)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %dst, align 4
- ret void
-}
-
-define void @wide_interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 8 x i32> %x) {
-; CHECK-LABEL: wide_interleave:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld2w { z2.s, z3.s }, p0/z, [x1]
-; CHECK-NEXT: ld2w { z4.s, z5.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT: ld2w { z6.s, z7.s }, p0/z, [x1, #4, mul vl]
-; CHECK-NEXT: ld2w { z24.s, z25.s }, p0/z, [x1, #6, mul vl]
-; CHECK-NEXT: uzp2 z26.s, z2.s, z4.s
-; CHECK-NEXT: uzp1 z27.s, z2.s, z4.s
-; CHECK-NEXT: uzp2 z28.s, z3.s, z5.s
-; CHECK-NEXT: uzp1 z2.s, z3.s, z5.s
-; CHECK-NEXT: add z3.s, z0.s, z27.s
-; CHECK-NEXT: movprfx z4, z26
-; CHECK-NEXT: lsl z4.s, p0/m, z4.s, z0.s
-; CHECK-NEXT: sub z2.s, z2.s, z0.s
-; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z28.s
-; CHECK-NEXT: zip1 z26.s, z3.s, z4.s
-; CHECK-NEXT: zip2 z3.s, z3.s, z4.s
-; CHECK-NEXT: zip1 z27.s, z2.s, z0.s
-; CHECK-NEXT: zip2 z4.s, z2.s, z0.s
-; CHECK-NEXT: uzp2 z0.s, z6.s, z24.s
-; CHECK-NEXT: uzp1 z2.s, z6.s, z24.s
-; CHECK-NEXT: st2w { z26.s, z27.s }, p0, [x0]
-; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: add z2.s, z1.s, z2.s
-; CHECK-NEXT: st2w { z3.s, z4.s }, p0, [x0, #2, mul vl]
-; CHECK-NEXT: uzp2 z3.s, z7.s, z25.s
-; CHECK-NEXT: uzp1 z4.s, z7.s, z25.s
-; CHECK-NEXT: zip1 z5.s, z2.s, z0.s
-; CHECK-NEXT: sub z4.s, z4.s, z1.s
-; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: zip2 z2.s, z2.s, z0.s
-; CHECK-NEXT: zip1 z6.s, z4.s, z1.s
-; CHECK-NEXT: zip2 z3.s, z4.s, z1.s
-; CHECK-NEXT: st2w { z5.s, z6.s }, p0, [x0, #4, mul vl]
-; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #6, mul vl]
-; CHECK-NEXT: ret
- %wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
- %root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
- %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
- %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 1
- %9 = add nsw <vscale x 8 x i32> %x, %5
- %10 = sub nsw <vscale x 8 x i32> %7, %x
- %11 = shl <vscale x 8 x i32> %6, %x
- %12 = ashr <vscale x 8 x i32> %8, %x
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %9, <vscale x 8 x i32> %11)
- %interleaved.vec61 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %10, <vscale x 8 x i32> %12)
- %interleaved.vec62 = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.vec, <vscale x 16 x i32> %interleaved.vec61)
- store <vscale x 32 x i32> %interleaved.vec62, ptr %dst, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
deleted file mode 100644
index 2ea14b13265c6..0000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
-; CHECK-LABEL: loop_xyzt:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: srli a3, a4, 1
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: li a5, 1024
-; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
-; CHECK-NEXT: .LBB0_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vlseg4e32.v v8, (a1)
-; CHECK-NEXT: vlseg4e32.v v16, (a2)
-; CHECK-NEXT: vadd.vv v8, v16, v8
-; CHECK-NEXT: vsub.vv v10, v10, v18
-; CHECK-NEXT: vsll.vv v12, v12, v20
-; CHECK-NEXT: vsra.vv v14, v14, v22
-; CHECK-NEXT: vsseg4e32.v v8, (a0)
-; CHECK-NEXT: sub a5, a5, a3
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: bnez a5, .LBB0_1
-; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: ret
-entry:
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
- %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
- %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
- %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
- %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
- %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
- %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
- %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
- %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
- %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
- %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
- %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
- %16 = add nsw <vscale x 4 x i32> %12, %5
- %17 = sub nsw <vscale x 4 x i32> %7, %14
- %18 = shl <vscale x 4 x i32> %6, %13
- %19 = ashr <vscale x 4 x i32> %8, %15
- %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
- %index.next = add nuw i64 %index, %1
- %21 = icmp eq i64 %index.next, 1024
- br i1 %21, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
new file mode 100644
index 0000000000000..d6d0e98edb3c8
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+ ret void
+}
+
+define void @wide_deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @wide_deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP11]], i64 4)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP13]], i64 4)
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
+; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP12]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP19]], <vscale x 8 x i32> [[TMP14]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP20]], <vscale x 8 x i32> [[TMP16]], 2
+; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP21]], <vscale x 8 x i32> [[TMP18]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 2
+; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 3
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 32 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1
+ ret void
+}
+
+define void @mix_deinterleave4_deinterleave2(ptr %src) {
+; CHECK-LABEL: define void @mix_deinterleave4_deinterleave2
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ %load1 = load <vscale x 8 x i32>, ptr %src, align 4
+ %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load1)
+ %ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
+ %ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
+ ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
new file mode 100644
index 0000000000000..9e38172aaeff0
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
+define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
+; CHECK-LABEL: define void @wide_interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[A]], <vscale x 8 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[B]], <vscale x 8 x i32> [[D]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 0)
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 4
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 4)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 4)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %d)
+ %interleaved.vec = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.half1, <vscale x 16 x i32> %interleaved.half2)
+ store <vscale x 32 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
+define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @mix_interleave4_interleave2
+; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst1, align 4
+
+ %interleaved = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ store <vscale x 8 x i32> %interleaved, ptr %dst2, align 4
+ ret void
+}
>From dea79e52a7c83e6b2f9ae13faa2e58492d6db3c4 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 13 Jun 2024 14:45:35 +0000
Subject: [PATCH 5/7] add DeadCodeElim pass to the RUN line
Change-Id: I2b2dc683dba21cdb6c35f407868a7537245c845e
---
.../InterleavedAccess/AArch64/sve-interleave4.ll | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
index 9e38172aaeff0..6cbd201ab36a2 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+; RUN: opt < %s -passes=interleaved-access,dce -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
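+; The dce run is added so that interleave2 calls, which become dead once the
+; store is lowered to the st4 intrinsic, are cleaned up and no longer appear in
+; the check lines below.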
define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
; CHECK-LABEL: define void @interleave4
; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
; CHECK-NEXT: ret void
;
@@ -20,8 +18,6 @@ define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
; CHECK-LABEL: define void @wide_interleave4
; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[A]], <vscale x 8 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[B]], <vscale x 8 x i32> [[D]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
@@ -46,8 +42,6 @@ define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32
define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
; CHECK-LABEL: define void @mix_interleave4_interleave2
; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
; CHECK-NEXT: ret void
>From 64dbc1e362ef6e93e7dcb47c374ccbfc02df5ca7 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 18 Jun 2024 17:10:52 +0000
Subject: [PATCH 6/7] Fix assumption of the extraction order, make it generic,
then make sure of the order using pattern matching
Change-Id: I053e47d156c37cf4d7ab5b2af83c348b4210631a
---
.../Target/AArch64/AArch64ISelLowering.cpp | 116 ++++++++++--------
1 file changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 642759b606a72..23693f496c629 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16881,68 +16881,85 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool getDeinterleavedValues(Value *DI,
- SmallVectorImpl<Value *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
- if (!DI->hasNUses(2))
+bool getDeinterleavedValues(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
+ if (!DI->hasNUsesOrMore(2))
return false;
-
- // make sure that the users of DI are extractValue instructions
- auto *Extr0 = *(++DI->user_begin());
- if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
- return false;
- auto *Extr1 = *(DI->user_begin());
- if (!match(Extr1, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
return false;
- // each extractValue instruction is expected to have a single user,
- // which should be another DI
- if (!Extr0->hasOneUser() || !Extr1->hasOneUser())
+ if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
return false;
- auto *DI1 = *(Extr0->user_begin());
- if (!match(DI1, m_Deinterleave2(m_Value())))
+ auto *DI1 = *(Extr1->user_begin());
+ auto *DI2 = *(Extr2->user_begin());
+
+ if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2))
return false;
- auto *DI2 = *(Extr1->user_begin());
- if (!match(DI2, m_Deinterleave2(m_Value())))
+ // Leaf nodes of the deinterleave tree:
+ auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
+ auto *B = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+ auto *C = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+ auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
+ // Make sure that A, B, C and D are ExtractValue instructions
+ // before getting the extract index.
+ if (!A || !B || !C || !D)
return false;
- if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ DeinterleavedValues.resize(4);
+ // Place the values into the vector in the order of extraction:
+ DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
+ DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
+ DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
+ DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
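+ // For example (an illustrative case): a leaf extract with index 1 under the
+ // root's index-0 half is first stored at slot 1 + 0*2 = 1; the swap of slots
+ // 1 and 2 below then moves it to slot 2, its lane in the factor-4 deinterleave.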
+
+ // Make sure that A,B,C,D match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0],
+ m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[1],
+ m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[2],
+ m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[3],
+ m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Deinterleave2(m_Value())))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
return false;
-
- // Leaf nodes of the deinterleave tree
- auto *A = *(++DI1->user_begin());
- auto *C = *(DI1->user_begin());
- auto *B = *(++DI2->user_begin());
- auto *D = *(DI2->user_begin());
-
- DeinterleavedValues.push_back(A);
- DeinterleavedValues.push_back(B);
- DeinterleavedValues.push_back(C);
- DeinterleavedValues.push_back(D);
-
- // These Values will not be used anymre,
- // DI4 will be created instead of nested DI1 and DI2
- DeadInsts.push_back(cast<Instruction>(DI1));
- DeadInsts.push_back(cast<Instruction>(Extr0));
- DeadInsts.push_back(cast<Instruction>(DI2));
- DeadInsts.push_back(cast<Instruction>(Extr1));
-
+ }
+ // Order the values according to the deinterleaving order.
+ std::swap(DeinterleavedValues[1], DeinterleavedValues[2]);
return true;
}
+void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
+ Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
+ match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
+ assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
+ match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
+ assert(DeadExtract != nullptr && "Match is expected to succeed");
+ DeadRoot->eraseFromParent();
+ if (DeadDeinterleave->getNumUses() == 0)
+ cast<Instruction>(DeadDeinterleave)->eraseFromParent();
+ if (DeadExtract->getNumUses() == 0)
+ cast<Instruction>(DeadExtract)->eraseFromParent();
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- SmallVector<Value *, 4> DeinterleavedValues;
- SmallVector<Instruction *, 10> DeadInsts;
+ SmallVector<Instruction *, 4> DeinterleavedValues;
const DataLayout &DL = DI->getModule()->getDataLayout();
unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- if (getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ if (getDeinterleavedValues(DI, DeinterleavedValues)) {
Factor = DeinterleavedValues.size();
VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
}
@@ -16989,7 +17006,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- for (int J = 0; J < Factor; ++J) {
+ for (unsigned J = 0; J < Factor; ++J) {
WideValues[J] = Builder.CreateInsertVector(
VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
}
@@ -16999,7 +17016,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
else
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
// Construct the wide result out of the small results.
- for (int J = 0; J < Factor; ++J) {
+ for (unsigned J = 0; J < Factor; ++J) {
Result = Builder.CreateInsertValue(Result, WideValues[J], J);
}
} else {
@@ -17009,15 +17026,14 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
if (Factor > 2) {
+ // Iterate over the old deinterleaved values to replace them with
+ // the new deinterleaved values.
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
- llvm::Value *CurrentExtract = DeinterleavedValues[I];
Value *NewExtract = Builder.CreateExtractValue(Result, I);
- CurrentExtract->replaceAllUsesWith(NewExtract);
- cast<Instruction>(CurrentExtract)->eraseFromParent();
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
}
-
- for (auto &dead : DeadInsts)
- dead->eraseFromParent();
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
+ deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
return true;
}
DI->replaceAllUsesWith(Result);
@@ -17099,7 +17115,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- for (int J = 0; J < Factor; J++) {
+ for (unsigned J = 0; J < Factor; J++) {
ValuesToInterleave[J] =
Builder.CreateExtractVector(StTy, WideValues[J], Idx);
}
>From d4ac0b5cbc13f70dd486305e6b7e910e6ed650a8 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 4 Jul 2024 16:13:06 +0000
Subject: [PATCH 7/7] Unify the logic for ld2 and ld4
add negative tests
Change-Id: Id323139f11ddc4d3a22a72af84a01e98dadfe46d
---
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 1 +
.../Target/AArch64/AArch64ISelLowering.cpp | 166 +++--
.../AArch64/fixed-deinterleave-intrinsics.ll | 66 +-
.../scalable-deinterleave-intrinsics.ll | 90 +--
.../AArch64/sve-deinterleave4.ll | 49 +-
.../AArch64/sve-interleaved-accesses.ll | 576 ++++++++++--------
6 files changed, 557 insertions(+), 391 deletions(-)
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 86f856139ff30..8c9065aec7faa 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -509,6 +509,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
// Try and match this with target specific intrinsics.
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 23693f496c629..d8da8532a45d7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16881,26 +16881,71 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool getDeinterleavedValues(
+bool getDeinterleave2Values(
Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
- if (!DI->hasNUsesOrMore(2))
+ if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
if (!Extr1 || !Extr2)
return false;
- if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
+ DeinterleavedValues.resize(2);
+ // Place the values into the vector in the order of extraction:
+ DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
+ DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
+ return false;
+
+ // Make sure that the extracted values match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0],
+ m_ExtractValue<0>((m_Specific(DI)))) ||
+ !match(DeinterleavedValues[1],
+ m_ExtractValue<1>((m_Specific(DI))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
+ return false;
+ }
+ return true;
+}
+
+/*
+Diagram of the deinterleave (DI) tree:
+                    [LOAD]
+                       |
+                     [DI]
+                  /         \
+          [Extr<0>]         [Extr<1>]
+              |                 |
+            [DI]              [DI]
+           /    \             /    \
+     [Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
+         |        |         |        |
+roots:   A        C         B        D
+roots in the correct order for DI4: A, B, C, D.
+If a pattern matches the deinterleave tree above, then we can construct DI4 out of that pattern.
+This function tries to match the deinterleave tree pattern and fetch the tree roots,
+so that in later steps they can be replaced by the output of DI4.
+*/
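+/*
+A minimal IR sketch of this shape (mirroring the sve-deinterleave4.ll test added
+in this series; value names are illustrative and struct types are abbreviated):
+  %di  = call {..} @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+  %e0  = extractvalue {..} %di, 0
+  %e1  = extractvalue {..} %di, 1
+  %di1 = call {..} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %e0)
+  %di2 = call {..} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %e1)
+  %A   = extractvalue {..} %di1, 0   ; lane 0
+  %C   = extractvalue {..} %di1, 1   ; lane 2
+  %B   = extractvalue {..} %di2, 0   ; lane 1
+  %D   = extractvalue {..} %di2, 1   ; lane 3
+*/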
+bool getDeinterleave4Values(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues, SmallVectorImpl<Instruction *> &DeadInstructions) {
+ if (!DI->hasNUses(2))
+ return false;
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
+ return false;
+
+ if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1))
return false;
auto *DI1 = *(Extr1->user_begin());
auto *DI2 = *(Extr2->user_begin());
- if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2))
+ if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
return false;
// Leaf nodes of the deinterleave tree:
auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
- auto *B = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
- auto *C = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+ auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+ auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
// Make sure that A, B, C and D are ExtractValue instructions
// before getting the extract index.
@@ -16908,44 +16953,45 @@ bool getDeinterleavedValues(
return false;
DeinterleavedValues.resize(4);
- // Place the values into the vector in the order of extraction:
- DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
- DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
- DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
- DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
+ // Place the values into the vector in the order of deinterleave4:
+ DeinterleavedValues[0x3 & ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
+ DeinterleavedValues[0x3 & ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
+ DeinterleavedValues[0x3 & ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
+ DeinterleavedValues[0x3 & ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
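+ // For example (an illustrative case): a leaf extract with index 1 under the
+ // root's index-0 half now lands directly at slot (1*2)+0 = 2, its lane in the
+ // factor-4 deinterleave, so no later reordering is needed.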
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1] || !DeinterleavedValues[2] || !DeinterleavedValues[3])
+ return false;
// Make sure that A,B,C,D match the deinterleave tree pattern
if (!match(DeinterleavedValues[0],
m_ExtractValue<0>(m_Deinterleave2(
- m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ m_ExtractValue<0>(m_Specific(DI))))) ||
!match(DeinterleavedValues[1],
- m_ExtractValue<1>(m_Deinterleave2(
- m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
- !match(DeinterleavedValues[2],
m_ExtractValue<0>(m_Deinterleave2(
- m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
+ m_ExtractValue<1>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[2],
+ m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Specific(DI))))) ||
!match(DeinterleavedValues[3],
m_ExtractValue<1>(m_Deinterleave2(
- m_ExtractValue<1>(m_Deinterleave2(m_Value())))))) {
+ m_ExtractValue<1>(m_Specific(DI)))))) {
LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
return false;
}
- // Order the values according to the deinterleaving order.
- std::swap(DeinterleavedValues[1], DeinterleavedValues[2]);
+
+ // These values will not be used anymore;
+ // DI4 will be created instead of the nested DI1 and DI2.
+ DeadInstructions.push_back(cast<Instruction>(DI1));
+ DeadInstructions.push_back(cast<Instruction>(Extr1));
+ DeadInstructions.push_back(cast<Instruction>(DI2));
+ DeadInstructions.push_back(cast<Instruction>(Extr2));
+
return true;
}
-void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
- Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
- match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
- assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
- match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
- assert(DeadExtract != nullptr && "Match is expected to succeed");
- DeadRoot->eraseFromParent();
- if (DeadDeinterleave->getNumUses() == 0)
- cast<Instruction>(DeadDeinterleave)->eraseFromParent();
- if (DeadExtract->getNumUses() == 0)
- cast<Instruction>(DeadExtract)->eraseFromParent();
+bool getDeinterleavedValues(Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues, SmallVectorImpl<Instruction *> &DeadInstructions) {
+ if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInstructions))
+ return true;
+ return getDeinterleave2Values(DI, DeinterleavedValues);
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
@@ -16955,16 +17001,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
SmallVector<Instruction *, 4> DeinterleavedValues;
+ SmallVector<Instruction *, 4> DeadInstructions;
const DataLayout &DL = DI->getModule()->getDataLayout();
- unsigned Factor = 2;
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
-
- if (getDeinterleavedValues(DI, DeinterleavedValues)) {
- Factor = DeinterleavedValues.size();
- VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+
+ if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInstructions)) {
+ LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ return false;
}
+ unsigned Factor = DeinterleavedValues.size();
assert((Factor == 2 || Factor == 4) &&
- "Currently supported Factors are 2 or 4");
+ "Currently supported Factor is 2 or 4 only");
+ VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -17025,22 +17072,33 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
- if (Factor > 2) {
- // Iterate over the old deinterleaved values to replace them with
- // the new deinterleaved values.
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
- deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
- return true;
+ // Iterate over the old deinterleaved values to replace them with
+ // the new values.
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+ cast<Instruction>(DeinterleavedValues[I])->eraseFromParent();
}
- DI->replaceAllUsesWith(Result);
+ for (auto &dead : DeadInstructions)
+ dead->eraseFromParent();
return true;
}
-bool getValuesToInterleaved(Value *II,
+/*
+Diagram of the interleave tree:
+    A   C          B   D
+     \ /            \ /
+ [Interleave]   [Interleave]
+          \        /
+        [Interleave]
+              |
+           [Store]
+values in the correct order for interleave4: A, B, C, D.
+If a pattern matches the interleave tree above, then we can construct Interleave4 out of that pattern.
+This function tries to match the interleave tree pattern and fetch the values that we want to interleave,
+so that in later steps they can be replaced by the output of Interleave4.
+*/
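+/*
+A minimal IR sketch of this shape (mirroring the sve-interleave4.ll test added
+in this series; value names are illustrative):
+  %half1 = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(
+               <vscale x 4 x i32> %A, <vscale x 4 x i32> %C)
+  %half2 = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(
+               <vscale x 4 x i32> %B, <vscale x 4 x i32> %D)
+  %res   = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(
+               <vscale x 8 x i32> %half1, <vscale x 8 x i32> %half2)
+  store <vscale x 16 x i32> %res, ptr %dst
+*/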
+bool getValuesToInterleave(Value *II,
SmallVectorImpl<Value *> &ValuesToInterleave) {
Value *A, *B, *C, *D;
// Try to match interleave of Factor 4
@@ -17065,14 +17123,18 @@ bool getValuesToInterleaved(Value *II,
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
IntrinsicInst *II, StoreInst *SI) const {
- LLVM_DEBUG(dbgs() << "lowerInterleaveIntrinsicToStore\n");
+ // Only interleave2 supported at present.
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
+ return false;
SmallVector<Value *, 4> ValuesToInterleave;
- if (!getValuesToInterleaved(II, ValuesToInterleave))
+ if (!getValuesToInterleave(II, ValuesToInterleave)) {
+ LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
+ }
unsigned Factor = ValuesToInterleave.size();
assert((Factor == 2 || Factor == 4) &&
- "Currently supported Factors are 2 or 4");
+ "Currently supported Factor is 2 or 4 only");
VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 24d624c221f46..2582bb56ceef8 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -6,28 +6,35 @@
target triple = "aarch64-linux-gnu"
-define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
-; NEON-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+define void @deinterleave_i8_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i8_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <16 x i8>, <16 x i8> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <32 x i8>, ptr %ptr, align 1
%deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
- ret { <16 x i8>, <16 x i8> } %deinterleave
+ %extract1 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 0
+ %extract2 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 1
+ ret void
}
define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
+; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -43,8 +50,9 @@ define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
+; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -60,8 +68,9 @@ define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
+; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -77,8 +86,9 @@ define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <4 x float>, <4 x float> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
+; NEON-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -94,8 +104,9 @@ define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x double>, <2 x double> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
+; NEON-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -111,8 +122,9 @@ define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
+; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -247,21 +259,9 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0
-; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]])
-; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
-; NEON-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP2]], i64 0)
-; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
-; NEON-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP4]], i64 0)
-; NEON-NEXT: [[TMP6:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 2
-; NEON-NEXT: [[LDN1:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP6]])
-; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 0
-; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
-; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
-; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
-; NEON-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
-; NEON-NEXT: [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
-; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[TMP12]]
+; NEON-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
+; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
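
For reference, the lane contract these factor-2 tests rely on: result 0 of llvm.vector.deinterleave2 holds the even-indexed elements of its operand and result 1 the odd-indexed elements, which is exactly what the two result registers of a NEON ld2 (or SVE ld2) provide. A minimal fixed-width sketch using the same <32 x i8> shape as the tests above (the function and value names here are illustrative only, not taken from the patch):

define void @even_odd_sketch(ptr %p) {
  %wide = load <32 x i8>, ptr %p, align 1
  ; struct element 0 receives lanes 0, 2, 4, ... of %wide; element 1 receives lanes 1, 3, 5, ...
  %dei = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %wide)
  %even = extractvalue { <16 x i8>, <16 x i8> } %dei, 0
  %odd = extractvalue { <16 x i8>, <16 x i8> } %dei, 1
  ret void
}

declare { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8>)
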
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 2a05718cc4161..589b43b3cc177 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -4,22 +4,27 @@
target triple = "aarch64-linux-gnu"
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2
+define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 32 x i8>, ptr %ptr, align 1
%deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
+ %extract1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 1
+ %extract2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 0
+ ret void
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i16>, ptr [[PTR]], align 2
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[DEINTERLEAVE]]
;
%load = load <vscale x 16 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
@@ -29,8 +34,9 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(pt
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x i32>, ptr [[PTR]], align 4
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]]
;
%load = load <vscale x 8 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
@@ -40,8 +46,9 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[DEINTERLEAVE]]
;
%load = load <vscale x 4 x i64>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
@@ -51,8 +58,9 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(pt
define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x float>, ptr [[PTR]], align 4
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[DEINTERLEAVE]]
;
%load = load <vscale x 8 x float>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
@@ -62,8 +70,9 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_fact
define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[DEINTERLEAVE]]
;
%load = load <vscale x 4 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
@@ -73,8 +82,9 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_f
define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[DEINTERLEAVE]]
;
%load = load <vscale x 4 x ptr>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
@@ -163,33 +173,9 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 0
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 4
-; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP11]])
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP12]], i64 8)
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP14]], i64 8)
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 6
-; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP16]])
-; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
-; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
-; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
-; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 32 x i32>, ptr [[PTR]], align 4
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[DEINTERLEAVE]]
;
%load = load <vscale x 32 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
@@ -199,21 +185,9 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_fac
define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
-; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[DEINTERLEAVE]]
;
%load = load <vscale x 8 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
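
A small reading aid for the SVE checks above: the long predicate operand spelled as shufflevector (insertelement (<vscale x N x i1> poison, i1 true, i64 0), <vscale x N x i1> poison, <vscale x N x i32> zeroinitializer) is just the constant-expression form of an all-true scalable predicate (a splat of i1 true), i.e. the same mask the older checks obtained from @llvm.aarch64.sve.ptrue with pattern 31 (SV_ALL). Written as instructions rather than a constant expression, the equivalent splat looks like this (a sketch, not taken from the tests):

define <vscale x 4 x i1> @alltrue_sketch() {
  ; build an all-true <vscale x 4 x i1> predicate by splatting i1 true across all lanes
  %head = insertelement <vscale x 4 x i1> poison, i1 true, i64 0
  %alltrue = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i1> %alltrue
}
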
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
index d6d0e98edb3c8..f4dda5209f3bb 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -10,6 +10,8 @@ define void @deinterleave4(ptr %src) {
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[SUM:%.*]] = add <vscale x 4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT: ret void
;
@@ -23,6 +25,8 @@ define void @deinterleave4(ptr %src) {
%deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
%7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
%8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+ %sum = add <vscale x 4 x i32> %5, %7
+ %sub = sub <vscale x 4 x i32> %6, %8
ret void
}
@@ -81,8 +85,8 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) {
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
-; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
-; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
; CHECK-NEXT: ret void
;
@@ -97,9 +101,46 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) {
%7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
%8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
- %load1 = load <vscale x 8 x i32>, ptr %src, align 4
- %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 8 x i32> %load1)
+ %load2 = load <vscale x 8 x i32>, ptr %src, align 4
+  %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load2)
%ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
%ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
ret void
}
+
+define void @negative_deinterleave4_test(ptr %src) {
+; CHECK-LABEL: define void @negative_deinterleave4_test
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP11]], <vscale x 8 x i32> [[TMP10]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP12]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP12]], 1
+; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF1]], 0
+; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF2]], 1
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ ret void
+}
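
The deinterleave4 / negative_deinterleave4_test pair above captures the factor-4 pattern being tested here: a two-level tree of llvm.vector.deinterleave2 whose four leaf results are all consumed through extractvalue is folded into a single @llvm.aarch64.sve.ld4.sret call, while an incomplete tree is left to the existing factor-2 handling (the negative test, which only extracts two of the four leaves, keeps its inner deinterleave2 calls, as its CHECK lines show). A minimal sketch of the complete input shape, with illustrative value names and the same nxv4i32 element type as the tests; SVE is assumed to be enabled via the function attribute:

define void @deinterleave4_sketch(ptr %src) #0 {
  %wide = load <vscale x 16 x i32>, ptr %src, align 4
  ; level 1: split the 4-way interleaved data into two halves
  %l1 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
  %h0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %l1, 0
  %h1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %l1, 1
  ; level 2: split each half again; the four leaves are fields 0/2 and 1/3 of the interleaved group
  %l2a = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %h0)
  %l2b = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %h1)
  %field0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %l2a, 0
  %field2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %l2a, 1
  %field1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %l2b, 0
  %field3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %l2b, 1
  ret void
}

declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)

attributes #0 = { "target-features"="+sve" }

When the interleaved-access pass runs on an AArch64 target with SVE available, the load and the whole deinterleave tree collapse into one ld4.sret call whose results 0..3 take over the uses of %field0..%field3, matching the CHECK lines of deinterleave4.
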
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 73f26814f3a4b..4ef9ba30c825e 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -1,36 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -interleaved-access -S | FileCheck %s
; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s
target triple = "aarch64-linux-gnu"
define void @load_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP3]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <32 x i16>, ptr %ptr, align 4
%v0 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
- i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%v1 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
- i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
ret void
}
define void @load_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor3(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
-; CHECK-NEXT: [[EXT1:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT: [[EXT2:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT: [[EXT3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <24 x i32>, ptr %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -39,18 +44,20 @@ define void @load_factor3(ptr %ptr) #0 {
}
define void @load_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor4(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x i64>, ptr %ptr, align 4
%v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -60,54 +67,64 @@ define void @load_factor4(ptr %ptr) #0 {
}
define void @store_factor2(ptr %ptr, <16 x i16> %v0, <16 x i16> %v1) #0 {
-; CHECK-LABEL: @store_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[INS1]], <vscale x 8 x i16> [[INS2]], <vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x i16> [[V0:%.*]], <16 x i16> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <16 x i16> %v0, <16 x i16> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
- i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
store <32 x i16> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_factor3(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) #0 {
-; CHECK-LABEL: @store_factor3(
-; CHECK: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[INS1]], <vscale x 4 x i32> [[INS2]], <vscale x 4 x i32> [[INS3]], <vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19,
- i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+ i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
store <24 x i32> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) #0 {
-; CHECK-LABEL: @store_factor4(
-; CHECK: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]], <4 x i64> [[V2:%.*]], <4 x i64> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x i64> [[V2]], <4 x i64> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -116,16 +133,18 @@ define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2
}
define void @load_ptrvec_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -133,19 +152,21 @@ define void @load_ptrvec_factor2(ptr %ptr) #0 {
}
define void @load_ptrvec_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor3(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <12 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
@@ -154,22 +175,24 @@ define void @load_ptrvec_factor3(ptr %ptr) #0 {
}
define void @load_ptrvec_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor4(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TOP4:%.*]] = inttoptr <4 x i64> [[EXT4]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP11]], i64 0)
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr <4 x i64> [[TMP12]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%v1 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -179,34 +202,40 @@ define void @load_ptrvec_factor4(ptr %ptr) #0 {
}
define void @store_ptrvec_factor2(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1) #0 {
-; CHECK-LABEL: @store_ptrvec_factor2(
-; CHECK-NEXT: [[TOI1:%.*]] = ptrtoint <4 x ptr> %v0 to <4 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <4 x ptr> %v1 to <4 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <4 x ptr> [[V0]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <4 x ptr> [[V1]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x ptr> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2) #0 {
-; CHECK-LABEL: @store_ptrvec_factor3(
-; CHECK: [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x ptr> %v2, <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -215,45 +244,51 @@ define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x p
}
define void @store_ptrvec_factor4(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2, <4 x ptr> %v3) #0 {
-; CHECK-LABEL: @store_ptrvec_factor4(
-; CHECK: [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]], <4 x ptr> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP10]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x ptr> %v2, <4 x ptr> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13,
- i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+ i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
store <16 x ptr> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @load_factor2_wide(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2_wide(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr [[TMP4]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[EXT1]], <4 x i64> [[EXT3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[EXT2]], <4 x i64> [[EXT4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP9]], i64 0)
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x i64>, ptr %ptr, align 4
%v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -261,20 +296,22 @@ define void @load_factor2_wide(ptr %ptr) #0 {
}
define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
-; CHECK-LABEL: @store_factor2_wide(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr [[TMP6]])
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i64> [[V0:%.*]], <8 x i64> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[TMP10]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <8 x i64> %v0, <8 x i64> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
@@ -282,9 +319,27 @@ define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
; Check that neon is used for illegal multiples of 128-bit types
define void @load_384bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_384bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_384bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[PTR]], i32 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP3]], i32 4
+; CHECK-NEXT: [[LDN2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <12 x i64>, ptr %ptr, align 4
%v0 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
%v1 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
@@ -293,9 +348,13 @@ define void @load_384bit(ptr %ptr) #0 {
; Check that neon is used for 128-bit vectors
define void @load_128bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_128bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_128bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <4 x i64>, ptr %ptr, align 4
%v0 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
%v1 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
@@ -304,8 +363,16 @@ define void @load_128bit(ptr %ptr) #0 {
; Check that correct ptrues are generated for min != max case
define void @load_min_not_max(ptr %ptr) #1 {
-; CHECK-LABEL: @load_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x i64>, ptr %ptr, align 4
%v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -313,8 +380,16 @@ define void @load_min_not_max(ptr %ptr) #1 {
}
define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
-; CHECK-LABEL: @store_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
@@ -322,8 +397,16 @@ define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
; Check that correct ptrues are generated for min > type case
define void @load_min_ge_type(ptr %ptr) #2 {
-; CHECK-LABEL: @load_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x i64>, ptr %ptr, align 4
%v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -331,25 +414,34 @@ define void @load_min_ge_type(ptr %ptr) #2 {
}
define void @store_min_ge_type(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #2 {
-; CHECK-LABEL: @store_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @load_double_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_double_factor4(
+; CHECK-LABEL: define void @load_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP7]], i64 0)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP9]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP8]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <16 x double>, ptr %ptr, align 4
@@ -361,15 +453,16 @@ define void @load_double_factor4(ptr %ptr) #0 {
}
define void @load_float_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_float_factor3(
+; CHECK-LABEL: define void @load_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP7]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP6]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <24 x float>, ptr %ptr, align 4
@@ -380,13 +473,14 @@ define void @load_float_factor3(ptr %ptr) #0 {
}
define void @load_half_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_half_factor2(
+; CHECK-LABEL: define void @load_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP5]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP4]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <32 x half>, ptr %ptr, align 4
@@ -396,13 +490,14 @@ define void @load_half_factor2(ptr %ptr) #0 {
}
define void @load_bfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_bfloat_factor2(
+; CHECK-LABEL: define void @load_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP5]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP4]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <32 x bfloat>, ptr %ptr, align 4
@@ -412,9 +507,10 @@ define void @load_bfloat_factor2(ptr %ptr) #0 {
}
define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) #0 {
-; CHECK-LABEL: @store_double_factor4(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define void @store_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x double> [[V0]], <4 x double> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x double> [[V2]], <4 x double> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP2]], i64 0)
@@ -424,7 +520,7 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP6]], i64 0)
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP8]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -435,9 +531,10 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
}
define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8 x float> %v2) #0 {
-; CHECK-LABEL: @store_float_factor3(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[V2:%.*]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-LABEL: define void @store_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <8 x float> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[V0]], <8 x float> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[V2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP2]], i64 0)
@@ -445,7 +542,7 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP6]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%s0 = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -459,13 +556,14 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
}
define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
-; CHECK-LABEL: @store_half_factor2(
+; CHECK-LABEL: define void @store_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP2]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%interleaved.vec = shufflevector <16 x half> %v0, <16 x half> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
@@ -476,13 +574,14 @@ define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1) #0 {
-; CHECK-LABEL: @store_bfloat_factor2(
+; CHECK-LABEL: define void @store_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP2]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%interleaved.vec = shufflevector <16 x bfloat> %v0, <16 x bfloat> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
@@ -494,20 +593,9 @@ define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1
; Ensure vscale_range property does not affect scalable vector types.
define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(ptr %ptr) #2 {
; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
-; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[WIDE_VEC]])
; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
;
%wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8