[llvm] [IA]: Construct (de)interleave4 out of (de)interleave2 (PR #89276)
Hassnaa Hamdi via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 8 00:36:38 PDT 2024
https://github.com/hassnaaHamdi updated https://github.com/llvm/llvm-project/pull/89276
>From 08a64946531aeb6b6113fd9901b5b1b317e3643c Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 15 May 2024 23:44:44 +0000
Subject: [PATCH 01/10] [AArch64][Interleave]: Add test precommit
Change-Id: I5e2613156a482dcadae3e4cfa1bacdf7f3293fe2
---
.../AArch64/sve-interleave_accesses4-load.ll | 106 ++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
new file mode 100644
index 00000000000000..dcade71ccb6846
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
+; CHECK-LABEL: interleave:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld2w { z1.s, z2.s }, p0/z, [x1]
+; CHECK-NEXT: ld2w { z3.s, z4.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: uzp2 z5.s, z1.s, z3.s
+; CHECK-NEXT: uzp1 z6.s, z1.s, z3.s
+; CHECK-NEXT: uzp2 z7.s, z2.s, z4.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z4.s
+; CHECK-NEXT: add z2.s, z0.s, z6.s
+; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: lsl z3.s, p0/m, z3.s, z0.s
+; CHECK-NEXT: sub z1.s, z1.s, z0.s
+; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z7.s
+; CHECK-NEXT: zip1 z4.s, z2.s, z3.s
+; CHECK-NEXT: zip2 z2.s, z2.s, z3.s
+; CHECK-NEXT: zip1 z5.s, z1.s, z0.s
+; CHECK-NEXT: zip2 z3.s, z1.s, z0.s
+; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
+; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %wide.vec = load <vscale x 16 x i32>, ptr %a, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = add nsw <vscale x 4 x i32> %x, %5
+ %10 = sub nsw <vscale x 4 x i32> %7, %x
+ %11 = shl <vscale x 4 x i32> %6, %x
+ %12 = ashr <vscale x 4 x i32> %8, %x
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %9, <vscale x 4 x i32> %11)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %10, <vscale x 4 x i32> %12)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %dst, align 4
+ ret void
+}
+
+define void @wide_interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 8 x i32> %x) {
+; CHECK-LABEL: wide_interleave:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld2w { z2.s, z3.s }, p0/z, [x1]
+; CHECK-NEXT: ld2w { z4.s, z5.s }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: ld2w { z6.s, z7.s }, p0/z, [x1, #4, mul vl]
+; CHECK-NEXT: ld2w { z24.s, z25.s }, p0/z, [x1, #6, mul vl]
+; CHECK-NEXT: uzp2 z26.s, z2.s, z4.s
+; CHECK-NEXT: uzp1 z27.s, z2.s, z4.s
+; CHECK-NEXT: uzp2 z28.s, z3.s, z5.s
+; CHECK-NEXT: uzp1 z2.s, z3.s, z5.s
+; CHECK-NEXT: add z3.s, z0.s, z27.s
+; CHECK-NEXT: movprfx z4, z26
+; CHECK-NEXT: lsl z4.s, p0/m, z4.s, z0.s
+; CHECK-NEXT: sub z2.s, z2.s, z0.s
+; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z28.s
+; CHECK-NEXT: zip1 z26.s, z3.s, z4.s
+; CHECK-NEXT: zip2 z3.s, z3.s, z4.s
+; CHECK-NEXT: zip1 z27.s, z2.s, z0.s
+; CHECK-NEXT: zip2 z4.s, z2.s, z0.s
+; CHECK-NEXT: uzp2 z0.s, z6.s, z24.s
+; CHECK-NEXT: uzp1 z2.s, z6.s, z24.s
+; CHECK-NEXT: st2w { z26.s, z27.s }, p0, [x0]
+; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: add z2.s, z1.s, z2.s
+; CHECK-NEXT: st2w { z3.s, z4.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: uzp2 z3.s, z7.s, z25.s
+; CHECK-NEXT: uzp1 z4.s, z7.s, z25.s
+; CHECK-NEXT: zip1 z5.s, z2.s, z0.s
+; CHECK-NEXT: sub z4.s, z4.s, z1.s
+; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT: zip2 z2.s, z2.s, z0.s
+; CHECK-NEXT: zip1 z6.s, z4.s, z1.s
+; CHECK-NEXT: zip2 z3.s, z4.s, z1.s
+; CHECK-NEXT: st2w { z5.s, z6.s }, p0, [x0, #4, mul vl]
+; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #6, mul vl]
+; CHECK-NEXT: ret
+ %wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
+ %root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 1
+ %9 = add nsw <vscale x 8 x i32> %x, %5
+ %10 = sub nsw <vscale x 8 x i32> %7, %x
+ %11 = shl <vscale x 8 x i32> %6, %x
+ %12 = ashr <vscale x 8 x i32> %8, %x
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %9, <vscale x 8 x i32> %11)
+ %interleaved.vec61 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %10, <vscale x 8 x i32> %12)
+ %interleaved.vec62 = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.vec, <vscale x 16 x i32> %interleaved.vec61)
+ store <vscale x 32 x i32> %interleaved.vec62, ptr %dst, align 4
+ ret void
+}
>From 571ecd3bb9df6acbc7ffab7cb3dacc82fd85a946 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 18 Apr 2024 17:30:51 +0000
Subject: [PATCH 02/10] [IA]: Construct (de)interleave4 out of (de)interleave2
- The InterleavedAccess pass is updated to spot load/store (de)interleave4-like sequences
and emit the equivalent target intrinsics (e.g. sve.ld4 or sve.st4) on targets that support
scalable vectors.
- Tests are added for targets that support scalable vectors.
Change-Id: I76ef31080ddd72b182c1a3b1752a6178dc78ea84
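For context, a minimal standalone sketch (plain C++, illustrative only, not part of the patch)
of the leaf ordering the pass computes on the store side: a factor-4 interleave is written as
interleave2(interleave2(A, C), interleave2(B, D)), so walking the tree collects the leaves as
{A, C, B, D}, and taking the even-indexed entries followed by the odd-indexed ones restores
the {A, B, C, D} operand order expected by an st4-style target intrinsic.

    // Reorders the leaves the same way the two loops in lowerInterleaveIntrinsic do.
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> TempLeafNodes = {"A", "C", "B", "D"};
      std::vector<std::string> LeafNodes;
      for (unsigned I = 0; I < TempLeafNodes.size(); I += 2) // even indices: A, B
        LeafNodes.push_back(TempLeafNodes[I]);
      for (unsigned I = 1; I < TempLeafNodes.size(); I += 2) // odd indices: C, D
        LeafNodes.push_back(TempLeafNodes[I]);
      for (const std::string &Leaf : LeafNodes)
        std::cout << Leaf << ' '; // prints: A B C D
      std::cout << '\n';
    }

The deinterleave (load) side uses the opposite order (odd-indexed leaves first), matching the
order in which the results are later extracted from the ldN-style intrinsic.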
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 83 +++++++++++++++++--
.../Target/AArch64/AArch64ISelLowering.cpp | 44 ++++++----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +++++++--
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 +++++++++++++++++
.../RISCV/rvv/sve-deinterleave-load.ll | 74 +++++++++++++++++
8 files changed, 296 insertions(+), 30 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f4920a29..2f6db5b0ac3fa2 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -57,6 +57,8 @@
#include <cstdint>
#include <iterator>
#include <map>
+#include <queue>
+#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3156,6 +3158,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3167,6 +3170,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8c9065aec7faa4..1c42ac60169695 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,6 +70,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
+#include <queue>
#include <utility>
using namespace llvm;
@@ -488,12 +489,57 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
+ SmallVector<Value *> TempLeafNodes, LeafNodes;
+ std::map<IntrinsicInst *, bool> mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ while (!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ // iterate over extract users of deinterleave
+ for (auto UserExtract : CurrentDI->users()) {
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ // iterate over deinterleave users of extract
+ for (auto UserDI : UserExtract->users()) {
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI || Child_DI->getIntrinsicID() !=
+ Intrinsic::vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ TempLeafNodes.push_back(UserExtract);
+ TempDeadInsts.push_back(Extract);
+ } else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ }
+ // sort the deinterleaved nodes in the order that
+ // they will be extracted from the target-specific intrinsic.
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
+ TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -509,14 +555,38 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst *> IeinterleaveTreeQueue;
+ SmallVector<Value *> TempLeafNodes, LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while (!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for (unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() !=
+ Intrinsic::vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ TempLeafNodes.push_back(op);
+ }
+ }
+ for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
+ for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
+ LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -537,7 +607,8 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+
+ else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf205b1706a6c9..2fb6e7cfb1d139 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16907,16 +16907,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- const DataLayout &DL = DI->getDataLayout();
+ VectorType *VTy = (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
+ const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -16971,9 +16972,19 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- else
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+ llvm::Value *CurrentExtract = LeafNodes[I];
+ Value *Newextrct = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(Newextrct);
+ }
+ return true;
+ } else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16982,16 +16993,16 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
- const DataLayout &DL = II->getDataLayout();
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
+ const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -17035,9 +17046,12 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
- else
+ if (UseScalable) {
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ } else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fcdd47541be828..c4da785db8265f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -704,9 +704,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e938454b8e6426..07409272b9632f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21775,8 +21775,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21784,10 +21784,13 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy =
+ (LeafNodes.size() > 0)
+ ? cast<VectorType>(LeafNodes.front()->getType())
+ : cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21815,6 +21818,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ for (unsigned I = 0; I < LeafNodes.size(); I++) {
+ auto CurrentExtract = LeafNodes[I];
+ Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21825,8 +21841,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21834,10 +21850,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21863,6 +21879,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args(LeafNodes);
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b0ad9229f0b35..9f8fa2529934d8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -876,10 +876,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..606bb93e309e12
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x10
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w9, #1024 // =0x400
+; CHECK-NEXT: neg x10, x10
+; CHECK-NEXT: rdvl x11, #4
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x12, x1, x8
+; CHECK-NEXT: adds x9, x9, x10
+; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x2, x8
+; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT: add x12, x0, x8
+; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: add z16.s, z4.s, z0.s
+; CHECK-NEXT: sub z17.s, z1.s, z5.s
+; CHECK-NEXT: movprfx z18, z2
+; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT: movprfx z19, z3
+; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 00000000000000..2ea14b13265c61
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: srli a3, a4, 1
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: li a5, 1024
+; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
+; CHECK-NEXT: .LBB0_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vlseg4e32.v v8, (a1)
+; CHECK-NEXT: vlseg4e32.v v16, (a2)
+; CHECK-NEXT: vadd.vv v8, v16, v8
+; CHECK-NEXT: vsub.vv v10, v10, v18
+; CHECK-NEXT: vsll.vv v12, v12, v20
+; CHECK-NEXT: vsra.vv v14, v14, v22
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: sub a5, a5, a3
+; CHECK-NEXT: add a0, a0, a4
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a5, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+ %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+ %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+ %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+ %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+ %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+ %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+ %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+ %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+ %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+ %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+ %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+ %16 = add nsw <vscale x 4 x i32> %12, %5
+ %17 = sub nsw <vscale x 4 x i32> %7, %14
+ %18 = shl <vscale x 4 x i32> %6, %13
+ %19 = ashr <vscale x 4 x i32> %8, %15
+ %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
+ %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
+ %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+ store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+ %index.next = add nuw i64 %index, %1
+ %21 = icmp eq i64 %index.next, 1024
+ br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
>From 884e12a078e9fff753dae42dbcc2ec0093a846f6 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Mon, 29 Apr 2024 05:03:36 +0000
Subject: [PATCH 03/10] [PatternMatch]: Add m_Interleave2 and m_Deinterleave2 matchers.
Change-Id: Id94189e601ed70c5ea922f9adbee63cf8b80829a
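A hedged usage sketch of the new matchers (this mirrors how the follow-up AArch64 patch uses
them; the helper name here is made up for illustration):

    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Recognise interleave4(A, B, C, D) written as
    // interleave2(interleave2(A, C), interleave2(B, D)).
    static bool matchInterleave4(Value *Root, Value *&A, Value *&B,
                                 Value *&C, Value *&D) {
      return match(Root, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
                                       m_Interleave2(m_Value(B), m_Value(D))));
    }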
---
llvm/include/llvm/IR/PatternMatch.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index d9e27e087e705b..65ea948f16a323 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2924,6 +2924,17 @@ inline VScaleVal_match m_VScale() {
return VScaleVal_match();
}
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_Interleave2(const Opnd0 &Op0, const Opnd1 &Op1) {
+ return m_Intrinsic<Intrinsic::vector_interleave2>(Op0, Op1);
+}
+
+template <typename Opnd>
+inline typename m_Intrinsic_Ty<Opnd>::Ty m_Deinterleave2(const Opnd &Op) {
+ return m_Intrinsic<Intrinsic::vector_deinterleave2>(Op);
+}
+
template <typename LHS, typename RHS, unsigned Opcode, bool Commutable = false>
struct LogicalOp_match {
LHS L;
>From 73f187f5643cdd7f27975d1648a45bf993db8e50 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 15 May 2024 17:25:10 +0000
Subject: [PATCH 04/10] [AArch64]: Use PatternMatch to spot (de)interleave
accesses
Change-Id: Id7639dcb125a2f642b2fea78ea884b74be1c6b74
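On the deinterleave side, the matching is done through the extractvalue users of each
deinterleave2 result. A hedged sketch of that check (the helper name is made up; the match
expressions are the ones used in getDeinterleavedValues below):

    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // True if V is extractvalue 0 or extractvalue 1 of a deinterleave2 result,
    // i.e. one of the two legs feeding the next level of the deinterleave tree.
    static bool isDeinterleave2Extract(Value *V) {
      return match(V, m_ExtractValue<0>(m_Deinterleave2(m_Value()))) ||
             match(V, m_ExtractValue<1>(m_Deinterleave2(m_Value())));
    }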
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 -
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 82 +-------
.../Target/AArch64/AArch64ISelLowering.cpp | 187 +++++++++++++-----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 -
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +---
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../CodeGen/AArch64/sve-deinterleave-load.ll | 78 --------
.../AArch64/sve-interleave_accesses4-load.ll | 106 ----------
.../RISCV/rvv/sve-deinterleave-load.ll | 74 -------
.../AArch64/sve-deinterleave4.ll | 105 ++++++++++
.../AArch64/sve-interleave4.ll | 63 ++++++
11 files changed, 315 insertions(+), 427 deletions(-)
delete mode 100644 llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
delete mode 100644 llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
delete mode 100644 llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
create mode 100644 llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
create mode 100644 llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2f6db5b0ac3fa2..9d9886f4920a29 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -57,8 +57,6 @@
#include <cstdint>
#include <iterator>
#include <map>
-#include <queue>
-#include <stack>
#include <string>
#include <utility>
#include <vector>
@@ -3158,7 +3156,6 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3170,7 +3167,6 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 1c42ac60169695..86f856139ff306 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -70,7 +70,6 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
-#include <queue>
#include <utility>
using namespace llvm;
@@ -489,57 +488,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
- std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
- SmallVector<Value *> TempLeafNodes, LeafNodes;
- std::map<IntrinsicInst *, bool> mp;
- SmallVector<Instruction *> TempDeadInsts;
-
- DeinterleaveTreeQueue.push(DI);
- while (!DeinterleaveTreeQueue.empty()) {
- auto CurrentDI = DeinterleaveTreeQueue.top();
- DeinterleaveTreeQueue.pop();
- TempDeadInsts.push_back(CurrentDI);
- // iterate over extract users of deinterleave
- for (auto UserExtract : CurrentDI->users()) {
- Instruction *Extract = dyn_cast<Instruction>(UserExtract);
- if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
- continue;
- bool IsLeaf = true;
- // iterate over deinterleave users of extract
- for (auto UserDI : UserExtract->users()) {
- IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
- if (!Child_DI || Child_DI->getIntrinsicID() !=
- Intrinsic::vector_deinterleave2)
- continue;
- IsLeaf = false;
- if (mp.count(Child_DI) == 0) {
- DeinterleaveTreeQueue.push(Child_DI);
- }
- continue;
- }
- if (IsLeaf) {
- TempLeafNodes.push_back(UserExtract);
- TempDeadInsts.push_back(Extract);
- } else {
- TempDeadInsts.push_back(Extract);
- }
- }
- }
- // sort the deinterleaved nodes in the order that
- // they will be extracted from the target-specific intrinsic.
- for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
-
- for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
-
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
- TempDeadInsts.rend());
+ DeadInsts.push_back(DI);
DeadInsts.push_back(LI);
return true;
}
@@ -555,38 +509,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
- std::queue<IntrinsicInst *> IeinterleaveTreeQueue;
- SmallVector<Value *> TempLeafNodes, LeafNodes;
- SmallVector<Instruction *> TempDeadInsts;
-
- IeinterleaveTreeQueue.push(II);
- while (!IeinterleaveTreeQueue.empty()) {
- auto node = IeinterleaveTreeQueue.front();
- TempDeadInsts.push_back(node);
- IeinterleaveTreeQueue.pop();
- for (unsigned i = 0; i < 2; i++) {
- auto op = node->getOperand(i);
- if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
- if (CurrentII->getIntrinsicID() !=
- Intrinsic::vector_interleave2)
- continue;
- IeinterleaveTreeQueue.push(CurrentII);
- continue;
- }
- TempLeafNodes.push_back(op);
- }
- }
- for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
- for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
- LeafNodes.push_back(TempLeafNodes[I]);
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
+ DeadInsts.push_back(II);
return true;
}
@@ -607,8 +536,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-
- else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2fb6e7cfb1d139..f0d413c5a1ca3f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16906,18 +16906,74 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
+bool getDeinterleavedValues(Value *DI,
+ SmallVectorImpl<Value *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInsts) {
+ if (!DI->hasNUses(2))
+ return false;
+
+ // Make sure that the users of DI are ExtractValue instructions
+ auto *Extr0 = *(++DI->user_begin());
+ if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
+ return false;
+ auto *Extr1 = *(DI->user_begin());
+ if (!match(Extr1, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
+ return false;
+
+ // Each ExtractValue instruction is expected to have a single user,
+ // which should be another deinterleave2 intrinsic.
+ if (!Extr0->hasOneUser() || !Extr1->hasOneUser())
+ return false;
+ auto *DI1 = *(Extr0->user_begin());
+ if (!match(DI1, m_Deinterleave2(m_Value())))
+ return false;
+ auto *DI2 = *(Extr1->user_begin());
+ if (!match(DI2, m_Deinterleave2(m_Value())))
+ return false;
+
+ if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ return false;
+
+ // Leaf nodes of the deinterleave tree
+ auto *A = *(++DI1->user_begin());
+ auto *C = *(DI1->user_begin());
+ auto *B = *(++DI2->user_begin());
+ auto *D = *(DI2->user_begin());
+
+ DeinterleavedValues.push_back(A);
+ DeinterleavedValues.push_back(B);
+ DeinterleavedValues.push_back(C);
+ DeinterleavedValues.push_back(D);
+
+ // These values will not be used anymore;
+ // DI4 will be created instead of the nested DI1 and DI2.
+ DeadInsts.push_back(cast<Instruction>(DI1));
+ DeadInsts.push_back(cast<Instruction>(Extr0));
+ DeadInsts.push_back(cast<Instruction>(DI2));
+ DeadInsts.push_back(cast<Instruction>(Extr1));
+
+ return true;
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+ IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- const unsigned Factor = std::max(2, (int)LeafNodes.size());
-
- VectorType *VTy = (LeafNodes.size() > 0)
- ? cast<VectorType>(LeafNodes.front()->getType())
- : cast<VectorType>(DI->getType()->getContainedType(0));
+ SmallVector<Value *, 4> DeinterleavedValues;
+ SmallVector<Instruction *, 10> DeadInsts;
const DataLayout &DL = DI->getModule()->getDataLayout();
+ unsigned Factor = 2;
+ VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+
+ if (getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ Factor = DeinterleavedValues.size();
+ VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+ }
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factors are 2 or 4");
+
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -16928,7 +16984,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
-
VectorType *LdTy =
VectorType::get(VTy->getElementType(),
VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -16938,7 +16993,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
UseScalable, LdTy, PtrTy);
IRBuilder<> Builder(LI);
-
Value *Pred = nullptr;
if (UseScalable)
Pred =
@@ -16947,9 +17001,8 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Value *BaseAddr = LI->getPointerOperand();
Value *Result;
if (NumLoads > 1) {
- Value *Left = PoisonValue::get(VTy);
- Value *Right = PoisonValue::get(VTy);
-
+ // Create multiple legal small ldN instead of a wide one.
+ SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
for (unsigned I = 0; I < NumLoads; ++I) {
Value *Offset = Builder.getInt64(I * Factor);
@@ -16959,50 +17012,79 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
else
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
-
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- Left = Builder.CreateInsertVector(
- VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
- Right = Builder.CreateInsertVector(
- VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+ for (int J = 0; J < Factor; ++J) {
+ WideValues[J] = Builder.CreateInsertVector(
+ VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+ }
+ }
+ if (Factor == 2)
+ Result = PoisonValue::get(StructType::get(VTy, VTy));
+ else
+ Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
+ // Construct the wide result out of the small results.
+ for (int J = 0; J < Factor; ++J) {
+ Result = Builder.CreateInsertValue(Result, WideValues[J], J);
}
-
- Result = PoisonValue::get(DI->getType());
- Result = Builder.CreateInsertValue(Result, Left, 0);
- Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable) {
+ if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
- if (Factor == 2) {
- DI->replaceAllUsesWith(Result);
- return true;
- }
- for (unsigned I = 0; I < LeafNodes.size(); I++) {
- llvm::Value *CurrentExtract = LeafNodes[I];
- Value *Newextrct = Builder.CreateExtractValue(Result, I);
- CurrentExtract->replaceAllUsesWith(Newextrct);
- }
- return true;
- } else
+ else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
+ if (Factor > 2) {
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+ llvm::Value *CurrentExtract = DeinterleavedValues[I];
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ CurrentExtract->replaceAllUsesWith(NewExtract);
+ cast<Instruction>(CurrentExtract)->eraseFromParent();
+ }
+ for (auto &dead : DeadInsts)
+ dead->eraseFromParent();
+ return true;
+ }
DI->replaceAllUsesWith(Result);
return true;
}
-bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
- // Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
- return false;
+bool getValuesToInterleaved(Value *II,
+ SmallVectorImpl<Value *> &ValuesToInterleave) {
+ Value *A, *B, *C, *D;
+ // Try to match interleave of Factor 4
+ if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
+ m_Interleave2(m_Value(B), m_Value(D))))) {
+ ValuesToInterleave.push_back(A);
+ ValuesToInterleave.push_back(B);
+ ValuesToInterleave.push_back(C);
+ ValuesToInterleave.push_back(D);
+ return true;
+ }
- // leaf nodes are the nodes that will be interleaved
- const unsigned Factor = LeafNodes.size();
+ // Try to match interleave of Factor 2
+ if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
+ ValuesToInterleave.push_back(A);
+ ValuesToInterleave.push_back(B);
+ return true;
+ }
+
+ return false;
+}
- VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI) const {
+ LLVM_DEBUG(dbgs() << "lowerInterleaveIntrinsicToStore\n");
+
+ SmallVector<Value *, 4> ValuesToInterleave;
+ if (!getValuesToInterleaved(II, ValuesToInterleave))
+ return false;
+ unsigned Factor = ValuesToInterleave.size();
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factors are 2 or 4");
+ VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
+
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -17031,28 +17113,25 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Pred =
Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
- Value *L = II->getOperand(0);
- Value *R = II->getOperand(1);
-
+ auto WideValues = ValuesToInterleave;
+ if (UseScalable)
+ ValuesToInterleave.push_back(Pred);
+ ValuesToInterleave.push_back(BaseAddr);
for (unsigned I = 0; I < NumStores; ++I) {
Value *Address = BaseAddr;
if (NumStores > 1) {
Value *Offset = Builder.getInt64(I * Factor);
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
-
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
- R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+ for (int J = 0; J < Factor; J++) {
+ ValuesToInterleave[J] =
+ Builder.CreateExtractVector(StTy, WideValues[J], Idx);
+ }
+ // update the address
+ ValuesToInterleave[ValuesToInterleave.size() - 1] = Address;
}
-
- if (UseScalable) {
- SmallVector<Value *> Args(LeafNodes);
- Args.push_back(Pred);
- Args.push_back(Address);
- Builder.CreateCall(StNFunc, Args);
- } else
- Builder.CreateCall(StNFunc, {L, R, Address});
+ Builder.CreateCall(StNFunc, ValuesToInterleave);
}
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c4da785db8265f..fcdd47541be828 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -704,11 +704,9 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 07409272b9632f..e938454b8e6426 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21775,8 +21775,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21784,13 +21784,10 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- unsigned Factor = std::max(2, (int)LeafNodes.size());
+ unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy =
- (LeafNodes.size() > 0)
- ? cast<VectorType>(LeafNodes.front()->getType())
- : cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21818,19 +21815,6 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
- Ops.append({LI->getPointerOperand(), VL});
- Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
- //-----------
- if (Factor == 2) {
- DI->replaceAllUsesWith(Vlseg);
- return true;
- }
- for (unsigned I = 0; I < LeafNodes.size(); I++) {
- auto CurrentExtract = LeafNodes[I];
- Value *NewExtract = Builder.CreateExtractValue(Vlseg, I);
- CurrentExtract->replaceAllUsesWith(NewExtract);
- }
- return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21841,8 +21825,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21850,10 +21834,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- unsigned Factor = LeafNodes.size();
+ unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
+ VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21879,11 +21863,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
- SmallVector<Value *> Args(LeafNodes);
- Args.push_back(SI->getPointerOperand());
- Args.push_back(VL);
- Builder.CreateCall(VssegNFunc, Args);
- return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9f8fa2529934d8..0b0ad9229f0b35 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -876,12 +876,10 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- SmallVector<Value *> &LeafNodes,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- SmallVector<Value *> &LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
deleted file mode 100644
index 606bb93e309e12..00000000000000
--- a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
-; CHECK-LABEL: loop_xyzt:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: cntw x10
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: mov w9, #1024 // =0x400
-; CHECK-NEXT: neg x10, x10
-; CHECK-NEXT: rdvl x11, #4
-; CHECK-NEXT: .LBB0_1: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x12, x1, x8
-; CHECK-NEXT: adds x9, x9, x10
-; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x12]
-; CHECK-NEXT: add x12, x2, x8
-; CHECK-NEXT: ld4w { z4.s - z7.s }, p0/z, [x12]
-; CHECK-NEXT: add x12, x0, x8
-; CHECK-NEXT: add x8, x8, x11
-; CHECK-NEXT: add z16.s, z4.s, z0.s
-; CHECK-NEXT: sub z17.s, z1.s, z5.s
-; CHECK-NEXT: movprfx z18, z2
-; CHECK-NEXT: lsl z18.s, p0/m, z18.s, z6.s
-; CHECK-NEXT: movprfx z19, z3
-; CHECK-NEXT: asr z19.s, p0/m, z19.s, z7.s
-; CHECK-NEXT: st4w { z16.s - z19.s }, p0, [x12]
-; CHECK-NEXT: b.ne .LBB0_1
-; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT: ret
-entry:
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
- %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
- %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
- %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
- %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
- %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
- %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
- %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
- %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
- %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
- %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
- %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
- %16 = add nsw <vscale x 4 x i32> %12, %5
- %17 = sub nsw <vscale x 4 x i32> %7, %14
- %18 = shl <vscale x 4 x i32> %6, %13
- %19 = ashr <vscale x 4 x i32> %8, %15
- %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
- %index.next = add nuw i64 %index, %1
- %21 = icmp eq i64 %index.next, 1024
- br i1 %21, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
diff --git a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll b/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
deleted file mode 100644
index dcade71ccb6846..00000000000000
--- a/llvm/test/CodeGen/AArch64/sve-interleave_accesses4-load.ll
+++ /dev/null
@@ -1,106 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define void @interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 4 x i32> %x) {
-; CHECK-LABEL: interleave:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld2w { z1.s, z2.s }, p0/z, [x1]
-; CHECK-NEXT: ld2w { z3.s, z4.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT: uzp2 z5.s, z1.s, z3.s
-; CHECK-NEXT: uzp1 z6.s, z1.s, z3.s
-; CHECK-NEXT: uzp2 z7.s, z2.s, z4.s
-; CHECK-NEXT: uzp1 z1.s, z2.s, z4.s
-; CHECK-NEXT: add z2.s, z0.s, z6.s
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: lsl z3.s, p0/m, z3.s, z0.s
-; CHECK-NEXT: sub z1.s, z1.s, z0.s
-; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z7.s
-; CHECK-NEXT: zip1 z4.s, z2.s, z3.s
-; CHECK-NEXT: zip2 z2.s, z2.s, z3.s
-; CHECK-NEXT: zip1 z5.s, z1.s, z0.s
-; CHECK-NEXT: zip2 z3.s, z1.s, z0.s
-; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
-; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #2, mul vl]
-; CHECK-NEXT: ret
- %wide.vec = load <vscale x 16 x i32>, ptr %a, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = add nsw <vscale x 4 x i32> %x, %5
- %10 = sub nsw <vscale x 4 x i32> %7, %x
- %11 = shl <vscale x 4 x i32> %6, %x
- %12 = ashr <vscale x 4 x i32> %8, %x
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %9, <vscale x 4 x i32> %11)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %10, <vscale x 4 x i32> %12)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %dst, align 4
- ret void
-}
-
-define void @wide_interleave(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, <vscale x 8 x i32> %x) {
-; CHECK-LABEL: wide_interleave:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld2w { z2.s, z3.s }, p0/z, [x1]
-; CHECK-NEXT: ld2w { z4.s, z5.s }, p0/z, [x1, #2, mul vl]
-; CHECK-NEXT: ld2w { z6.s, z7.s }, p0/z, [x1, #4, mul vl]
-; CHECK-NEXT: ld2w { z24.s, z25.s }, p0/z, [x1, #6, mul vl]
-; CHECK-NEXT: uzp2 z26.s, z2.s, z4.s
-; CHECK-NEXT: uzp1 z27.s, z2.s, z4.s
-; CHECK-NEXT: uzp2 z28.s, z3.s, z5.s
-; CHECK-NEXT: uzp1 z2.s, z3.s, z5.s
-; CHECK-NEXT: add z3.s, z0.s, z27.s
-; CHECK-NEXT: movprfx z4, z26
-; CHECK-NEXT: lsl z4.s, p0/m, z4.s, z0.s
-; CHECK-NEXT: sub z2.s, z2.s, z0.s
-; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z28.s
-; CHECK-NEXT: zip1 z26.s, z3.s, z4.s
-; CHECK-NEXT: zip2 z3.s, z3.s, z4.s
-; CHECK-NEXT: zip1 z27.s, z2.s, z0.s
-; CHECK-NEXT: zip2 z4.s, z2.s, z0.s
-; CHECK-NEXT: uzp2 z0.s, z6.s, z24.s
-; CHECK-NEXT: uzp1 z2.s, z6.s, z24.s
-; CHECK-NEXT: st2w { z26.s, z27.s }, p0, [x0]
-; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: add z2.s, z1.s, z2.s
-; CHECK-NEXT: st2w { z3.s, z4.s }, p0, [x0, #2, mul vl]
-; CHECK-NEXT: uzp2 z3.s, z7.s, z25.s
-; CHECK-NEXT: uzp1 z4.s, z7.s, z25.s
-; CHECK-NEXT: zip1 z5.s, z2.s, z0.s
-; CHECK-NEXT: sub z4.s, z4.s, z1.s
-; CHECK-NEXT: asrr z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: zip2 z2.s, z2.s, z0.s
-; CHECK-NEXT: zip1 z6.s, z4.s, z1.s
-; CHECK-NEXT: zip2 z3.s, z4.s, z1.s
-; CHECK-NEXT: st2w { z5.s, z6.s }, p0, [x0, #4, mul vl]
-; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, #6, mul vl]
-; CHECK-NEXT: ret
- %wide.vec = load <vscale x 32 x i32>, ptr %a, align 4
- %root.strided.vec = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
- %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
- %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec56, 1
- %9 = add nsw <vscale x 8 x i32> %x, %5
- %10 = sub nsw <vscale x 8 x i32> %7, %x
- %11 = shl <vscale x 8 x i32> %6, %x
- %12 = ashr <vscale x 8 x i32> %8, %x
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %9, <vscale x 8 x i32> %11)
- %interleaved.vec61 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %10, <vscale x 8 x i32> %12)
- %interleaved.vec62 = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.vec, <vscale x 16 x i32> %interleaved.vec61)
- store <vscale x 32 x i32> %interleaved.vec62, ptr %dst, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
deleted file mode 100644
index 2ea14b13265c61..00000000000000
--- a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
-
-%struct.xyzt = type { i32, i32, i32, i32 }
-
-define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
-; CHECK-LABEL: loop_xyzt:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: srli a3, a4, 1
-; CHECK-NEXT: slli a4, a4, 3
-; CHECK-NEXT: li a5, 1024
-; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
-; CHECK-NEXT: .LBB0_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vlseg4e32.v v8, (a1)
-; CHECK-NEXT: vlseg4e32.v v16, (a2)
-; CHECK-NEXT: vadd.vv v8, v16, v8
-; CHECK-NEXT: vsub.vv v10, v10, v18
-; CHECK-NEXT: vsll.vv v12, v12, v20
-; CHECK-NEXT: vsra.vv v14, v14, v22
-; CHECK-NEXT: vsseg4e32.v v8, (a0)
-; CHECK-NEXT: sub a5, a5, a3
-; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: bnez a5, .LBB0_1
-; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: ret
-entry:
- %0 = tail call i64 @llvm.vscale.i64()
- %1 = shl nuw nsw i64 %0, 2
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
- %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
- %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
- %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
- %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
- %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
- %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
- %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
- %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
- %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
- %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
- %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
- %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
- %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
- %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
- %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
- %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
- %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
- %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
- %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
- %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
- %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
- %16 = add nsw <vscale x 4 x i32> %12, %5
- %17 = sub nsw <vscale x 4 x i32> %7, %14
- %18 = shl <vscale x 4 x i32> %6, %13
- %19 = ashr <vscale x 4 x i32> %8, %15
- %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %18)
- %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %17, <vscale x 4 x i32> %19)
- %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
- store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
- %index.next = add nuw i64 %index, %1
- %21 = icmp eq i64 %index.next, 1024
- br i1 %21, label %for.cond.cleanup, label %vector.body
-
-for.cond.cleanup: ; preds = %vector.body
- ret void
-}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
new file mode 100644
index 00000000000000..d6d0e98edb3c88
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+ ret void
+}
+
+define void @wide_deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @wide_deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP11]], i64 4)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP13]], i64 4)
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
+; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP12]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP19]], <vscale x 8 x i32> [[TMP14]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP20]], <vscale x 8 x i32> [[TMP16]], 2
+; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP21]], <vscale x 8 x i32> [[TMP18]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 2
+; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 3
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 32 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1
+ ret void
+}
+
+define void @mix_deinterleave4_deinterleave2(ptr %src) {
+; CHECK-LABEL: define void @mix_deinterleave4_deinterleave2
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ %load1 = load <vscale x 8 x i32>, ptr %src, align 4
+ %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load1)
+ %ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
+ %ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
+ ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
new file mode 100644
index 00000000000000..9e38172aaeff0f
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
+define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
+; CHECK-LABEL: define void @wide_interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[A]], <vscale x 8 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[B]], <vscale x 8 x i32> [[D]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 0)
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 4
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 4)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 4)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %d)
+ %interleaved.vec = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.half1, <vscale x 16 x i32> %interleaved.half2)
+ store <vscale x 32 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
+define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @mix_interleave4_interleave2
+; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
+; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst1, align 4
+
+ %interleaved = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ store <vscale x 8 x i32> %interleaved, ptr %dst2, align 4
+ ret void
+}
>From 68e9c3051775472541260e73ce1018fa850d130e Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 13 Jun 2024 14:45:35 +0000
Subject: [PATCH 05/10] add DeadCodeElim pass to the RUN line
Change-Id: I2b2dc683dba21cdb6c35f407868a7537245c845e
---
.../InterleavedAccess/AArch64/sve-interleave4.ll | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
index 9e38172aaeff0f..6cbd201ab36a2a 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+; RUN: opt < %s -passes=interleaved-access,dce -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
; CHECK-LABEL: define void @interleave4
; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
; CHECK-NEXT: ret void
;
@@ -20,8 +18,6 @@ define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
; CHECK-LABEL: define void @wide_interleave4
; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[A]], <vscale x 8 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[B]], <vscale x 8 x i32> [[D]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
@@ -46,8 +42,6 @@ define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32
define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
; CHECK-LABEL: define void @mix_interleave4_interleave2
; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[INTERLEAVED_HALF1:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]])
-; CHECK-NEXT: [[INTERLEAVED_HALF2:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[D]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
; CHECK-NEXT: ret void
>From 94d537bfd1aa696c5e1880a4b2197f142215393d Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 18 Jun 2024 17:10:52 +0000
Subject: [PATCH 06/10] Fix assumption of the extraction order, make it generic,
then make sure of the order using pattern matching
Change-Id: I053e47d156c37cf4d7ab5b2af83c348b4210631a
---
.../Target/AArch64/AArch64ISelLowering.cpp | 116 ++++++++++--------
1 file changed, 66 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f0d413c5a1ca3f..72a461050a906f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16906,68 +16906,85 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool getDeinterleavedValues(Value *DI,
- SmallVectorImpl<Value *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
- if (!DI->hasNUses(2))
+bool getDeinterleavedValues(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
+ if (!DI->hasNUsesOrMore(2))
return false;
-
- // make sure that the users of DI are extractValue instructions
- auto *Extr0 = *(++DI->user_begin());
- if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
- return false;
- auto *Extr1 = *(DI->user_begin());
- if (!match(Extr1, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
return false;
- // each extractValue instruction is expected to have a single user,
- // which should be another DI
- if (!Extr0->hasOneUser() || !Extr1->hasOneUser())
+ if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
return false;
- auto *DI1 = *(Extr0->user_begin());
- if (!match(DI1, m_Deinterleave2(m_Value())))
+ auto *DI1 = *(Extr1->user_begin());
+ auto *DI2 = *(Extr2->user_begin());
+
+ if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2))
return false;
- auto *DI2 = *(Extr1->user_begin());
- if (!match(DI2, m_Deinterleave2(m_Value())))
+ // Leaf nodes of the deinterleave tree:
+ auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
+ auto *B = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+ auto *C = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+ auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
+ // Make sure that A, B, C and D are ExtractValue instructions
+ // before getting their extract indices
+ if (!A || !B || !C || !D)
return false;
- if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ DeinterleavedValues.resize(4);
+ // Place the values into the vector in the order of extraction:
+ DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
+ DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
+ DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
+ DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
+
+ // Make sure that A,B,C,D match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0],
+ m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[1],
+ m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[2],
+ m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
+ !match(DeinterleavedValues[3],
+ m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Deinterleave2(m_Value())))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
return false;
-
- // Leaf nodes of the deinterleave tree
- auto *A = *(++DI1->user_begin());
- auto *C = *(DI1->user_begin());
- auto *B = *(++DI2->user_begin());
- auto *D = *(DI2->user_begin());
-
- DeinterleavedValues.push_back(A);
- DeinterleavedValues.push_back(B);
- DeinterleavedValues.push_back(C);
- DeinterleavedValues.push_back(D);
-
- // These Values will not be used anymre,
- // DI4 will be created instead of nested DI1 and DI2
- DeadInsts.push_back(cast<Instruction>(DI1));
- DeadInsts.push_back(cast<Instruction>(Extr0));
- DeadInsts.push_back(cast<Instruction>(DI2));
- DeadInsts.push_back(cast<Instruction>(Extr1));
-
+ }
+ // Order the values according to the deinterleaving order.
+ std::swap(DeinterleavedValues[1], DeinterleavedValues[2]);
return true;
}
+void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
+ Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
+ match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
+ assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
+ match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
+ assert(DeadExtract != nullptr && "Match is expected to succeed");
+ DeadRoot->eraseFromParent();
+ if (DeadDeinterleave->getNumUses() == 0)
+ cast<Instruction>(DeadDeinterleave)->eraseFromParent();
+ if (DeadExtract->getNumUses() == 0)
+ cast<Instruction>(DeadExtract)->eraseFromParent();
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- SmallVector<Value *, 4> DeinterleavedValues;
- SmallVector<Instruction *, 10> DeadInsts;
+ SmallVector<Instruction *, 4> DeinterleavedValues;
const DataLayout &DL = DI->getModule()->getDataLayout();
unsigned Factor = 2;
VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- if (getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ if (getDeinterleavedValues(DI, DeinterleavedValues)) {
Factor = DeinterleavedValues.size();
VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
}
@@ -17014,7 +17031,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- for (int J = 0; J < Factor; ++J) {
+ for (unsigned J = 0; J < Factor; ++J) {
WideValues[J] = Builder.CreateInsertVector(
VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
}
@@ -17024,7 +17041,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
else
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
// Construct the wide result out of the small results.
- for (int J = 0; J < Factor; ++J) {
+ for (unsigned J = 0; J < Factor; ++J) {
Result = Builder.CreateInsertValue(Result, WideValues[J], J);
}
} else {
@@ -17034,15 +17051,14 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
if (Factor > 2) {
+ // Iterate over the old deinterleaved values to replace them with
+ // the new deinterleaved values.
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
- llvm::Value *CurrentExtract = DeinterleavedValues[I];
Value *NewExtract = Builder.CreateExtractValue(Result, I);
- CurrentExtract->replaceAllUsesWith(NewExtract);
- cast<Instruction>(CurrentExtract)->eraseFromParent();
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
}
-
- for (auto &dead : DeadInsts)
- dead->eraseFromParent();
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
+ deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
return true;
}
DI->replaceAllUsesWith(Result);
@@ -17124,7 +17140,7 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- for (int J = 0; J < Factor; J++) {
+ for (unsigned J = 0; J < Factor; J++) {
ValuesToInterleave[J] =
Builder.CreateExtractVector(StTy, WideValues[J], Idx);
}
>From 85d159b06273aff46cceebaad88074c28cfb3adb Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 4 Jul 2024 16:13:06 +0000
Subject: [PATCH 07/10] Unify the logic for ld2 and ld4
add negative tests
Change-Id: Id323139f11ddc4d3a22a72af84a01e98dadfe46d
---
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 1 +
.../Target/AArch64/AArch64ISelLowering.cpp | 182 ++++--
.../AArch64/fixed-deinterleave-intrinsics.ll | 66 +-
.../scalable-deinterleave-intrinsics.ll | 90 +--
.../AArch64/sve-deinterleave4.ll | 49 +-
.../AArch64/sve-interleaved-accesses.ll | 576 ++++++++++--------
6 files changed, 567 insertions(+), 397 deletions(-)
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 86f856139ff306..8c9065aec7faa4 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -509,6 +509,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
// Try and match this with target specific intrinsics.
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 72a461050a906f..9029464752d340 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16906,26 +16906,71 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool getDeinterleavedValues(
+bool getDeinterleave2Values(
Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
- if (!DI->hasNUsesOrMore(2))
+ if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
if (!Extr1 || !Extr2)
return false;
- if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
+ DeinterleavedValues.resize(2);
+ // Place the values into the vector in the order of extraction:
+ DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
+ DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
+ return false;
+
+ // Make sure that the extracted values match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) ||
+ !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
+ return false;
+ }
+ return true;
+}
+
+/*
+Diagram of the DI (deinterleave) tree:
+                    [LOAD]
+                       |
+                     [DI]
+                    /    \
+         [Extr<0>]          [Extr<1>]
+             |                   |
+           [DI]                [DI]
+          /     \            /     \
+    [Extr<0>] [Extr<1>] [Extr<0>] [Extr<1>]
+        |         |         |         |
+roots:  A         C         B         D
+The roots in the correct order for DI4 are: A B C D.
+If a pattern matches the deinterleave tree above, we can construct DI4 out of
+that pattern. This function tries to match the deinterleave tree pattern and
+fetches the tree roots, so that in later steps they can be replaced by the
+output of DI4.
+*/
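+// For example, the factor-4 pattern for <vscale x 16 x i32> looks roughly like
+// this (illustrative IR only; value names are placeholders):
+//   %di  = call {<vscale x 8 x i32>, <vscale x 8 x i32>}
+//            @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
+//   %e0  = extractvalue {<vscale x 8 x i32>, <vscale x 8 x i32>} %di, 0
+//   %e1  = extractvalue {<vscale x 8 x i32>, <vscale x 8 x i32>} %di, 1
+//   %di0 = call {<vscale x 4 x i32>, <vscale x 4 x i32>}
+//            @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %e0)
+//   %di1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>}
+//            @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %e1)
+// where A/C are the extractvalues of %di0 at indices 0/1 and B/D are the
+// extractvalues of %di1 at indices 0/1.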
+bool getDeinterleave4Values(Value *DI,
+ SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInstructions) {
+ if (!DI->hasNUses(2))
+ return false;
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
+ return false;
+
+ if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1))
return false;
auto *DI1 = *(Extr1->user_begin());
auto *DI2 = *(Extr2->user_begin());
- if (!DI1->hasNUsesOrMore(2) || !DI2->hasNUsesOrMore(2))
+ if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
return false;
// Leaf nodes of the deinterleave tree:
auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
- auto *B = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
- auto *C = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+ auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+ auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
// Make sure that A, B, C and D are ExtractValue instructions
// before getting their extract indices
@@ -16933,44 +16978,48 @@ bool getDeinterleavedValues(
return false;
DeinterleavedValues.resize(4);
- // Place the values into the vector in the order of extraction:
- DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
- DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
- DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
- DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
+ // Place the values into the vector in the order of deinterleave4:
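+ // The lane for each leaf is (inner extract index) * 2 + (outer extract
+ // index), e.g. a leaf extracted at index 1 from the half that was itself
+ // extracted at index 1 of DI lands in lane 3. The 0x3 mask only guards
+ // against out-of-range indices; malformed trees are rejected by the
+ // pattern match below.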
+ DeinterleavedValues[0x3 &
+ ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
+ DeinterleavedValues[0x3 &
+ ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
+ DeinterleavedValues[0x3 &
+ ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
+ DeinterleavedValues[0x3 &
+ ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1] ||
+ !DeinterleavedValues[2] || !DeinterleavedValues[3])
+ return false;
// Make sure that A,B,C,D match the deinterleave tree pattern
- if (!match(DeinterleavedValues[0],
- m_ExtractValue<0>(m_Deinterleave2(
- m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
- !match(DeinterleavedValues[1],
- m_ExtractValue<1>(m_Deinterleave2(
- m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
- !match(DeinterleavedValues[2],
- m_ExtractValue<0>(m_Deinterleave2(
- m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
- !match(DeinterleavedValues[3],
- m_ExtractValue<1>(m_Deinterleave2(
- m_ExtractValue<1>(m_Deinterleave2(m_Value())))))) {
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Specific(DI)))))) {
LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
return false;
}
- // Order the values according to the deinterleaving order.
- std::swap(DeinterleavedValues[1], DeinterleavedValues[2]);
+
+ // These values will not be used anymore;
+ // DI4 will be created instead of the nested DI1 and DI2.
+ DeadInstructions.push_back(cast<Instruction>(DI1));
+ DeadInstructions.push_back(cast<Instruction>(Extr1));
+ DeadInstructions.push_back(cast<Instruction>(DI2));
+ DeadInstructions.push_back(cast<Instruction>(Extr2));
+
return true;
}
-void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
- Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
- match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
- assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
- match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
- assert(DeadExtract != nullptr && "Match is expected to succeed");
- DeadRoot->eraseFromParent();
- if (DeadDeinterleave->getNumUses() == 0)
- cast<Instruction>(DeadDeinterleave)->eraseFromParent();
- if (DeadExtract->getNumUses() == 0)
- cast<Instruction>(DeadExtract)->eraseFromParent();
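+// Try to match the factor-4 deinterleave tree first; if that fails, fall back
+// to a plain factor-2 deinterleave.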
+bool getDeinterleavedValues(Value *DI,
+ SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInstructions) {
+ if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInstructions))
+ return true;
+ return getDeinterleave2Values(DI, DeinterleavedValues);
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
@@ -16980,16 +17029,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
SmallVector<Instruction *, 4> DeinterleavedValues;
+ SmallVector<Instruction *, 4> DeadInstructions;
const DataLayout &DL = DI->getModule()->getDataLayout();
- unsigned Factor = 2;
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- if (getDeinterleavedValues(DI, DeinterleavedValues)) {
- Factor = DeinterleavedValues.size();
- VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+ if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInstructions)) {
+ LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ return false;
}
+ unsigned Factor = DeinterleavedValues.size();
assert((Factor == 2 || Factor == 4) &&
- "Currently supported Factors are 2 or 4");
+ "Currently supported Factor is 2 or 4 only");
+ VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -17050,23 +17100,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
- if (Factor > 2) {
- // Iterate over the old deinterleaved values to replace them with
- // the new deinterleaved values.
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- }
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
- deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
- return true;
+ // Iterate over the old deinterleaved values to replace them with
+ // the new values.
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+ cast<Instruction>(DeinterleavedValues[I])->eraseFromParent();
}
- DI->replaceAllUsesWith(Result);
+ for (auto &dead : DeadInstructions)
+ dead->eraseFromParent();
return true;
}
-bool getValuesToInterleaved(Value *II,
- SmallVectorImpl<Value *> &ValuesToInterleave) {
+/*
+Diagram of the interleave tree:
+     A     C           B     D
+      \   /             \   /
+  [Interleave]       [Interleave]
+           \            /
+           [Interleave]
+                 |
+              [Store]
+The values in the correct order for Interleave4 are: A B C D.
+If a pattern matches the interleave tree above, we can construct Interleave4
+out of that pattern. This function tries to match the interleave tree pattern
+and fetches the values to interleave, so that in later steps they can be
+replaced by the output of Interleave4.
+*/
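+// For example, the factor-4 store pattern for four <vscale x 4 x i32> operands
+// looks roughly like this (illustrative IR only; value names are placeholders):
+//   %half1 = call <vscale x 8 x i32>
+//       @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %A, <vscale x 4 x i32> %C)
+//   %half2 = call <vscale x 8 x i32>
+//       @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %B, <vscale x 4 x i32> %D)
+//   %vec   = call <vscale x 16 x i32>
+//       @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %half1, <vscale x 8 x i32> %half2)
+//   store <vscale x 16 x i32> %vec, ptr %dst, align 4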
+bool getValuesToInterleave(Value *II,
+ SmallVectorImpl<Value *> &ValuesToInterleave) {
Value *A, *B, *C, *D;
// Try to match interleave of Factor 4
if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
@@ -17090,14 +17152,18 @@ bool getValuesToInterleaved(Value *II,
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
IntrinsicInst *II, StoreInst *SI) const {
- LLVM_DEBUG(dbgs() << "lowerInterleaveIntrinsicToStore\n");
+ // Only interleave2 supported at present.
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
+ return false;
SmallVector<Value *, 4> ValuesToInterleave;
- if (!getValuesToInterleaved(II, ValuesToInterleave))
+ if (!getValuesToInterleave(II, ValuesToInterleave)) {
+ LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
+ }
unsigned Factor = ValuesToInterleave.size();
assert((Factor == 2 || Factor == 4) &&
- "Currently supported Factors are 2 or 4");
+ "Currently supported Factor is 2 or 4 only");
VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 24d624c221f46b..2582bb56ceef80 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -6,28 +6,35 @@
target triple = "aarch64-linux-gnu"
-define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
-; NEON-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+define void @deinterleave_i8_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i8_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <16 x i8>, <16 x i8> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <32 x i8>, ptr %ptr, align 1
%deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
- ret { <16 x i8>, <16 x i8> } %deinterleave
+ %extract1 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 0
+ %extract2 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 1
+ ret void
}
define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
+; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -43,8 +50,9 @@ define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
+; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -60,8 +68,9 @@ define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
+; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -77,8 +86,9 @@ define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <4 x float>, <4 x float> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
+; NEON-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -94,8 +104,9 @@ define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x double>, <2 x double> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
+; NEON-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -111,8 +122,9 @@ define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[LDN]]
+; NEON-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
+; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
@@ -247,21 +259,9 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0
-; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]])
-; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
-; NEON-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP2]], i64 0)
-; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
-; NEON-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP4]], i64 0)
-; NEON-NEXT: [[TMP6:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 2
-; NEON-NEXT: [[LDN1:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP6]])
-; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 0
-; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
-; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
-; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
-; NEON-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
-; NEON-NEXT: [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
-; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[TMP12]]
+; NEON-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
+; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
+; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
;
; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 2a05718cc4161c..589b43b3cc1778 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -4,22 +4,27 @@
target triple = "aarch64-linux-gnu"
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2
+define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 32 x i8>, ptr %ptr, align 1
%deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
+ %extract1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 1
+ %extract2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 0
+ ret void
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i16>, ptr [[PTR]], align 2
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[DEINTERLEAVE]]
;
%load = load <vscale x 16 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
@@ -29,8 +34,9 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(pt
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x i32>, ptr [[PTR]], align 4
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]]
;
%load = load <vscale x 8 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
@@ -40,8 +46,9 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[DEINTERLEAVE]]
;
%load = load <vscale x 4 x i64>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
@@ -51,8 +58,9 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(pt
define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x float>, ptr [[PTR]], align 4
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[DEINTERLEAVE]]
;
%load = load <vscale x 8 x float>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
@@ -62,8 +70,9 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_fact
define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[DEINTERLEAVE]]
;
%load = load <vscale x 4 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
@@ -73,8 +82,9 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_f
define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[DEINTERLEAVE]]
;
%load = load <vscale x 4 x ptr>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
@@ -163,33 +173,9 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 0
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 4
-; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP11]])
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP12]], i64 8)
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP14]], i64 8)
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 6
-; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP16]])
-; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
-; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
-; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
-; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
-; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 32 x i32>, ptr [[PTR]], align 4
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[DEINTERLEAVE]]
;
%load = load <vscale x 32 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
@@ -199,21 +185,9 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_fac
define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
-; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[LOAD]])
+; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[DEINTERLEAVE]]
;
%load = load <vscale x 8 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
index d6d0e98edb3c88..f4dda5209f3bb7 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -10,6 +10,8 @@ define void @deinterleave4(ptr %src) {
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[SUM:%.*]] = add <vscale x 4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT: ret void
;
@@ -23,6 +25,8 @@ define void @deinterleave4(ptr %src) {
%deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
%7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
%8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+ %sum = add <vscale x 4 x i32> %5, %7
+ %sub = sub <vscale x 4 x i32> %6, %8
ret void
}
@@ -81,8 +85,8 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) {
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
-; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
-; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
; CHECK-NEXT: ret void
;
@@ -97,9 +101,46 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) {
%7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
%8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
- %load1 = load <vscale x 8 x i32>, ptr %src, align 4
- %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 8 x i32> %load1)
+ %load2 = load <vscale x 8 x i32>, ptr %src, align 4
+ %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load2)
%ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
%ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
ret void
}
+
+define void @negative_deinterleave4_test(ptr %src) {
+; CHECK-LABEL: define void @negative_deinterleave4_test
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP11]], <vscale x 8 x i32> [[TMP10]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP12]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP12]], 1
+; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF1]], 0
+; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF2]], 1
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 73f26814f3a4bc..4ef9ba30c825e6 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -1,36 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -interleaved-access -S | FileCheck %s
; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s
target triple = "aarch64-linux-gnu"
define void @load_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP3]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <32 x i16>, ptr %ptr, align 4
%v0 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
- i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%v1 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
- i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
ret void
}
define void @load_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor3(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
-; CHECK-NEXT: [[EXT1:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT: [[EXT2:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT: [[EXT3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <24 x i32>, ptr %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -39,18 +44,20 @@ define void @load_factor3(ptr %ptr) #0 {
}
define void @load_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor4(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x i64>, ptr %ptr, align 4
%v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -60,54 +67,64 @@ define void @load_factor4(ptr %ptr) #0 {
}
define void @store_factor2(ptr %ptr, <16 x i16> %v0, <16 x i16> %v1) #0 {
-; CHECK-LABEL: @store_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[INS1]], <vscale x 8 x i16> [[INS2]], <vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x i16> [[V0:%.*]], <16 x i16> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <16 x i16> %v0, <16 x i16> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
- i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
store <32 x i16> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_factor3(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) #0 {
-; CHECK-LABEL: @store_factor3(
-; CHECK: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[INS1]], <vscale x 4 x i32> [[INS2]], <vscale x 4 x i32> [[INS3]], <vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19,
- i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+ i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
store <24 x i32> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) #0 {
-; CHECK-LABEL: @store_factor4(
-; CHECK: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]], <4 x i64> [[V2:%.*]], <4 x i64> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x i64> [[V2]], <4 x i64> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -116,16 +133,18 @@ define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2
}
define void @load_ptrvec_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -133,19 +152,21 @@ define void @load_ptrvec_factor2(ptr %ptr) #0 {
}
define void @load_ptrvec_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor3(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <12 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
@@ -154,22 +175,24 @@ define void @load_ptrvec_factor3(ptr %ptr) #0 {
}
define void @load_ptrvec_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor4(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TOP4:%.*]] = inttoptr <4 x i64> [[EXT4]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP11]], i64 0)
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr <4 x i64> [[TMP12]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%v1 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -179,34 +202,40 @@ define void @load_ptrvec_factor4(ptr %ptr) #0 {
}
define void @store_ptrvec_factor2(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1) #0 {
-; CHECK-LABEL: @store_ptrvec_factor2(
-; CHECK-NEXT: [[TOI1:%.*]] = ptrtoint <4 x ptr> %v0 to <4 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <4 x ptr> %v1 to <4 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <4 x ptr> [[V0]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <4 x ptr> [[V1]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x ptr> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2) #0 {
-; CHECK-LABEL: @store_ptrvec_factor3(
-; CHECK: [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x ptr> %v2, <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -215,45 +244,51 @@ define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x p
}
define void @store_ptrvec_factor4(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2, <4 x ptr> %v3) #0 {
-; CHECK-LABEL: @store_ptrvec_factor4(
-; CHECK: [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]], <4 x ptr> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP10]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x ptr> %v2, <4 x ptr> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13,
- i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+ i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
store <16 x ptr> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @load_factor2_wide(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2_wide(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr [[TMP4]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[EXT1]], <4 x i64> [[EXT3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[EXT2]], <4 x i64> [[EXT4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP9]], i64 0)
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x i64>, ptr %ptr, align 4
%v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -261,20 +296,22 @@ define void @load_factor2_wide(ptr %ptr) #0 {
}
define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
-; CHECK-LABEL: @store_factor2_wide(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr [[TMP6]])
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i64> [[V0:%.*]], <8 x i64> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[TMP10]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <8 x i64> %v0, <8 x i64> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
@@ -282,9 +319,27 @@ define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
; Check that neon is used for illegal multiples of 128-bit types
define void @load_384bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_384bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_384bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[PTR]], i32 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP3]], i32 4
+; CHECK-NEXT: [[LDN2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <12 x i64>, ptr %ptr, align 4
%v0 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
%v1 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
@@ -293,9 +348,13 @@ define void @load_384bit(ptr %ptr) #0 {
; Check that neon is used for 128-bit vectors
define void @load_128bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_128bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_128bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <4 x i64>, ptr %ptr, align 4
%v0 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
%v1 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
@@ -304,8 +363,16 @@ define void @load_128bit(ptr %ptr) #0 {
; Check that correct ptrues are generated for min != max case
define void @load_min_not_max(ptr %ptr) #1 {
-; CHECK-LABEL: @load_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x i64>, ptr %ptr, align 4
%v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -313,8 +380,16 @@ define void @load_min_not_max(ptr %ptr) #1 {
}
define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
-; CHECK-LABEL: @store_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
@@ -322,8 +397,16 @@ define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
; Check that correct ptrues are generated for min > type case
define void @load_min_ge_type(ptr %ptr) #2 {
-; CHECK-LABEL: @load_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x i64>, ptr %ptr, align 4
%v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -331,25 +414,34 @@ define void @load_min_ge_type(ptr %ptr) #2 {
}
define void @store_min_ge_type(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #2 {
-; CHECK-LABEL: @store_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @load_double_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_double_factor4(
+; CHECK-LABEL: define void @load_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP7]], i64 0)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP9]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP8]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <16 x double>, ptr %ptr, align 4
@@ -361,15 +453,16 @@ define void @load_double_factor4(ptr %ptr) #0 {
}
define void @load_float_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_float_factor3(
+; CHECK-LABEL: define void @load_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP7]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP6]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <24 x float>, ptr %ptr, align 4
@@ -380,13 +473,14 @@ define void @load_float_factor3(ptr %ptr) #0 {
}
define void @load_half_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_half_factor2(
+; CHECK-LABEL: define void @load_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP5]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP4]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <32 x half>, ptr %ptr, align 4
@@ -396,13 +490,14 @@ define void @load_half_factor2(ptr %ptr) #0 {
}
define void @load_bfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_bfloat_factor2(
+; CHECK-LABEL: define void @load_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP5]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP4]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <32 x bfloat>, ptr %ptr, align 4
@@ -412,9 +507,10 @@ define void @load_bfloat_factor2(ptr %ptr) #0 {
}
define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) #0 {
-; CHECK-LABEL: @store_double_factor4(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define void @store_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x double> [[V0]], <4 x double> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x double> [[V2]], <4 x double> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP2]], i64 0)
@@ -424,7 +520,7 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP6]], i64 0)
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP8]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -435,9 +531,10 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
}
define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8 x float> %v2) #0 {
-; CHECK-LABEL: @store_float_factor3(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[V2:%.*]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-LABEL: define void @store_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <8 x float> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[V0]], <8 x float> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[V2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP2]], i64 0)
@@ -445,7 +542,7 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP6]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%s0 = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -459,13 +556,14 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
}
define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
-; CHECK-LABEL: @store_half_factor2(
+; CHECK-LABEL: define void @store_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP2]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%interleaved.vec = shufflevector <16 x half> %v0, <16 x half> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
@@ -476,13 +574,14 @@ define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1) #0 {
-; CHECK-LABEL: @store_bfloat_factor2(
+; CHECK-LABEL: define void @store_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP2]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%interleaved.vec = shufflevector <16 x bfloat> %v0, <16 x bfloat> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
@@ -494,20 +593,9 @@ define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1
; Ensure vscale_range property does not affect scalable vector types.
define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(ptr %ptr) #2 {
; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
-; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[WIDE_VEC]])
; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
;
%wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
>From 7b13365b812f86fa1bb5d1b3c672b311b9ff322b Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 7 Aug 2024 10:16:33 +0000
Subject: [PATCH 08/10] Resolve review comments: remove dead instructions of the
 interleave tree; code improvements.
Change-Id: I2020bf850c987829bf6e2255425eacee9361a980
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 +-
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 8 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 127 +++++++-------
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 6 +-
.../AArch64/fixed-deinterleave-intrinsics.ll | 156 +++++++++++-------
.../scalable-deinterleave-intrinsics.ll | 150 +++++++++++------
.../AArch64/sve-deinterleave4.ll | 20 +--
.../AArch64/sve-interleave4.ll | 2 +-
.../AArch64/sve-interleaved-accesses.ll | 24 ++-
11 files changed, 311 insertions(+), 200 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f4920a29..5f4ed131c7e51c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3156,7 +3156,8 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+ LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
return false;
}
@@ -3167,7 +3168,8 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+ StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInstructions) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8c9065aec7faa4..f466a5f2891596 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -489,7 +489,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeadInsts))
return false;
// We now have a target-specific load, so delete the old one.
@@ -510,13 +510,15 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+ SmallVector<Instruction *, 32> InterleaveDeadInsts;
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
return true;
}
@@ -537,7 +539,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9029464752d340..50b149397b8129 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16907,7 +16907,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool getDeinterleave2Values(
- Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInsts) {
if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
@@ -16928,13 +16929,13 @@ bool getDeinterleave2Values(
LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
return false;
}
+  // DeinterleavedValues will be replaced by the output of ld2.
+ DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(), DeinterleavedValues.end());
return true;
}
/*
-Diagram for DI tree.
- [LOAD]
- |
+DeinterleaveIntrinsic tree:
[DI]
/ \
[Extr<0>] [Extr<1>]
@@ -16944,15 +16945,14 @@ Diagram for DI tree.
[Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
| | | |
roots: A C B D
-roots in correct order of DI4: A B C D.
-If there is a pattern matches the deinterleave tree above, then we can construct
-DI4 out of that pattern. This function tries to match the deinterleave tree
-pattern, and fetch the tree roots, so that in further steps they can be replaced
-by the output of DI4.
+roots in the order a DI4 would produce them: A, B, C, D.
+Returns true if `DI` is the top of an IR tree that represents a theoretical vector.deinterleave4 intrinsic.
+When true is returned, the `DeinterleavedValues` vector is populated with the results such an intrinsic would return
+(i.e. {A, B, C, D} = vector.deinterleave4(...)).
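+
+For illustration only, a minimal IR shape that matches this tree; the types and
+value names below are examples, not taken from the tests:
+  %di  = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %wide)
+  %e0  = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %di, 0
+  %e1  = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %di, 1
+  %di1 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %e0)
+  %di2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %e1)
+  %A   = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %di1, 0
+  %C   = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %di1, 1
+  %B   = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %di2, 0
+  %D   = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %di2, 1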
*/
bool getDeinterleave4Values(Value *DI,
SmallVectorImpl<Instruction *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInstructions) {
+ SmallVectorImpl<Instruction *> &DeadInsts) {
if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
@@ -16960,7 +16960,7 @@ bool getDeinterleave4Values(Value *DI,
if (!Extr1 || !Extr2)
return false;
- if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1))
+ if (!Extr1->hasOneUse() || !Extr2->hasOneUse())
return false;
auto *DI1 = *(Extr1->user_begin());
auto *DI2 = *(Extr2->user_begin());
@@ -16972,8 +16972,7 @@ bool getDeinterleave4Values(Value *DI,
auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
- // Make sure that the A,B,C,D are instructions of ExtractValue,
- // before getting the extract index
+  // Make sure A, B, C and D are ExtractValue instructions before getting the extract index.
if (!A || !B || !C || !D)
return false;
@@ -17004,35 +17003,35 @@ bool getDeinterleave4Values(Value *DI,
return false;
}
- // These Values will not be used anymre,
+ // These Values will not be used anymore,
// DI4 will be created instead of nested DI1 and DI2
- DeadInstructions.push_back(cast<Instruction>(DI1));
- DeadInstructions.push_back(cast<Instruction>(Extr1));
- DeadInstructions.push_back(cast<Instruction>(DI2));
- DeadInstructions.push_back(cast<Instruction>(Extr2));
+ DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(), DeinterleavedValues.end());
+ DeadInsts.push_back(cast<Instruction>(DI1));
+ DeadInsts.push_back(cast<Instruction>(Extr1));
+ DeadInsts.push_back(cast<Instruction>(DI2));
+ DeadInsts.push_back(cast<Instruction>(Extr2));
return true;
}
bool getDeinterleavedValues(Value *DI,
SmallVectorImpl<Instruction *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInstructions) {
- if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInstructions))
+ SmallVectorImpl<Instruction *> &DeadInsts) {
+ if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInsts))
return true;
- return getDeinterleave2Values(DI, DeinterleavedValues);
+ return getDeinterleave2Values(DI, DeinterleavedValues, DeadInsts);
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, LoadInst *LI, SmallVectorImpl<Instruction *> &DeadInsts) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
SmallVector<Instruction *, 4> DeinterleavedValues;
- SmallVector<Instruction *, 4> DeadInstructions;
const DataLayout &DL = DI->getModule()->getDataLayout();
- if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInstructions)) {
+ if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}
@@ -17042,13 +17041,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
bool UseScalable;
- if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+ if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
+ DeadInsts.clear();
return false;
+ }
// TODO: Add support for using SVE instructions with fixed types later, using
// the code from lowerInterleavedLoad to obtain the correct container type.
- if (UseScalable && !VTy->isScalableTy())
+ if (UseScalable && !VTy->isScalableTy()) {
+ DeadInsts.clear();
return false;
+ }
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
VectorType *LdTy =
@@ -17066,10 +17069,9 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
- Value *Result;
if (NumLoads > 1) {
- // Create multiple legal small ldN instead of a wide one.
- SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
+ // Create multiple legal small ldN.
+ SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
for (unsigned I = 0; I < NumLoads; ++I) {
Value *Offset = Builder.getInt64(I * Factor);
@@ -17082,53 +17084,45 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
for (unsigned J = 0; J < Factor; ++J) {
- WideValues[J] = Builder.CreateInsertVector(
- VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+ ExtractedLdValues[J] = Builder.CreateInsertVector(
+ VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
}
+ LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- if (Factor == 2)
- Result = PoisonValue::get(StructType::get(VTy, VTy));
- else
- Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
- // Construct the wide result out of the small results.
- for (unsigned J = 0; J < Factor; ++J) {
- Result = Builder.CreateInsertValue(Result, WideValues[J], J);
- }
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
+ for (unsigned J = 0; J < Factor; ++J)
+ DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
} else {
+ Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+ }
}
- // Itereate over old deinterleaved values to replace it by
- // the new values.
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
- cast<Instruction>(DeinterleavedValues[I])->eraseFromParent();
- }
- for (auto &dead : DeadInstructions)
- dead->eraseFromParent();
return true;
}
/*
-Diagram for Interleave tree.
+InterleaveIntrinsic tree.
A C B D
\ / \ /
- [Interleave] [Interleave]
+ [II] [II]
\ /
- [Interleave]
- |
- [Store]
+ [II]
+
values in correct order of interleave4: A B C D.
-If there is a pattern matches the interleave tree above, then we can construct
-Interleave4 out of that pattern. This function tries to match the interleave
-tree pattern, and fetch the values that we want to interleave, so that in
-further steps they can be replaced by the output of Inteleave4.
+Returns true if `II` is the root of an IR tree that represents a theoretical vector.interleave4 intrinsic.
+When true is returned, `ValuesToInterleave` vector is populated with the inputs such an intrinsic would take:
+(i.e. vector.interleave4(A, B, C, D)).
*/
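[Editor's note: a minimal IR sketch of the tree this helper matches, i.e. interleave2(interleave2(A, C), interleave2(B, D)); element types and value names are illustrative:

    %lo = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %A, <vscale x 4 x i32> %C)
    %hi = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %B, <vscale x 4 x i32> %D)
    %II = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %lo, <vscale x 8 x i32> %hi)
    store <vscale x 16 x i32> %II, ptr %dst

ValuesToInterleave is populated in the order A, B, C, D, and the two inner interleave2 calls (%lo, %hi) are the intermediate instructions pushed to DeadInsts.]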
bool getValuesToInterleave(Value *II,
- SmallVectorImpl<Value *> &ValuesToInterleave) {
+ SmallVectorImpl<Value *> &ValuesToInterleave,
+ SmallVectorImpl<Instruction *> &DeadInsts) {
Value *A, *B, *C, *D;
// Try to match interleave of Factor 4
if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
@@ -17137,6 +17131,11 @@ bool getValuesToInterleave(Value *II,
ValuesToInterleave.push_back(B);
ValuesToInterleave.push_back(C);
ValuesToInterleave.push_back(D);
+ // intermediate II will not be needed anymore
+ Value *II1, *II2;
+ assert(match(II, m_Interleave2(m_Value(II1), m_Value(II2))) && "II tree is expected");
+ DeadInsts.push_back(cast<Instruction>(II1));
+ DeadInsts.push_back(cast<Instruction>(II2));
return true;
}
@@ -17151,13 +17150,13 @@ bool getValuesToInterleave(Value *II,
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, StoreInst *SI, SmallVectorImpl<Instruction *> &DeadInsts) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
SmallVector<Value *, 4> ValuesToInterleave;
- if (!getValuesToInterleave(II, ValuesToInterleave)) {
+ if (!getValuesToInterleave(II, ValuesToInterleave, DeadInsts)) {
LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
}
@@ -17168,13 +17167,17 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
- if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+ if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
+ DeadInsts.clear();
return false;
+ }
// TODO: Add support for using SVE instructions with fixed types later, using
// the code from lowerInterleavedStore to obtain the correct container type.
- if (UseScalable && !VTy->isScalableTy())
+ if (UseScalable && !VTy->isScalableTy()) {
+ DeadInsts.clear();
return false;
+ }
unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fcdd47541be828..a405d1c5d80ad1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -704,10 +704,12 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const override;
+ LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const override;
+ StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e938454b8e6426..765016433dc2c1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21776,7 +21776,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+ LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21826,7 +21827,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+ StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInstructions) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b0ad9229f0b35..2060314e096047 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -877,10 +877,12 @@ class RISCVTargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
- LoadInst *LI) const override;
+ LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const override;
+ StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 2582bb56ceef80..09e2c53465cd73 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -29,112 +29,142 @@ define void @deinterleave_i8_factor2(ptr %ptr) {
ret void
}
-define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
-; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+define void @deinterleave_i16_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
-; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <16 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %load)
- ret { <8 x i16>, <8 x i16> } %deinterleave
+ %extract1 = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 0
+ %extract2 = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 1
+ ret void
}
-define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
-; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+define void @deinterleave_8xi32_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_8xi32_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
-; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_8xi32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <8 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %load)
- ret { <4 x i32>, <4 x i32> } %deinterleave
+ %extract1 = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 0
+ %extract2 = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 1
+ ret void
}
-define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+define void @deinterleave_i64_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i64_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
-; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <4 x i64>, ptr %ptr, align 8
%deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> %load)
- ret { <2 x i64>, <2 x i64> } %deinterleave
+ %extract1 = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 0
+ %extract2 = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 1
+ ret void
}
-define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
-; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+define void @deinterleave_float_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_float_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
-; NEON-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <8 x float>, ptr %ptr, align 4
%deinterleave = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %load)
- ret { <4 x float>, <4 x float> } %deinterleave
+ %extract1 = extractvalue { <4 x float>, <4 x float> } %deinterleave, 0
+ %extract2 = extractvalue { <4 x float>, <4 x float> } %deinterleave, 1
+ ret void
}
-define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+define void @deinterleave_double_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_double_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
-; NEON-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <4 x double>, ptr %ptr, align 8
%deinterleave = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %load)
- ret { <2 x double>, <2 x double> } %deinterleave
+ %extract1 = extractvalue { <2 x double>, <2 x double> } %deinterleave, 0
+ %extract2 = extractvalue { <2 x double>, <2 x double> } %deinterleave, 1
+ ret void
}
-define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+define void @deinterleave_ptr_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_ptr_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
-; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <4 x ptr>, ptr %ptr, align 8
%deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> %load)
- ret { <2 x ptr>, <2 x ptr> } %deinterleave
+ %extract1 = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 0
+ %extract2 = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 1
+ ret void
}
define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
@@ -256,22 +286,36 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
ret void
}
-define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
-; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
+; NEON-LABEL: define void @deinterleave_wide_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
-; NEON-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
-; NEON-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
-; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
+; NEON-NEXT: [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0
+; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]])
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
+; NEON-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP2]], i64 0)
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
+; NEON-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP4]], i64 0)
+; NEON-NEXT: [[TMP6:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 2
+; NEON-NEXT: [[LDN1:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP6]])
+; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 0
+; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
+; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_wide_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <32 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> %load)
- ret { <16 x i16>, <16 x i16> } %deinterleave
+ %extract1 = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 0
+ %extract2 = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 1
+ ret void
}
define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 589b43b3cc1778..e5b56eb54f927d 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -19,76 +19,94 @@ define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
ret void
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2
+define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi16_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 16 x i16>, ptr [[PTR]], align 2
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 16 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
+ %extract1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 1
+ ret void
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2
+define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nx8xi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x i32>, ptr [[PTR]], align 4
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 8 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
+ %extract1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 1
+ ret void
}
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2
+define void @deinterleave_nxi64_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi64_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[PTR]], align 8
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 4 x i64>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
- ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
+ %extract1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 1
+ ret void
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2
+define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxfloat_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x float>, ptr [[PTR]], align 4
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 8 x float>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
+ %extract1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 1
+ ret void
}
-define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2
+define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x double>, ptr [[PTR]], align 8
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 4 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
- ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
+ %extract1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 1
+ ret void
}
-define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2
+define void @deinterleave_nxptr_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxptr_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[PTR]], align 8
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 4 x ptr>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
- ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
+ %extract1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave, 1
+ ret void
}
define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x 16 x i8> %r) #0 {
@@ -170,28 +188,64 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
;;; Check that we 'legalize' operations that are wider than the target supports.
-define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2
+define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 32 x i32>, ptr [[PTR]], align 4
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 4
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP11]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP12]], i64 8)
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP14]], i64 8)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[LDN3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP16]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
+; CHECK-NEXT: ret void
;
%load = load <vscale x 32 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
- ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
+ %extract1 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave, 1
+ ret void
}
-define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2
+define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
-; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[LOAD]])
-; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[DEINTERLEAVE]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT: ret void
;
%load = load <vscale x 8 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
- ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
+ %extract1 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 1
+ ret void
}
define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l, <vscale x 4 x double> %r) #0 {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
index f4dda5209f3bb7..995075f1784a62 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -53,14 +53,6 @@ define void @wide_deinterleave4(ptr %src) {
; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
-; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP12]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP19]], <vscale x 8 x i32> [[TMP14]], 1
-; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP20]], <vscale x 8 x i32> [[TMP16]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP21]], <vscale x 8 x i32> [[TMP18]], 3
-; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 2
-; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 3
; CHECK-NEXT: ret void
;
%load = load <vscale x 32 x i32>, ptr %src, align 4
@@ -123,14 +115,10 @@ define void @negative_deinterleave4_test(ptr %src) {
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP11]], <vscale x 8 x i32> [[TMP10]], 1
-; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP12]], 1
-; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP13]])
-; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF1]], 0
-; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF2]], 1
+; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF1]], 0
+; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF2]], 1
; CHECK-NEXT: ret void
;
%load = load <vscale x 16 x i32>, ptr %src, align 4
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
index 6cbd201ab36a2a..ba9bff093678c2 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=interleaved-access,dce -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 4ef9ba30c825e6..8821255a86b2f8 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -591,16 +591,28 @@ define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1
}
; Ensure vscale_range property does not affect scalable vector types.
-define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(ptr %ptr) #2 {
-; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(
+define void @deinterleave_nxptr_factor2(ptr %ptr) #2 {
+; CHECK-LABEL: define void @deinterleave_nxptr_factor2(
; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[WIDE_VEC]])
-; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
+; CHECK-NEXT: [[LDN2:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT: ret void
;
%wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
%ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
- ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
+ %extract1 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %ldN, 0
+ %extract2 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %ldN, 1
+ ret void
}
attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
>From b67e3f7a679e216ad47d0b71fe0364724532801b Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Wed, 7 Aug 2024 10:23:35 +0000
Subject: [PATCH 09/10] code format
Change-Id: I5d1cada783bb3da0a5d51fd10c98428d63db9c13
---
llvm/include/llvm/CodeGen/TargetLowering.h | 12 +++---
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 3 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 38 +++++++++++--------
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 12 +++---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 +++---
llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 +++---
6 files changed, 49 insertions(+), 40 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 5f4ed131c7e51c..69fc4741ebebc4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3155,9 +3155,9 @@ class TargetLoweringBase {
///
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
- virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI,
- SmallVectorImpl<Instruction *> &DeadInsts) const {
+ virtual bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
return false;
}
@@ -3167,9 +3167,9 @@ class TargetLoweringBase {
///
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
- virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI,
- SmallVectorImpl<Instruction *> &DeadInstructions) const {
+ virtual bool lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInstructions) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index f466a5f2891596..b983ee91e175e8 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -518,7 +518,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
DeadInsts.push_back(II);
- DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
+ DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
+ InterleaveDeadInsts.end());
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 50b149397b8129..0ef46e3c037890 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16906,9 +16906,9 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool getDeinterleave2Values(
- Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
+bool getDeinterleave2Values(Value *DI,
+ SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeadInsts) {
if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
@@ -16930,7 +16930,8 @@ bool getDeinterleave2Values(
return false;
}
// DeinterleavedValues will be replace by output of ld2
- DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(), DeinterleavedValues.end());
+ DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(),
+ DeinterleavedValues.end());
return true;
}
@@ -16946,9 +16947,10 @@ DeinterleaveIntrinsic tree:
| | | |
roots: A C B D
roots in correct order of DI4 will be: A B C D.
-Returns true if `DI` is the top of an IR tree that represents a theoretical vector.deinterleave4 intrinsic.
-When true is returned, `DeinterleavedValues` vector is populated with the results such an intrinsic would return:
-(i.e. {A, B, C, D } = vector.deinterleave4(...))
+Returns true if `DI` is the top of an IR tree that represents a theoretical
+vector.deinterleave4 intrinsic. When true is returned, `DeinterleavedValues`
+vector is populated with the results such an intrinsic would return: (i.e. {A,
+B, C, D } = vector.deinterleave4(...))
*/
bool getDeinterleave4Values(Value *DI,
SmallVectorImpl<Instruction *> &DeinterleavedValues,
@@ -16972,7 +16974,8 @@ bool getDeinterleave4Values(Value *DI,
auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
- // Make sure that the A,B,C and D are ExtractValue instructions before getting the extract index
+ // Make sure that the A,B,C and D are ExtractValue instructions before getting
+ // the extract index
if (!A || !B || !C || !D)
return false;
@@ -17005,7 +17008,8 @@ bool getDeinterleave4Values(Value *DI,
// These Values will not be used anymore,
// DI4 will be created instead of nested DI1 and DI2
- DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(), DeinterleavedValues.end());
+ DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(),
+ DeinterleavedValues.end());
DeadInsts.push_back(cast<Instruction>(DI1));
DeadInsts.push_back(cast<Instruction>(Extr1));
DeadInsts.push_back(cast<Instruction>(DI2));
@@ -17023,7 +17027,8 @@ bool getDeinterleavedValues(Value *DI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI, SmallVectorImpl<Instruction *> &DeadInsts) const {
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
@@ -17116,9 +17121,10 @@ InterleaveIntrinsic tree.
[II]
values in correct order of interleave4: A B C D.
-Returns true if `II` is the root of an IR tree that represents a theoretical vector.interleave4 intrinsic.
-When true is returned, `ValuesToInterleave` vector is populated with the inputs such an intrinsic would take:
-(i.e. vector.interleave4(A, B, C, D)).
+Returns true if `II` is the root of an IR tree that represents a theoretical
+vector.interleave4 intrinsic. When true is returned, `ValuesToInterleave` vector
+is populated with the inputs such an intrinsic would take: (i.e.
+vector.interleave4(A, B, C, D)).
*/
bool getValuesToInterleave(Value *II,
SmallVectorImpl<Value *> &ValuesToInterleave,
@@ -17133,7 +17139,8 @@ bool getValuesToInterleave(Value *II,
ValuesToInterleave.push_back(D);
// intermediate II will not be needed anymore
Value *II1, *II2;
- assert(match(II, m_Interleave2(m_Value(II1), m_Value(II2))) && "II tree is expected");
+ assert(match(II, m_Interleave2(m_Value(II1), m_Value(II2))) &&
+ "II tree is expected");
DeadInsts.push_back(cast<Instruction>(II1));
DeadInsts.push_back(cast<Instruction>(II2));
return true;
@@ -17150,7 +17157,8 @@ bool getValuesToInterleave(Value *II,
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI, SmallVectorImpl<Instruction *> &DeadInsts) const {
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index a405d1c5d80ad1..b2c511b4917bc1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -703,13 +703,13 @@ class AArch64TargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI,
- SmallVectorImpl<Instruction *> &DeadInsts) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
- bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI,
- SmallVectorImpl<Instruction *> &DeadInsts) const override;
+ bool lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 765016433dc2c1..fc2372b7705aa8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21775,9 +21775,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI,
- SmallVectorImpl<Instruction *> &DeadInsts) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21826,9 +21826,9 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI,
- SmallVectorImpl<Instruction *> &DeadInstructions) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInstructions) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 2060314e096047..57c4d308e395d9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -876,13 +876,13 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
- LoadInst *LI,
- SmallVectorImpl<Instruction *> &DeadInsts) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *II, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
- bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI,
- SmallVectorImpl<Instruction *> &DeadInsts) const override;
+ bool lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool supportKCFIBundles() const override { return true; }
>From 6753d0e77ab29e51f075c89cfea263e791328d73 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Thu, 8 Aug 2024 07:32:31 +0000
Subject: [PATCH 10/10] Resolve review comments
Update the DeadInsts vector only when the transformation succeeds
Change-Id: I920357d090960b84c46648666eca033033f209a3
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 2 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 88 +++++++++----------
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +-
4 files changed, 46 insertions(+), 48 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 69fc4741ebebc4..1560d6a5711f17 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3169,7 +3169,7 @@ class TargetLoweringBase {
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(
IntrinsicInst *II, StoreInst *SI,
- SmallVectorImpl<Instruction *> &DeadInstructions) const {
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
return false;
}
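The only interface change in this hunk is the parameter rename to DeadInsts for consistency with the load-side hook. A minimal sketch of a conforming target override follows; MyTargetLowering and matchInterleaveTree are hypothetical names, not from the patch. The contract the later AArch64 changes rely on is that DeadInsts is appended to only once lowering is certain to happen:

  // Hypothetical override, sketch only.
  bool MyTargetLowering::lowerInterleaveIntrinsicToStore(
      IntrinsicInst *II, StoreInst *SI,
      SmallVectorImpl<Instruction *> &DeadInsts) const {
    SmallVector<Instruction *, 4> LocalDead;
    if (!matchInterleaveTree(II, LocalDead)) // hypothetical matcher
      return false; // bail out without touching DeadInsts
    // ... emit the target's structured store here ...
    DeadInsts.append(LocalDead.begin(), LocalDead.end());
    return true;
  }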
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index b983ee91e175e8..ef11713892a53d 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -510,7 +510,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
- SmallVector<Instruction *, 32> InterleaveDeadInsts;
+ SmallVector<Instruction *, 4> InterleaveDeadInsts;
// Try and match this with target specific intrinsics.
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
return false;
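On the caller side, the pass collects the target-reported dead instructions in a local vector and acts on them only after the hook succeeds; the inline capacity presumably shrinks from 32 to 4 because a factor-4 tree contributes just a handful of intermediate instructions. Roughly (sketch only; the surrounding lines are not part of this hunk):

  SmallVector<Instruction *, 4> InterleaveDeadInsts;
  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
    return false;
  // Only on success are the store, the interleave root, and the contents of
  // InterleaveDeadInsts queued in the pass's dead-instruction bookkeeping.
  return true;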
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0ef46e3c037890..5b183890e49e66 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16906,9 +16906,9 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool getDeinterleave2Values(Value *DI,
- SmallVectorImpl<Instruction *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
+bool getDeinterleave2Values(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
@@ -16930,8 +16930,9 @@ bool getDeinterleave2Values(Value *DI,
return false;
}
// DeinterleavedValues will be replace by output of ld2
- DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(),
- DeinterleavedValues.end());
+ DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
+ DeinterleavedValues.begin(),
+ DeinterleavedValues.end());
return true;
}
@@ -16952,9 +16953,9 @@ vector.deinterleave4 intrinsic. When true is returned, `DeinterleavedValues`
vector is populated with the results such an intrinsic would return: (i.e. {A,
B, C, D } = vector.deinterleave4(...))
*/
-bool getDeinterleave4Values(Value *DI,
- SmallVectorImpl<Instruction *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
+bool getDeinterleave4Values(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
if (!DI->hasNUses(2))
return false;
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
@@ -17008,22 +17009,23 @@ bool getDeinterleave4Values(Value *DI,
// These Values will not be used anymore,
// DI4 will be created instead of nested DI1 and DI2
- DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(),
- DeinterleavedValues.end());
- DeadInsts.push_back(cast<Instruction>(DI1));
- DeadInsts.push_back(cast<Instruction>(Extr1));
- DeadInsts.push_back(cast<Instruction>(DI2));
- DeadInsts.push_back(cast<Instruction>(Extr2));
+ DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
+ DeinterleavedValues.begin(),
+ DeinterleavedValues.end());
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(DI1));
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr1));
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(DI2));
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr2));
return true;
}
-bool getDeinterleavedValues(Value *DI,
- SmallVectorImpl<Instruction *> &DeinterleavedValues,
- SmallVectorImpl<Instruction *> &DeadInsts) {
- if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInsts))
+bool getDeinterleavedValues(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+ if (getDeinterleave4Values(DI, DeinterleavedValues, DeInterleaveDeadInsts))
return true;
- return getDeinterleave2Values(DI, DeinterleavedValues, DeadInsts);
+ return getDeinterleave2Values(DI, DeinterleavedValues, DeInterleaveDeadInsts);
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
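To make the lane bookkeeping behind getDeinterleave4Values explicit: the root deinterleave2 separates even and odd lanes of the wide vector, and the two nested deinterleave2 calls split those halves again, so each of the four leaves reads the wide vector with a stride of four. A small helper capturing this (illustrative only, not part of the patch):

  // For result Res (0..3, i.e. A, B, C, D) of the matched factor-4
  // deinterleave, element Elt originates from this lane of the wide vector:
  //   root deinterleave2: evens {0,2,4,...} and odds {1,3,5,...}
  //   nested deinterleave2 of evens: A = {0,4,8,...}, C = {2,6,10,...}
  //   nested deinterleave2 of odds:  B = {1,5,9,...}, D = {3,7,11,...}
  static unsigned deinterleave4SourceLane(unsigned Res, unsigned Elt) {
    return 4 * Elt + Res;
  }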
@@ -17034,9 +17036,9 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
SmallVector<Instruction *, 4> DeinterleavedValues;
- const DataLayout &DL = DI->getModule()->getDataLayout();
+ SmallVector<Instruction *, 4> DeInterleaveDeadInsts;
- if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
+ if (!getDeinterleavedValues(DI, DeinterleavedValues, DeInterleaveDeadInsts)) {
LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
return false;
}
@@ -17045,18 +17047,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
"Currently supported Factor is 2 or 4 only");
VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+ const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
- if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
- DeadInsts.clear();
+ if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
- }
// TODO: Add support for using SVE instructions with fixed types later, using
// the code from lowerInterleavedLoad to obtain the correct container type.
- if (UseScalable && !VTy->isScalableTy()) {
- DeadInsts.clear();
+ if (UseScalable && !VTy->isScalableTy())
return false;
- }
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
VectorType *LdTy =
@@ -17094,7 +17093,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
- // Replcae output of deinterleave2 intrinsic by output of ldN2/ldN4
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
for (unsigned J = 0; J < Factor; ++J)
DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
} else {
@@ -17103,12 +17102,14 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
- // Replcae output of deinterleave2 intrinsic by output of ldN2/ldN4
+ // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
Value *NewExtract = Builder.CreateExtractValue(Result, I);
DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
}
}
+ DeadInsts.insert(DeadInsts.end(), DeInterleaveDeadInsts.begin(),
+ DeInterleaveDeadInsts.end());
return true;
}
@@ -17126,9 +17127,9 @@ vector.interleave4 intrinsic. When true is returned, `ValuesToInterleave` vector
is populated with the inputs such an intrinsic would take: (i.e.
vector.interleave4(A, B, C, D)).
*/
-bool getValuesToInterleave(Value *II,
- SmallVectorImpl<Value *> &ValuesToInterleave,
- SmallVectorImpl<Instruction *> &DeadInsts) {
+bool getValuesToInterleave(
+ Value *II, SmallVectorImpl<Value *> &ValuesToInterleave,
+ SmallVectorImpl<Instruction *> &InterleaveDeadInsts) {
Value *A, *B, *C, *D;
// Try to match interleave of Factor 4
if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
@@ -17138,11 +17139,10 @@ bool getValuesToInterleave(Value *II,
ValuesToInterleave.push_back(C);
ValuesToInterleave.push_back(D);
// intermediate II will not be needed anymore
- Value *II1, *II2;
- assert(match(II, m_Interleave2(m_Value(II1), m_Value(II2))) &&
- "II tree is expected");
- DeadInsts.push_back(cast<Instruction>(II1));
- DeadInsts.push_back(cast<Instruction>(II2));
+ InterleaveDeadInsts.push_back(
+ cast<Instruction>(cast<Instruction>(II)->getOperand(0)));
+ InterleaveDeadInsts.push_back(
+ cast<Instruction>(cast<Instruction>(II)->getOperand(1)));
return true;
}
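The rewrite above drops the redundant re-match: once the outer factor-4 match has succeeded, the two operands of II are known to be the intermediate interleave2 calls, so they can be recorded directly. Sketch of the dead set collected here for the factor-4 store path (the root interleave2 and the store itself are presumably cleaned up by the caller rather than in this helper):

  auto *Root = cast<Instruction>(II);
  // The intermediate interleave2(A, C) and interleave2(B, D):
  InterleaveDeadInsts.push_back(cast<Instruction>(Root->getOperand(0)));
  InterleaveDeadInsts.push_back(cast<Instruction>(Root->getOperand(1)));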
@@ -17164,7 +17164,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
return false;
SmallVector<Value *, 4> ValuesToInterleave;
- if (!getValuesToInterleave(II, ValuesToInterleave, DeadInsts)) {
+ SmallVector<Instruction *, 4> InterleaveDeadInsts;
+ if (!getValuesToInterleave(II, ValuesToInterleave, InterleaveDeadInsts)) {
LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
return false;
}
@@ -17175,17 +17176,13 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
- if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
- DeadInsts.clear();
+ if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
- }
// TODO: Add support for using SVE instructions with fixed types later, using
// the code from lowerInterleavedStore to obtain the correct container type.
- if (UseScalable && !VTy->isScalableTy()) {
- DeadInsts.clear();
+ if (UseScalable && !VTy->isScalableTy())
return false;
- }
unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
@@ -17226,7 +17223,8 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
}
Builder.CreateCall(StNFunc, ValuesToInterleave);
}
-
+ DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
+ InterleaveDeadInsts.end());
return true;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index fc2372b7705aa8..b55018bd9b2eff 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21828,7 +21828,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
IntrinsicInst *II, StoreInst *SI,
- SmallVectorImpl<Instruction *> &DeadInstructions) const {
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);