[llvm] [IA] Relax the requirement of having ExtractValue users on deinterleave intrinsic (PR #148716)

Min-Yih Hsu via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 16 11:16:41 PDT 2025


https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/148716

From 40ed9ae5d8c8c32fc7896bcc76d11d10f99d8f49 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Fri, 11 Jul 2025 16:16:16 -0700
Subject: [PATCH 1/5] [IA] Do not require ExtractValueInst as users for
 deinterleave intrinsics

---
 llvm/include/llvm/Analysis/VectorUtils.h      |  20 ++++
 llvm/include/llvm/CodeGen/TargetLowering.h    |   8 +-
 llvm/lib/Analysis/VectorUtils.cpp             |  19 +++
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    |  32 ++---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  30 ++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   6 +-
 .../Target/RISCV/RISCVInterleavedAccess.cpp   |  52 ++++-----
 .../rvv/fixed-vectors-deinterleave-load.ll    |  53 +++++++++
 .../RISCV/rvv/vp-vector-interleaved-access.ll | 109 ++++++++++++++++++
 .../AArch64/fixed-deinterleave-intrinsics.ll  |  32 ++---
 .../scalable-deinterleave-intrinsics.ll       |  36 +++---
 .../AArch64/sve-deinterleave4.ll              |  27 +++--
 .../AArch64/sve-interleaved-accesses.ll       |   4 +
 14 files changed, 314 insertions(+), 118 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index af1e0d7251a4f..8d7407949b3f0 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -23,6 +23,7 @@
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
+class IntrinsicInst;
 class TargetLibraryInfo;
 
 /// The Vector Function Database.
@@ -188,6 +189,25 @@ LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID);
 /// Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
 LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID);
 
+/// A vector can either be deinterleaved through an intrinsic or a combination
+/// of shufflevector instructions. This is a thin abstraction layer to provide
+/// some common information like the deinterleaving factor.
+struct VectorDeinterleaving {
+  IntrinsicInst *DI = nullptr;
+  ArrayRef<Value *> Values;
+
+  unsigned getFactor() const;
+
+  Type *getDeinterleavedType() const;
+
+  explicit VectorDeinterleaving(IntrinsicInst *DI) : DI(DI) {}
+  explicit VectorDeinterleaving(ArrayRef<Value *> Values) : Values(Values) {}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  bool isValid() const { return (DI != nullptr) ^ !Values.empty(); }
+#endif
+};
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
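
For orientation, a minimal usage sketch of the two construction modes this
struct supports (illustrative only, not part of the patch; `DI` and
`Shuffles` stand in for values InterleavedAccessPass already has on hand):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

void example(IntrinsicInst *DI, ArrayRef<Value *> Shuffles) {
  // Mode 1: wrap a llvm.vector.deinterleaveN intrinsic; the factor and the
  // narrow per-factor type come from the intrinsic's struct result type.
  VectorDeinterleaving FromIntrinsic(DI);
  unsigned F1 = FromIntrinsic.getFactor();         // the N in deinterleaveN
  Type *T1 = FromIntrinsic.getDeinterleavedType(); // one struct field's type

  // Mode 2: wrap a (possibly sparse) list of shufflevector results; a null
  // slot means that factor is unused. The factor is the list size and the
  // type comes from the first non-null entry.
  VectorDeinterleaving FromShuffles(Shuffles);
  unsigned F2 = FromShuffles.getFactor();
  Type *T2 = FromShuffles.getDeinterleavedType();
  (void)F1; (void)T1; (void)F2; (void)T2;
}
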
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a248eb7444b20..f3cf784afb540 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -96,6 +96,7 @@ class TargetRegisterClass;
 class TargetRegisterInfo;
 class TargetTransformInfo;
 class Value;
+struct VectorDeinterleaving;
 class VPIntrinsic;
 
 namespace Sched {
@@ -3230,7 +3231,7 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Mask is a mask value
   /// \p DeinterleaveRes is a list of deinterleaved results.
   virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
+                                      const VectorDeinterleaving &VD) const {
     return false;
   }
 
@@ -3251,9 +3252,8 @@ class LLVM_ABI TargetLoweringBase {
   ///
   /// \p LI is the accompanying load instruction.
   /// \p DeinterleaveValues contains the deinterleaved values.
-  virtual bool
-  lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
-                                   ArrayRef<Value *> DeinterleaveValues) const {
+  virtual bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+                                                IntrinsicInst *DI) const {
     return false;
   }
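
For reference, a hedged sketch of what an override of the reshaped hook looks
like; "MyTarget" is hypothetical and only illustrates the contract: return
false to leave the intrinsic alone, or emit a target load, RAUW the
deinterleave intrinsic, and return true.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool myTargetLowerDeinterleaveToLoad(LoadInst *LI, IntrinsicInst *DI) {
  VectorDeinterleaving VD(DI);
  // Bail out on factors this imaginary target has no segment load for.
  if (VD.getFactor() != 2 && VD.getFactor() != 4)
    return false;
  // ... emit the target intrinsic here; its result must have DI's struct
  // type so that a single RAUW retires every user, extractvalue or not ...
  Value *Result = PoisonValue::get(DI->getType()); // stand-in for a real load
  DI->replaceAllUsesWith(Result);
  (void)LI;
  return true;
}
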
 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 7f0ed0b60a785..cd2ab0edf6fd3 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -306,6 +306,25 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) {
   }
 }
 
+unsigned VectorDeinterleaving::getFactor() const {
+  assert(isValid());
+  if (DI)
+    return getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+  else
+    return Values.size();
+}
+
+Type *VectorDeinterleaving::getDeinterleavedType() const {
+  assert(getFactor() > 0);
+  if (DI) {
+    return *DI->getType()->subtype_begin();
+  } else {
+    Value *FirstActive =
+        *llvm::find_if(Values, [](Value *V) { return V != nullptr; });
+    return FirstActive->getType();
+  }
+}
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 7259834975cf4..1363e8b6b28b6 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -381,7 +381,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
     SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
     for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
       ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
+    VectorDeinterleaving VD(ShuffleValues);
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, VD))
       // If Extracts is not empty, tryReplaceExtracts made changes earlier.
       return !Extracts.empty() || BinOpShuffleChanged;
   } else {
@@ -615,32 +616,17 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
     return false;
 
-  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+  VectorDeinterleaving VD(DI);
+  const unsigned Factor = VD.getFactor();
   assert(Factor && "unexpected deinterleave intrinsic");
 
-  SmallVector<Value *, 8> DeinterleaveValues(Factor, nullptr);
-  Value *LastFactor = nullptr;
-  for (auto *User : DI->users()) {
-    auto *Extract = dyn_cast<ExtractValueInst>(User);
-    if (!Extract || Extract->getNumIndices() != 1)
-      return false;
-    unsigned Idx = Extract->getIndices()[0];
-    if (DeinterleaveValues[Idx])
-      return false;
-    DeinterleaveValues[Idx] = Extract;
-    LastFactor = Extract;
-  }
-
-  if (!LastFactor)
-    return false;
-
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
     if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
       return false;
     // Check mask operand. Handle both all-true/false and interleaved mask.
     Value *WideMask = VPLoad->getOperand(1);
     Value *Mask =
-        getMask(WideMask, Factor, cast<VectorType>(LastFactor->getType()));
+        getMask(WideMask, Factor, cast<VectorType>(VD.getDeinterleavedType()));
     if (!Mask)
       return false;
 
@@ -649,7 +635,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
     // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
+    VectorDeinterleaving VD(DI);
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, VD))
       return false;
 
   } else {
@@ -661,13 +648,10 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
                       << " and factor = " << Factor << "\n");
 
     // Try and match this with target specific intrinsics.
-    if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
+    if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DI))
       return false;
   }
 
-  for (Value *V : DeinterleaveValues)
-    if (V)
-      DeadInsts.insert(cast<Instruction>(V));
   DeadInsts.insert(DI);
   // We now have a target-specific load, so delete the old one.
   DeadInsts.insert(cast<Instruction>(LoadedVal));
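
For comparison, the shape of the user-scan this hunk deletes (reconstructed
from the removed lines above): matching used to fail unless every user of the
intrinsic was a distinct single-index extractvalue, so a phi or select user
blocked the transform entirely.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool usersAreUniqueExtracts(IntrinsicInst *DI, unsigned Factor) {
  SmallVector<Value *, 8> Slots(Factor, nullptr);
  bool SawAny = false;
  for (User *U : DI->users()) {
    auto *EV = dyn_cast<ExtractValueInst>(U);
    if (!EV || EV->getNumIndices() != 1)
      return false; // non-extractvalue user: the old code gave up here
    unsigned Idx = EV->getIndices()[0];
    if (Slots[Idx])
      return false; // duplicate index
    Slots[Idx] = EV;
    SawAny = true;
  }
  return SawAny;
}

With this patch no such scan is needed: the target hook receives DI itself and
a single replaceAllUsesWith handles extractvalue, phi, and any other user
uniformly, which is what the new *_no_extract tests below exercise.
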
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bde4ba993f69e..ff156f4f04c5d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17476,16 +17476,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
-  unsigned Factor = DeinterleavedValues.size();
+    LoadInst *LI, IntrinsicInst *DI) const {
+  ArrayRef<Type *> DISubtypes = DI->getType()->subtypes();
+  const unsigned Factor = DISubtypes.size();
   if (Factor != 2 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
   }
 
-  Value *FirstActive = *llvm::find_if(DeinterleavedValues,
-                                      [](Value *V) { return V != nullptr; });
-  VectorType *VTy = cast<VectorType>(FirstActive->getType());
+  VectorType *VTy = cast<VectorType>(DISubtypes[0]);
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
   bool UseScalable;
@@ -17513,7 +17512,10 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
         Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
 
   Value *BaseAddr = LI->getPointerOperand();
+  Value *Result = nullptr;
   if (NumLoads > 1) {
+    Result = PoisonValue::get(DI->getType());
+
     // Create multiple legal small ldN.
     SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
     for (unsigned I = 0; I < NumLoads; ++I) {
@@ -17533,25 +17535,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       }
       LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
     }
-    // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
-    for (unsigned J = 0; J < Factor; ++J) {
-      if (DeinterleavedValues[J])
-        DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
-    }
+
+    for (unsigned J = 0; J < Factor; ++J)
+      Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
   } else {
-    Value *Result;
     if (UseScalable)
       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
     else
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
-    // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
-    for (unsigned I = 0; I < Factor; I++) {
-      if (DeinterleavedValues[I]) {
-        Value *NewExtract = Builder.CreateExtractValue(Result, I);
-        DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
-      }
-    }
   }
+
+  DI->replaceAllUsesWith(Result);
   return true;
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 65fe08e92c235..ca40884e3a5c0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -218,8 +218,8 @@ class AArch64TargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
-  bool lowerDeinterleaveIntrinsicToLoad(
-      LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+  bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+                                        IntrinsicInst *DI) const override;
 
   bool lowerInterleaveIntrinsicToStore(
       StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 00e969056df7d..743745ba5d65f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -437,14 +437,14 @@ class RISCVTargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
-  bool lowerDeinterleaveIntrinsicToLoad(
-      LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override;
+  bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
+                                        IntrinsicInst *DI) const override;
 
   bool lowerInterleaveIntrinsicToStore(
       StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
 
   bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                              ArrayRef<Value *> DeinterleaveRes) const override;
+                              const VectorDeinterleaving &VD) const override;
 
   bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                                ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index a6ff22c4b391f..044b1eae84f26 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -14,6 +14,7 @@
 #include "RISCVISelLowering.h"
 #include "RISCVSubtarget.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -233,17 +234,17 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
-  const unsigned Factor = DeinterleaveValues.size();
+    LoadInst *LI, IntrinsicInst *DI) const {
+  VectorDeinterleaving VD(DI);
+  const unsigned Factor = VD.getFactor();
+  assert(Factor && "unexpected deinterleaving factor");
   if (Factor > 8)
     return false;
 
   assert(LI->isSimple());
   IRBuilder<> Builder(LI);
 
-  Value *FirstActive =
-      *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
-  VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+  VectorType *ResVTy = cast<VectorType>(VD.getDeinterleavedType());
 
   const DataLayout &DL = LI->getDataLayout();
 
@@ -293,16 +294,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
     }
   }
 
-  for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
-    if (!DIV)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIV->replaceAllUsesWith(NewEV);
-  }
-
+  DI->replaceAllUsesWith(Return);
   return true;
 }
 
@@ -419,16 +411,14 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 /// dealing with factor of 2 (extractvalue is still required for most of other
 /// factors though).
 bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask,
-    ArrayRef<Value *> DeinterleaveResults) const {
-  const unsigned Factor = DeinterleaveResults.size();
+    VPIntrinsic *Load, Value *Mask, const VectorDeinterleaving &VD) const {
   assert(Mask && "Expect a valid mask");
   assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
          "Unexpected intrinsic");
 
-  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
-                                      [](Value *V) { return V != nullptr; });
-  VectorType *VTy = cast<VectorType>(FirstActive->getType());
+  const unsigned Factor = VD.getFactor();
+  assert(Factor && "unexpected vector deinterleaving");
+  VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
 
   auto &DL = Load->getModule()->getDataLayout();
   Align Alignment = Load->getParamAlign(0).value_or(
@@ -494,14 +484,18 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad(
     }
   }
 
-  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
-    if (!DIO)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIO->replaceAllUsesWith(NewEV);
+  if (VD.DI) {
+    VD.DI->replaceAllUsesWith(Return);
+  } else {
+    for (auto [Idx, DIO] : enumerate(VD.Values)) {
+      if (!DIO)
+        continue;
+      // We have to create a brand new ExtractValue to replace each
+      // of these old ExtractValue instructions.
+      Value *NewEV =
+          Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+      DIO->replaceAllUsesWith(NewEV);
+    }
   }
 
   return true;
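
A condensed sketch (hypothetical helper, not in the patch) of the two
replacement strategies the epilogue above now supports, assuming `Return` is
the struct-typed result of the emitted segment load:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static void replaceDeinterleavedUses(IRBuilder<> &Builder,
                                     const VectorDeinterleaving &VD,
                                     Value *Return) {
  if (VD.DI) {
    // Intrinsic mode: Return already has the intrinsic's struct type.
    VD.DI->replaceAllUsesWith(Return);
    return;
  }
  // Shuffle mode: each surviving per-factor value still needs its own
  // freshly created extractvalue.
  for (auto [Idx, V] : enumerate(VD.Values)) {
    if (!V)
      continue; // factor unused by the original shuffles
    Value *NewEV = Builder.CreateExtractValue(Return, {unsigned(Idx)});
    V->replaceAllUsesWith(NewEV);
  }
}
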
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 3e822d357b667..861423026b2eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -274,6 +274,59 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
 }
 
+define { <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_partial(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_factor3_partial:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vlseg3e8.v v7, (a0)
+; CHECK-NEXT:    vmv1r.v v8, v7
+; CHECK-NEXT:    ret
+  %vec = load <24 x i8>, ptr %p
+  %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
+  %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+  %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+  %res0 = insertvalue { <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+  %res1 = insertvalue { <8 x i8>, <8 x i8> } %res0, <8 x i8> %t2, 1
+  ret { <8 x i8>, <8 x i8> } %res1
+}
+
+; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue.
+define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_no_extract(ptr %p, ptr %p1, i1 %c) {
+; CHECK-LABEL: vector_deinterleave_load_factor3_no_extract:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi a2, a2, 1
+; CHECK-NEXT:    beqz a2, .LBB17_2
+; CHECK-NEXT:  # %bb.1: # %bb0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vlseg3e8.v v6, (a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB17_2: # %bb1
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vlseg3e8.v v6, (a1)
+; CHECK-NEXT:    ret
+  br i1 %c, label %bb0, label %bb1
+
+bb0:
+  %vec0 = load <24 x i8>, ptr %p
+  %d0.0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec0)
+  br label %merge
+
+bb1:
+  %vec1 = load <24 x i8>, ptr %p1
+  %d0.1 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec1)
+  br label %merge
+
+merge:
+  %d0 = phi {<8 x i8>, <8 x i8>, <8 x i8>} [%d0.0, %bb0], [%d0.1, %bb1]
+  %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
+  %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1
+  %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
+  %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
+  %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 0
+  %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 0
+  ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
+}
+
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor4:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 7fb822d20f892..0fa8e05acc8ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -66,6 +66,115 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor
   ret { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
 }
 
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_partial(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor3_partial:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a2, a1, 1
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    lui a2, 699051
+; RV32-NEXT:    addi a2, a2, -1365
+; RV32-NEXT:    mulhu a1, a1, a2
+; RV32-NEXT:    srli a1, a1, 1
+; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT:    vlseg3e32.v v7, (a0)
+; RV32-NEXT:    vmv1r.v v8, v7
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_factor3_partial:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a2, a1, 1
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    lui a2, 699051
+; RV64-NEXT:    addi a2, a2, -1365
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    slli a2, a2, 32
+; RV64-NEXT:    mulhu a1, a1, a2
+; RV64-NEXT:    srli a1, a1, 33
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    vlseg3e32.v v7, (a0)
+; RV64-NEXT:    vmv1r.v v8, v7
+; RV64-NEXT:    ret
+  %rvl = mul i32 %evl, 3
+  %wide.masked.load = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 %rvl)
+  %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vscale x 6 x i32> %wide.masked.load)
+  %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+  %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 2
+  %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+  %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t2, 1
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
+; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue.
+define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_no_extract(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor3_no_extract:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a2, 12
+; RV32-NEXT:    beq a1, a2, .LBB3_2
+; RV32-NEXT:  # %bb.1: # %bb0
+; RV32-NEXT:    slli a2, a1, 1
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    lui a2, 699051
+; RV32-NEXT:    addi a2, a2, -1365
+; RV32-NEXT:    mulhu a1, a1, a2
+; RV32-NEXT:    srli a1, a1, 1
+; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT:    vlseg3e32.v v7, (a0)
+; RV32-NEXT:    j .LBB3_3
+; RV32-NEXT:  .LBB3_2: # %bb1
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vlseg3e32.v v7, (a0)
+; RV32-NEXT:  .LBB3_3: # %merge
+; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT:    vmv1r.v v8, v7
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_factor3_no_extract:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a2, a1
+; RV64-NEXT:    li a3, 12
+; RV64-NEXT:    beq a2, a3, .LBB3_2
+; RV64-NEXT:  # %bb.1: # %bb0
+; RV64-NEXT:    slli a2, a1, 1
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    lui a2, 699051
+; RV64-NEXT:    addi a2, a2, -1365
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    slli a2, a2, 32
+; RV64-NEXT:    mulhu a1, a1, a2
+; RV64-NEXT:    srli a1, a1, 33
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    vlseg3e32.v v7, (a0)
+; RV64-NEXT:    j .LBB3_3
+; RV64-NEXT:  .LBB3_2: # %bb1
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vlseg3e32.v v7, (a0)
+; RV64-NEXT:  .LBB3_3: # %merge
+; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT:    vmv1r.v v8, v7
+; RV64-NEXT:    ret
+  %p = icmp ne i32 %evl, 12
+  br i1 %p, label %bb0, label %bb1
+
+bb0:
+  %rvl.0 = mul i32 %evl, 3
+  %wide.load.0 = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 %rvl.0)
+  %deinterleaved.results.0 = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vscale x 6 x i32> %wide.load.0)
+  br label %merge
+
+bb1:
+  %wide.load.1 = call <vscale x 6 x i32> @llvm.vp.load(ptr %ptr, <vscale x 6 x i1> splat (i1 true), i32 12)
+  %deinterleaved.results.1 = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3(<vscale x 6 x i32> %wide.load.1)
+  br label %merge
+
+merge:
+  %deinterleaved.results = phi { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } [%deinterleaved.results.0, %bb0], [%deinterleaved.results.1, %bb1]
+  %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 0
+  %t2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 2
+  %res0 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> %t0, 0
+  %res1 = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %res0, <vscale x 2 x i32> %t2, 1
+  ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
+}
+
 define {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor4_v2(ptr %ptr, i32 %evl) {
 ; RV32-LABEL: load_factor4_v2:
 ; RV32:       # %bb.0:
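
The lui/addi/mulhu/srli runs in the checks above are strength-reduced
division: the transform has to recover the per-segment EVL from
%rvl = 3 * %evl, and the backend divides by 3 through a multiplicative
inverse (0xAAAAAAAB). A standalone sanity check of that arithmetic (RV32
flavor; illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t Evl : {0u, 1u, 4u, 12u, 1000u}) {
    uint32_t Wide = Evl * 3;                        // %rvl = mul i32 %evl, 3
    uint32_t Magic = (699051u << 12) - 1365u;       // lui + addi = 0xAAAAAAAB
    uint32_t Div3 = (uint64_t(Wide) * Magic) >> 33; // mulhu, then srli by 1
    assert(Div3 == Wide / 3);
  }
}
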
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 09e2c53465cd7..6c81d9a4d2ed6 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -10,8 +10,8 @@ define void @deinterleave_i8_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_i8_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2
@@ -33,8 +33,8 @@ define void @deinterleave_i16_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_i16_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_i16_factor2
@@ -56,8 +56,8 @@ define void @deinterleave_8xi32_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_8xi32_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_8xi32_factor2
@@ -79,8 +79,8 @@ define void @deinterleave_i64_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_i64_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_i64_factor2
@@ -102,8 +102,8 @@ define void @deinterleave_float_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_float_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_float_factor2
@@ -125,8 +125,8 @@ define void @deinterleave_double_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_double_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_double_factor2
@@ -148,8 +148,8 @@ define void @deinterleave_ptr_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_ptr_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_ptr_factor2
@@ -301,6 +301,10 @@ define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
 ; NEON-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
 ; NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
 ; NEON-NEXT:    [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
+; NEON-NEXT:    [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
+; NEON-NEXT:    [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_wide_i16_factor2
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 436389ba5b991..d7649801ea2fc 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -8,8 +8,8 @@ define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxi8_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i8>, ptr %ptr, align 1
@@ -23,8 +23,8 @@ define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxi16_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 16 x i16>, ptr %ptr, align 2
@@ -38,8 +38,8 @@ define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nx8xi32_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x i32>, ptr %ptr, align 4
@@ -53,8 +53,8 @@ define void @deinterleave_nxi64_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxi64_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x i64>, ptr %ptr, align 8
@@ -68,8 +68,8 @@ define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxfloat_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x float>, ptr %ptr, align 4
@@ -83,8 +83,8 @@ define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxdouble_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x double>, ptr %ptr, align 8
@@ -98,8 +98,8 @@ define void @deinterleave_nxptr_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxptr_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x ptr>, ptr %ptr, align 8
@@ -215,6 +215,10 @@ define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
+; CHECK-NEXT:    [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i32>, ptr %ptr, align 4
@@ -239,6 +243,10 @@ define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x double>, ptr %ptr, align 8
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
index c565066541d1d..58c0bccc3be38 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -49,8 +49,16 @@ define void @wide_deinterleave4(ptr %src) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
-; CHECK-NEXT:    [[SUM:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 8 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } poison, <vscale x 8 x i32> [[TMP12]], 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP19]], <vscale x 8 x i32> [[TMP14]], 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP20]], <vscale x 8 x i32> [[TMP16]], 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP21]], <vscale x 8 x i32> [[TMP18]], 3
+; CHECK-NEXT:    [[TMP23:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 1
+; CHECK-NEXT:    [[TMP25:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i32> } [[TMP22]], 3
+; CHECK-NEXT:    [[SUM:%.*]] = add <vscale x 8 x i32> [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 8 x i32> [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i32>, ptr %src, align 4
@@ -73,8 +81,8 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
 ; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[SRC]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT:    [[LD2_1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT:    [[LD2_2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
 ; CHECK-NEXT:    ret void
 ;
 
@@ -95,12 +103,11 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) {
 define void @negative_deinterleave4_test(ptr %src) {
 ; CHECK-LABEL: define void @negative_deinterleave4_test
 ; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 16 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[LOAD]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE]], 2
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[SRC]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 16 x i32>, ptr %src, align 4
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index b109448bd5d7c..1418ca09c0d61 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -606,6 +606,10 @@ define void @deinterleave_nxptr_factor2(ptr %ptr) #2 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]], 1
 ; CHECK-NEXT:    ret void
 ;
   %wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8

From 5106d8b9552e22e8af4415d98f2b3dadc914cfb5 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Mon, 14 Jul 2025 13:28:41 -0700
Subject: [PATCH 2/5] fixup! [IA] Do not require ExtractValueInst as users for
 deinterleave intrinsics

---
 llvm/include/llvm/CodeGen/TargetLowering.h      | 5 +++--
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index f3cf784afb540..b6e0fa13a99f6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3229,7 +3229,8 @@ class LLVM_ABI TargetLoweringBase {
   ///
   /// \p Load is a vp.load instruction.
   /// \p Mask is a mask value
-  /// \p DeinterleaveRes is a list of deinterleaved results.
+  /// \p VD represents either a deinterleave intrinsic or a list of
+  /// deinterleaved values.
   virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
                                       const VectorDeinterleaving &VD) const {
     return false;
@@ -3251,7 +3252,7 @@ class LLVM_ABI TargetLoweringBase {
   /// llvm.vector.deinterleave{2,3,5,7}
   ///
   /// \p LI is the accompanying load instruction.
-  /// \p DeinterleaveValues contains the deinterleaved values.
+  /// \p DI represents the deinterleave intrinsic.
   virtual bool lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
                                                 IntrinsicInst *DI) const {
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff156f4f04c5d..11ae8605c5a71 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17477,14 +17477,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     LoadInst *LI, IntrinsicInst *DI) const {
-  ArrayRef<Type *> DISubtypes = DI->getType()->subtypes();
-  const unsigned Factor = DISubtypes.size();
+  VectorDeinterleaving VD(DI);
+  const unsigned Factor = VD.getFactor();
   if (Factor != 2 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
   }
 
-  VectorType *VTy = cast<VectorType>(DISubtypes[0]);
+  VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
   bool UseScalable;
@@ -17536,6 +17536,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
     }
 
+    // Merge the values from different factors.
     for (unsigned J = 0; J < Factor; ++J)
       Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
   } else {
@@ -17545,6 +17546,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
   }
 
+  // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
   DI->replaceAllUsesWith(Result);
   return true;
 }
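
A reduced sketch (hypothetical standalone function) of the merge step these
comments describe: after splitting one illegal-width deinterleave into
several legal ldN calls, the accumulated per-factor wide vectors are packed
back into a single struct value so that one RAUW can retire the intrinsic.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static Value *mergeFactors(IRBuilder<> &Builder, IntrinsicInst *DI,
                           ArrayRef<Value *> ExtractedLdValues) {
  Value *Result = PoisonValue::get(DI->getType());
  for (unsigned J = 0, F = ExtractedLdValues.size(); J < F; ++J)
    Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
  return Result; // the caller then does DI->replaceAllUsesWith(Result)
}
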

From fd0de02dae233ac3e68c099e5748124d6272116c Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Mon, 14 Jul 2025 14:40:13 -0700
Subject: [PATCH 3/5] fixup! Address review comments

---
 llvm/lib/Analysis/VectorUtils.cpp                 | 15 +++++++--------
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp   |  3 +--
 .../RISCV/rvv/fixed-vectors-deinterleave-load.ll  |  2 +-
 .../RISCV/rvv/vp-vector-interleaved-access.ll     |  2 +-
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index cd2ab0edf6fd3..01b3aecf57ac3 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -310,19 +310,18 @@ unsigned VectorDeinterleaving::getFactor() const {
   assert(isValid());
   if (DI)
     return getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-  else
-    return Values.size();
+
+  return Values.size();
 }
 
 Type *VectorDeinterleaving::getDeinterleavedType() const {
   assert(getFactor() > 0);
-  if (DI) {
+  if (DI)
     return *DI->getType()->subtype_begin();
-  } else {
-    Value *FirstActive =
-        *llvm::find_if(Values, [](Value *V) { return V != nullptr; });
-    return FirstActive->getType();
-  }
+
+  Value *FirstActive =
+      *llvm::find_if(Values, [](Value *V) { return V != nullptr; });
+  return FirstActive->getType();
 }
 
 /// Given a vector and an element number, see if the scalar value is
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 11ae8605c5a71..6278a73e7ddfb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17514,8 +17514,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
   Value *BaseAddr = LI->getPointerOperand();
   Value *Result = nullptr;
   if (NumLoads > 1) {
-    Result = PoisonValue::get(DI->getType());
-
     // Create multiple legal small ldN.
     SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
     for (unsigned I = 0; I < NumLoads; ++I) {
@@ -17537,6 +17535,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     }
 
     // Merge the values from different factors.
+    Result = PoisonValue::get(DI->getType());
     for (unsigned J = 0; J < Factor; ++J)
       Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
   } else {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 861423026b2eb..807651c9b40c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -290,7 +290,7 @@ define { <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_partial(ptr %p)
   ret { <8 x i8>, <8 x i8> } %res1
 }
 
-; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue.
+; InterleavedAccess should kick in even if the users of the deinterleave intrinsic are not extractvalues.
 define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_no_extract(ptr %p, ptr %p1, i1 %c) {
 ; CHECK-LABEL: vector_deinterleave_load_factor3_no_extract:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 0fa8e05acc8ff..27ecbe56bda42 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -104,7 +104,7 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_partial(ptr %ptr,
   ret { <vscale x 2 x i32>, <vscale x 2 x i32> } %res1
 }
 
-; InterleavedAccess should kick in even if the users of deinterleave intrinsic is not extractvalue.
+; InterleavedAccess should kick in even if the users of the deinterleave intrinsic are not extractvalues.
 define {<vscale x 2 x i32>, <vscale x 2 x i32>} @load_factor3_no_extract(ptr %ptr, i32 %evl) {
 ; RV32-LABEL: load_factor3_no_extract:
 ; RV32:       # %bb.0:

From 445a187684db80d7cfe7a19cb1ca0f2d4c0c5cad Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Wed, 16 Jul 2025 11:06:52 -0700
Subject: [PATCH 4/5] fixup! Rebase to latest TLI hooks

---
 llvm/include/llvm/Analysis/VectorUtils.h      | 22 ++----------
 llvm/include/llvm/CodeGen/TargetLowering.h    |  6 ++--
 llvm/lib/Analysis/VectorUtils.cpp             | 23 ++++--------
 llvm/lib/CodeGen/InterleavedAccessPass.cpp    | 11 +++---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  5 ++-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |  2 +-
 .../Target/RISCV/RISCVInterleavedAccess.cpp   | 35 +++++++++----------
 7 files changed, 36 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 8d7407949b3f0..00cbfde107db7 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -23,8 +23,8 @@
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-class IntrinsicInst;
 class TargetLibraryInfo;
+class IntrinsicInst;
 
 /// The Vector Function Database.
 ///
@@ -189,24 +189,8 @@ LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID);
 /// Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
 LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID);
 
-/// A vector can either be deinterleaved through an intrinsic or a combination
-/// of shufflevector instructions. This is a thin abstraction layer to provide
-/// some common information like the deinterleaving factor.
-struct VectorDeinterleaving {
-  IntrinsicInst *DI = nullptr;
-  ArrayRef<Value *> Values;
-
-  unsigned getFactor() const;
-
-  Type *getDeinterleavedType() const;
-
-  explicit VectorDeinterleaving(IntrinsicInst *DI) : DI(DI) {}
-  explicit VectorDeinterleaving(ArrayRef<Value *> Values) : Values(Values) {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  bool isValid() const { return (DI != nullptr) ^ !Values.empty(); }
-#endif
-};
+/// Given a deinterleaveN intrinsic, return the (narrow) type of each factor.
+LLVM_ABI Type *getDeinterleavedVectorType(IntrinsicInst *DI);
 
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 25106a7fb1f25..06aa66670c7e7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -96,7 +96,6 @@ class TargetRegisterClass;
 class TargetRegisterInfo;
 class TargetTransformInfo;
 class Value;
-struct VectorDeinterleaving;
 class VPIntrinsic;
 
 namespace Sched {
@@ -3229,10 +3228,9 @@ class LLVM_ABI TargetLoweringBase {
   ///
   /// \p Load is a vp.load instruction.
   /// \p Mask is a mask value
-  /// \p VD represents either a deinterleave intrinsic or a list of
-  /// deinterleaved values.
+  /// \p DeinterleaveRes is a list of deinterleaved results.
   virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      const VectorDeinterleaving &VD) const {
+                                      ArrayRef<Value *> DeinterleaveRes) const {
     return false;
   }
 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 01b3aecf57ac3..a505743caf655 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -306,22 +306,13 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) {
   }
 }
 
-unsigned VectorDeinterleaving::getFactor() const {
-  assert(isValid());
-  if (DI)
-    return getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-
-  return Values.size();
-}
-
-Type *VectorDeinterleaving::getDeinterleavedType() const {
-  assert(getFactor() > 0);
-  if (DI)
-    return *DI->getType()->subtype_begin();
-
-  Value *FirstActive =
-      *llvm::find_if(Values, [](Value *V) { return V != nullptr; });
-  return FirstActive->getType();
+Type *llvm::getDeinterleavedVectorType(IntrinsicInst *DI) {
+  [[maybe_unused]] unsigned Factor =
+      getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
+  ArrayRef<Type *> DISubtypes = DI->getType()->subtypes();
+  assert(Factor && Factor == DISubtypes.size() &&
+         "unexpected deinterleave factor or result type");
+  return DISubtypes[0];
 }
 
 /// Given a vector and an element number, see if the scalar value is
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index b77874b9a1fbb..4db2f87b4777d 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -381,8 +381,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
     SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
     for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
       ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
-    VectorDeinterleaving VD(ShuffleValues);
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, VD))
+    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
       // If Extracts is not empty, tryReplaceExtracts made changes earlier.
       return !Extracts.empty() || BinOpShuffleChanged;
   } else {
@@ -616,17 +615,17 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
     return false;
 
-  VectorDeinterleaving VD(DI);
-  const unsigned Factor = VD.getFactor();
+  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
   assert(Factor && "unexpected deinterleave intrinsic");
 
+  Value *Mask = nullptr;
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
     if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
       return false;
     // Check mask operand. Handle both all-true/false and interleaved mask.
     Value *WideMask = VPLoad->getOperand(1);
-    Value *Mask =
-        getMask(WideMask, Factor, cast<VectorType>(VD.getDeinterleavedType()));
+    Mask = getMask(WideMask, Factor,
+                   cast<VectorType>(getDeinterleavedVectorType(DI)));
     if (!Mask)
       return false;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5a6d45c5900d8..fa5d0a509d66a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17487,8 +17487,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
-  VectorDeinterleaving VD(DI);
-  const unsigned Factor = VD.getFactor();
+  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
   if (Factor != 2 && Factor != 4) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
@@ -17498,7 +17497,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     return false;
   assert(!Mask && "Unexpected mask on a load\n");
 
-  VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
+  VectorType *VTy = cast<VectorType>(getDeinterleavedVectorType(DI));
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
   bool UseScalable;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 5fc7b57a7ea3c..e8adf561c9c35 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -444,7 +444,7 @@ class RISCVTargetLowering : public TargetLowering {
       StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override;
 
   bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                              const VectorDeinterleaving &VD) const override;
+                              ArrayRef<Value *> DeinterleaveRes) const override;
 
   bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                                ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index a4b68bf201bf3..7b79d203e962e 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -258,15 +258,14 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 
 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
-  VectorDeinterleaving VD(DI);
-  const unsigned Factor = VD.getFactor();
+  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
   assert(Factor && "unexpected deinterleaving factor");
   if (Factor > 8)
     return false;
 
   IRBuilder<> Builder(Load);
 
-  VectorType *ResVTy = cast<VectorType>(VD.getDeinterleavedType());
+  VectorType *ResVTy = cast<VectorType>(getDeinterleavedVectorType(DI));
 
   const DataLayout &DL = Load->getDataLayout();
   auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
@@ -438,14 +437,16 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
 /// dealing with factor of 2 (extractvalue is still required for most of other
 /// factors though).
 bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask, const VectorDeinterleaving &VD) const {
+    VPIntrinsic *Load, Value *Mask,
+    ArrayRef<Value *> DeinterleaveResults) const {
+  const unsigned Factor = DeinterleaveResults.size();
   assert(Mask && "Expect a valid mask");
   assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
          "Unexpected intrinsic");
 
-  const unsigned Factor = VD.getFactor();
-  assert(Factor && "unexpected vector deinterleaving");
-  VectorType *VTy = cast<VectorType>(VD.getDeinterleavedType());
+  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
+                                      [](Value *V) { return V != nullptr; });
+  VectorType *VTy = cast<VectorType>(FirstActive->getType());
 
   auto &DL = Load->getModule()->getDataLayout();
   Align Alignment = Load->getParamAlign(0).value_or(
@@ -509,18 +510,14 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad(
     }
   }
 
-  if (VD.DI) {
-    VD.DI->replaceAllUsesWith(Return);
-  } else {
-    for (auto [Idx, DIO] : enumerate(VD.Values)) {
-      if (!DIO)
-        continue;
-      // We have to create a brand new ExtractValue to replace each
-      // of these old ExtractValue instructions.
-      Value *NewEV =
-          Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-      DIO->replaceAllUsesWith(NewEV);
-    }
+  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
+    if (!DIO)
+      continue;
+    // We have to create a brand new ExtractValue to replace each
+    // of these old ExtractValue instructions.
+    Value *NewEV =
+        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+    DIO->replaceAllUsesWith(NewEV);
   }
 
   return true;
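
A note on the mask plumbing in the hunks above: lowerDeinterleaveIntrinsic now derives the per-factor mask by handing the narrow type from getDeinterleavedVectorType to getMask, which (per the comment in the pass) handles both all-true/false and interleaved masks. A hedged sketch of the interleaved-mask shape this is meant to match, with illustrative names not taken from the patch:

define {<vscale x 2 x i32>, <vscale x 2 x i32>} @sketch_masked(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl) {
  ; The wide mask interleaves %m with itself, so both factors share %m.
  %wide.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
  %wide = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %wide.mask, i32 %evl)
  %dei = call {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide)
  ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %dei
}

Here getMask(%wide.mask, /*Factor=*/2, <vscale x 2 x i1>) should be able to recover %m as the common per-factor mask.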

>From aec05c4084e8a25a2d9077784cc181637af1cff1 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Wed, 16 Jul 2025 11:16:19 -0700
Subject: [PATCH 5/5] fixup! fixup! Rebase to latest TLI hooks

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 06aa66670c7e7..238d07a20eec8 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3251,7 +3251,7 @@ class LLVM_ABI TargetLoweringBase {
   ///
   /// \p Load is the accompanying load instruction.  Can be either a plain load
   /// instruction or a vp.load intrinsic.
-  /// \p DeinterleaveValues contains the deinterleaved values.
+  /// \p DI represents the deinterleaveN intrinsic.
   virtual bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                                 IntrinsicInst *DI) const {
     return false;
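
For completeness, the contract behind the reworded \p DI comment: a deinterleaveN intrinsic returns an aggregate with exactly Factor fields of one common narrow type, which is what getDeinterleavedVectorType (and its Factor == DISubtypes.size() assertion) relies on. A sketch for factor 4, assuming llvm.vector.deinterleave4 follows the same mangling convention as deinterleave2:

define {<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i8>} @sketch_factor4(<vscale x 16 x i8> %wide) {
  ; Factor = 4; every field, and hence the type getDeinterleavedVectorType
  ; returns, is the narrow <vscale x 4 x i8>.
  %dei = call {<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i8>} @llvm.vector.deinterleave4.nxv16i8(<vscale x 16 x i8> %wide)
  ret {<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i8>} %dei
}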


