[llvm] [LV] Add initial support for vectorizing literal struct return values (multiple defs) (PR #115928)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 12 11:20:12 PST 2024
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/115928
This patch adds initial support for vectorizing literal struct return values. Currently, this is limited to the case where the struct is homogeneous (all elements have the same type) and not packed. Additionally, all users of the struct must be `extractvalue` operations.
The intended use case for this is vectorizing intrinsics such as:
```
declare { float, float } @llvm.sincos.f32(float %x)
```
Mapping them to structure-returning library calls such as:
```
declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>)
```
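For illustration, here is a simplified sketch of the pattern this enables (hypothetical snippet; the surrounding loop is omitted and VF = 4 is assumed). Note that all users of the scalar struct result are `extractvalue` instructions, as required:
```
; Scalar loop body:
%call = call { float, float } @llvm.sincos.f32(float %x)
%sin = extractvalue { float, float } %call, 0
%cos = extractvalue { float, float } %call, 1

; Widened form, after mapping the call to the vector library function:
%wide.call = call { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float> %wide.x)
%wide.sin = extractvalue { <4 x float>, <4 x float> } %wide.call, 0
%wide.cos = extractvalue { <4 x float>, <4 x float> } %wide.call, 1
```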
It could also be possible to vectorize the intrinsic (without a libcall) and then later lower the intrinsic to a library call. This may be desired if the only library calls available take output pointers rather than return multiple values. Note that this is not yet supported, as `VPWidenIntrinsicRecipe` needs to be updated to support struct types too (though this should be a relatively small change after this patch).
To represent struct return types in VPlan, the `VPWidenCallRecipe` and `VPReplicateRecipe` have been updated to support multiple definitions. This allows these recipes to create a new value per element of the struct value, which may help when costing the extractions.
Before this change, there were quite a few assumptions (especially for replication) that recipes that map to LLVM-IR instructions are single-def. Where possible, these have been fixed, but several transforms do not yet support multiple definitions.
From 9e663148b4409d0f78aba4ca20f40d2a042dc0de Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 15 Oct 2024 14:07:36 +0000
Subject: [PATCH 1/3] [LV] Make having flags (FMFs etc) not require inheriting
VPSingleDefRecipe
This splits out `VPRecipeWithIRFlags` into `VPRecipeIRFlags` and
`VPSingleDefRecipeWithIRFlags`. With this, the `VPRecipeIRFlags` class
contains the flags but does not inherit from any VPRecipe. The new
`VPSingleDefRecipeWithIRFlags` class functions the same as the previous
`VPRecipeWithIRFlags` (and all previous uses have been replaced with
it).
On its own this is NFC, but it will be needed to vectorize
calls/intrinsics that return multiple values (via literal structs)
that are modeled in VPlan as multiple recipe results. These calls can
still have FMFs, so they need `VPRecipeIRFlags` but can't inherit from
`VPSingleDefRecipe`.
---
.../Vectorize/LoopVectorizationPlanner.h | 12 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 216 ++++++++++--------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 ++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 31 +--
.../AArch64/sve2-histcnt-vplan.ll | 2 +-
5 files changed, 172 insertions(+), 136 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 7787f58683b2a4..7cbef9fd310c18 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -168,7 +168,7 @@ class VPBuilder {
VPInstruction *createOverflowingOp(unsigned Opcode,
std::initializer_list<VPValue *> Operands,
- VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
+ VPRecipeIRFlags::WrapFlagsTy WrapFlags,
DebugLoc DL = {}, const Twine &Name = "") {
return tryInsertInstruction(
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
@@ -187,9 +187,9 @@ class VPBuilder {
VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
const Twine &Name = "") {
- return tryInsertInstruction(new VPInstruction(
- Instruction::BinaryOps::Or, {LHS, RHS},
- VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
+ return tryInsertInstruction(
+ new VPInstruction(Instruction::BinaryOps::Or, {LHS, RHS},
+ VPRecipeIRFlags::DisjointFlagsTy(false), DL, Name));
}
VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
@@ -223,12 +223,12 @@ class VPBuilder {
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
return tryInsertInstruction(new VPInstruction(
- Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+ Ptr, Offset, VPRecipeIRFlags::GEPFlagsTy(false), DL, Name));
}
VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
return tryInsertInstruction(new VPInstruction(
- Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
+ Ptr, Offset, VPRecipeIRFlags::GEPFlagsTy(true), DL, Name));
}
VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index abfe97b4ab55b6..b77b9e65ab69cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -700,6 +700,8 @@ struct VPCostContext {
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
};
+class VPRecipeIRFlags;
+
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
@@ -803,6 +805,13 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// Returns the debug location of the recipe.
DebugLoc getDebugLoc() const { return DL; }
+ /// Returns the IR flags for the recipe.
+ virtual const VPRecipeIRFlags *getIRFlags() const { return nullptr; }
+ VPRecipeIRFlags *getIRFlags() {
+ return const_cast<VPRecipeIRFlags *>(
+ const_cast<const VPRecipeBase *>(this)->getIRFlags());
+ }
+
protected:
/// Compute the cost of this recipe either using a recipe's specialized
/// implementation or using the legacy cost model and the underlying
@@ -916,8 +925,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
#endif
};
-/// Class to record LLVM IR flag for a recipe along with it.
-class VPRecipeWithIRFlags : public VPSingleDefRecipe {
+/// Class to record LLVM IR flags for a recipe.
+class VPRecipeIRFlags {
enum class OperationType : unsigned char {
Cmp,
OverflowingBinOp,
@@ -930,6 +939,8 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
};
public:
+ void transferFlags(const VPRecipeIRFlags &Other) { *this = Other; }
+
struct WrapFlagsTy {
char HasNUW : 1;
char HasNSW : 1;
@@ -979,23 +990,10 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
unsigned AllFlags;
};
-protected:
- void transferFlags(VPRecipeWithIRFlags &Other) {
- OpType = Other.OpType;
- AllFlags = Other.AllFlags;
- }
-
public:
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {})
- : VPSingleDefRecipe(SC, Operands, DL) {
- OpType = OperationType::Other;
- AllFlags = 0;
- }
+ VPRecipeIRFlags() : OpType(OperationType::Other), AllFlags(0) {}
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I)
- : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) {
+ VPRecipeIRFlags(Instruction &I) {
if (auto *Op = dyn_cast<CmpInst>(&I)) {
OpType = OperationType::Cmp;
CmpPredicate = Op->getPredicate();
@@ -1023,54 +1021,22 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
}
}
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- CmpInst::Predicate Pred, DebugLoc DL = {})
- : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp),
- CmpPredicate(Pred) {}
+ VPRecipeIRFlags(CmpInst::Predicate Pred)
+ : OpType(OperationType::Cmp), CmpPredicate(Pred) {}
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- WrapFlagsTy WrapFlags, DebugLoc DL = {})
- : VPSingleDefRecipe(SC, Operands, DL),
- OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}
+ VPRecipeIRFlags(WrapFlagsTy WrapFlags)
+ : OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {}
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- FastMathFlags FMFs, DebugLoc DL = {})
- : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp),
- FMFs(FMFs) {}
+ VPRecipeIRFlags(FastMathFlags FMFs)
+ : OpType(OperationType::FPMathOp), FMFs(FMFs) {}
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- DisjointFlagsTy DisjointFlags, DebugLoc DL = {})
- : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp),
- DisjointFlags(DisjointFlags) {}
+ VPRecipeIRFlags(DisjointFlagsTy DisjointFlags)
+ : OpType(OperationType::DisjointOp), DisjointFlags(DisjointFlags) {}
-protected:
- template <typename IterT>
- VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
- GEPFlagsTy GEPFlags, DebugLoc DL = {})
- : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp),
- GEPFlags(GEPFlags) {}
+ VPRecipeIRFlags(GEPFlagsTy GEPFlags)
+ : OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {}
public:
- static inline bool classof(const VPRecipeBase *R) {
- return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
- R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
- R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
- }
-
- static inline bool classof(const VPUser *U) {
- auto *R = dyn_cast<VPRecipeBase>(U);
- return R && classof(R);
- }
-
/// Drop all poison-generating flags.
void dropPoisonGeneratingFlags() {
// NOTE: This needs to be kept in-sync with
@@ -1179,6 +1145,56 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
#endif
};
+/// Class to record LLVM IR flags for a single-def recipe.
+class VPSingleDefRecipeWithIRFlags : public VPSingleDefRecipe,
+ public VPRecipeIRFlags {
+public:
+ using VPRecipeBase::getIRFlags;
+
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags() {}
+
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ Instruction &I)
+ : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()),
+ VPRecipeIRFlags(I) {}
+
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ CmpInst::Predicate Pred, DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(Pred) {}
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ VPRecipeIRFlags::WrapFlagsTy WrapFlags,
+ DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(WrapFlags) {}
+
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ FastMathFlags FMFs, DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(FMFs) {}
+
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ VPRecipeIRFlags::DisjointFlagsTy DisjointFlags,
+ DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(DisjointFlags) {}
+
+  const VPRecipeIRFlags *getIRFlags() const override {
+ return static_cast<const VPRecipeIRFlags *>(this);
+ }
+
+protected:
+ template <typename IterT>
+ VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ VPRecipeIRFlags::GEPFlagsTy GEPFlags,
+ DebugLoc DL = {})
+ : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(GEPFlags) {}
+};
+
/// Helper to access the operand that contains the unroll part for this recipe
/// after unrolling.
template <unsigned PartOpIdx> class VPUnrollPartAccessor {
@@ -1195,7 +1211,7 @@ template <unsigned PartOpIdx> class VPUnrollPartAccessor {
/// While as any Recipe it may generate a sequence of IR instructions when
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
-class VPInstruction : public VPRecipeWithIRFlags,
+class VPInstruction : public VPSingleDefRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
friend class VPlanSlp;
@@ -1270,7 +1286,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
public:
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
const Twine &Name = "")
- : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
Opcode(Opcode), Name(Name.str()) {}
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
@@ -1281,22 +1297,27 @@ class VPInstruction : public VPRecipeWithIRFlags,
VPValue *B, DebugLoc DL = {}, const Twine &Name = "");
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
- WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "")
- : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL),
+ VPRecipeIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL = {},
+ const Twine &Name = "")
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands,
+ WrapFlags, DL),
Opcode(Opcode), Name(Name.str()) {}
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
- DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
+ VPRecipeIRFlags::DisjointFlagsTy DisjointFlag, DebugLoc DL = {},
const Twine &Name = "")
- : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands,
+ DisjointFlag, DL),
Opcode(Opcode), Name(Name.str()) {
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}
- VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags,
- DebugLoc DL = {}, const Twine &Name = "")
- : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
- ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
+ VPInstruction(VPValue *Ptr, VPValue *Offset,
+ VPRecipeIRFlags::GEPFlagsTy Flags, DebugLoc DL = {},
+ const Twine &Name = "")
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC,
+ ArrayRef<VPValue *>({Ptr, Offset}), Flags,
+ DL),
Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
@@ -1435,14 +1456,15 @@ class VPIRInstruction : public VPRecipeBase {
/// opcode and operands of the recipe. This recipe covers most of the
/// traditional vectorization cases where each recipe transforms into a
/// vectorized version of itself.
-class VPWidenRecipe : public VPRecipeWithIRFlags {
+class VPWidenRecipe : public VPSingleDefRecipeWithIRFlags {
unsigned Opcode;
protected:
template <typename IterT>
VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
iterator_range<IterT> Operands)
- : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {}
+ : VPSingleDefRecipeWithIRFlags(VPDefOpcode, Operands, I),
+ Opcode(I.getOpcode()) {}
public:
template <typename IterT>
@@ -1487,8 +1509,6 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
/// A recipe for widening operations with vector-predication intrinsics with
/// explicit vector length (EVL).
class VPWidenEVLRecipe : public VPWidenRecipe {
- using VPRecipeWithIRFlags::transferFlags;
-
public:
template <typename IterT>
VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL)
@@ -1533,7 +1553,7 @@ class VPWidenEVLRecipe : public VPWidenRecipe {
};
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
-class VPWidenCastRecipe : public VPRecipeWithIRFlags {
+class VPWidenCastRecipe : public VPSingleDefRecipeWithIRFlags {
/// Cast instruction opcode.
Instruction::CastOps Opcode;
@@ -1543,14 +1563,14 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
public:
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
CastInst &UI)
- : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
- ResultTy(ResultTy) {
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI),
+ Opcode(Opcode), ResultTy(ResultTy) {
assert(UI.getOpcode() == Opcode &&
"opcode of underlying cast doesn't match");
}
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
- : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
ResultTy(ResultTy) {}
~VPWidenCastRecipe() override = default;
@@ -1631,7 +1651,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
};
/// A recipe for widening vector intrinsics.
-class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
+class VPWidenIntrinsicRecipe : public VPSingleDefRecipeWithIRFlags {
/// ID of the vector intrinsic to widen.
Intrinsic::ID VectorIntrinsicID;
@@ -1651,7 +1671,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments,
+ CI),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
MayReadFromMemory(CI.mayReadFromMemory()),
MayWriteToMemory(CI.mayWriteToMemory()),
@@ -1660,7 +1681,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
ArrayRef<VPValue *> CallArguments, Type *Ty,
DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {
LLVMContext &Ctx = Ty->getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID);
@@ -1720,7 +1741,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
};
/// A recipe for widening Call instructions using library calls.
-class VPWidenCallRecipe : public VPRecipeWithIRFlags {
+class VPWidenCallRecipe : public VPSingleDefRecipeWithIRFlags {
/// Variant stores a pointer to the chosen function. There is a 1:1 mapping
/// between a given VF and the chosen vectorized variant, so there will be a
/// different VPlan for each VF with a valid variant.
@@ -1729,8 +1750,8 @@ class VPWidenCallRecipe : public VPRecipeWithIRFlags {
public:
VPWidenCallRecipe(Value *UV, Function *Variant,
ArrayRef<VPValue *> CallArguments, DebugLoc DL = {})
- : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
- *cast<Instruction>(UV)),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
+ *cast<Instruction>(UV)),
Variant(Variant) {
assert(
isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
@@ -1855,7 +1876,7 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe {
};
/// A recipe for handling GEP instructions.
-class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
+class VPWidenGEPRecipe : public VPSingleDefRecipeWithIRFlags {
bool isPointerLoopInvariant() const {
return getOperand(0)->isDefinedOutsideLoopRegions();
}
@@ -1873,7 +1894,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
public:
template <typename IterT>
VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
- : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {}
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {}
~VPWidenGEPRecipe() override = default;
@@ -1903,16 +1924,16 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// A recipe to compute the pointers for widened memory accesses of IndexTy
/// in reverse order.
-class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags,
+class VPReverseVectorPointerRecipe : public VPSingleDefRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Type *IndexedTy;
public:
VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
bool IsInBounds, DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC,
- ArrayRef<VPValue *>({Ptr, VF}),
- GEPFlagsTy(IsInBounds), DL),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC,
+ ArrayRef<VPValue *>({Ptr, VF}),
+ GEPFlagsTy(IsInBounds), DL),
IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC)
@@ -1956,15 +1977,16 @@ class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags,
};
/// A recipe to compute the pointers for widened memory accesses of IndexTy.
-class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
+class VPVectorPointerRecipe : public VPSingleDefRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;
public:
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsInBounds,
DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
- GEPFlagsTy(IsInBounds), DL),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPVectorPointerSC,
+ ArrayRef<VPValue *>(Ptr),
+ GEPFlagsTy(IsInBounds), DL),
IndexedTy(IndexedTy) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -2657,7 +2679,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
/// uniform only one copy, per lane zero, will be generated.
-class VPReplicateRecipe : public VPRecipeWithIRFlags {
+class VPReplicateRecipe : public VPSingleDefRecipeWithIRFlags {
/// Indicator if only a single replica per lane is needed.
bool IsUniform;
@@ -2668,7 +2690,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags {
template <typename IterT>
VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
bool IsUniform, VPValue *Mask = nullptr)
- : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
IsUniform(IsUniform), IsPredicated(Mask) {
if (Mask)
addOperand(Mask);
@@ -3356,15 +3378,15 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
-class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
+class VPScalarIVStepsRecipe : public VPSingleDefRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Instruction::BinaryOps InductionOpcode;
public:
VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step,
Instruction::BinaryOps Opcode, FastMathFlags FMFs)
- : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
- ArrayRef<VPValue *>({IV, Step}), FMFs),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
+ ArrayRef<VPValue *>({IV, Step}), FMFs),
InductionOpcode(Opcode) {}
VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ef2ca9af7268d1..0a84166093be08 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -291,7 +291,7 @@ InstructionCost VPRecipeBase::computeCost(ElementCount VF,
llvm_unreachable("subclasses should implement computeCost");
}
-FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
+FastMathFlags VPRecipeIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
FastMathFlags Res;
@@ -327,8 +327,8 @@ unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
VPValue *A, VPValue *B, DebugLoc DL,
const Twine &Name)
- : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
- Pred, DL),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC,
+ ArrayRef<VPValue *>({A, B}), Pred, DL),
Opcode(Opcode), Name(Name.str()) {
assert(Opcode == Instruction::ICmp &&
"only ICmp predicates supported at the moment");
@@ -337,7 +337,7 @@ VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
VPInstruction::VPInstruction(unsigned Opcode,
std::initializer_list<VPValue *> Operands,
FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
- : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
+ : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
Opcode(Opcode), Name(Name.str()) {
// Make sure the VPInstruction is a floating-point operation.
assert(isFPMathOp() && "this op can't take fast-math flags");
@@ -807,7 +807,10 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
}
printFlags(O);
- printOperands(O, SlotTracker);
+ if (getNumOperands() > 0) {
+ O << " ";
+ printOperands(O, SlotTracker);
+ }
if (auto DL = getDebugLoc()) {
O << ", !dbg ";
@@ -1039,7 +1042,7 @@ void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
O << "call";
printFlags(O);
- O << getIntrinsicName() << "(";
+ O << " " << getIntrinsicName() << "(";
interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
@@ -1207,8 +1210,7 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
{TTI::OK_AnyValue, TTI::OP_None}, SI);
}
-VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
- const FastMathFlags &FMF) {
+VPRecipeIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
AllowReassoc = FMF.allowReassoc();
NoNaNs = FMF.noNaNs();
NoInfs = FMF.noInfs();
@@ -1219,7 +1221,7 @@ VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
+void VPRecipeIRFlags::printFlags(raw_ostream &O) const {
switch (OpType) {
case OperationType::Cmp:
O << " " << CmpInst::getPredicateName(getPredicate());
@@ -1252,8 +1254,6 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
case OperationType::Other:
break;
}
- if (getNumOperands() > 0)
- O << " ";
}
#endif
@@ -1460,7 +1460,10 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
- printOperands(O, SlotTracker);
+ if (getNumOperands() > 0) {
+ O << " ";
+ printOperands(O, SlotTracker);
+ }
}
void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
@@ -1469,7 +1472,10 @@ void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = vp." << Instruction::getOpcodeName(getOpcode());
printFlags(O);
- printOperands(O, SlotTracker);
+ if (getNumOperands() > 0) {
+ O << " ";
+ printOperands(O, SlotTracker);
+ }
}
#endif
@@ -1547,9 +1553,12 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CAST ";
printAsOperand(O, SlotTracker);
- O << " = " << Instruction::getOpcodeName(Opcode) << " ";
+ O << " = " << Instruction::getOpcodeName(Opcode);
printFlags(O);
- printOperands(O, SlotTracker);
+ if (getNumOperands() > 0) {
+ O << " ";
+ printOperands(O, SlotTracker);
+ }
O << " to " << *getResultType();
}
#endif
@@ -1938,6 +1947,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
printAsOperand(O, SlotTracker);
O << " = getelementptr";
printFlags(O);
+ O << " ";
printOperands(O, SlotTracker);
}
#endif
@@ -2293,7 +2303,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
O << "call";
printFlags(O);
- O << "@" << CB->getCalledFunction()->getName() << "(";
+ O << " @" << CB->getCalledFunction()->getName() << "(";
interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
@@ -2302,7 +2312,10 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
} else {
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
printFlags(O);
- printOperands(O, SlotTracker);
+ if (getNumOperands() > 0) {
+ O << " ";
+ printOperands(O, SlotTracker);
+ }
}
if (shouldPack())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b9ab8a8fe60107..9b30cfa17a3bca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -934,8 +934,9 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
continue;
for (VPUser *U : collectUsersRecursively(PhiR))
- if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
- RecWithFlags->dropPoisonGeneratingFlags();
+ if (auto *R = dyn_cast<VPRecipeBase>(U)) {
+ if (auto *IRFlags = R->getIRFlags())
+ IRFlags->dropPoisonGeneratingFlags();
}
}
}
@@ -1180,8 +1181,8 @@ void VPlanTransforms::truncateToMinimalBitwidths(
// Any wrapping introduced by shrinking this operation shouldn't be
// considered undefined behavior. So, we can't unconditionally copy
// arithmetic wrapping flags to VPW.
- if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
- VPW->dropPoisonGeneratingFlags();
+ if (auto *Flags = R.getIRFlags())
+ Flags->dropPoisonGeneratingFlags();
using namespace llvm::VPlanPatternMatch;
if (OldResSizeInBits != NewResSizeInBits &&
@@ -1649,7 +1650,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// This recipe contributes to the address computation of a widen
// load/store. If the underlying instruction has poison-generating flags,
// drop them directly.
- if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
+ if (auto *Flags = CurRec->getIRFlags()) {
VPValue *A, *B;
using namespace llvm::VPlanPatternMatch;
// Dropping disjoint from an OR may yield incorrect results, as some
@@ -1657,25 +1658,25 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// for dependence analysis). Instead, replace it with an equivalent Add.
// This is possible as all users of the disjoint OR only access lanes
// where the operands are disjoint or poison otherwise.
- if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
- RecWithFlags->isDisjoint()) {
- VPBuilder Builder(RecWithFlags);
+ if (match(CurRec, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
+ Flags->isDisjoint()) {
+ VPValue *OldValue = CurRec->getVPSingleValue();
+ VPBuilder Builder(CurRec);
VPInstruction *New = Builder.createOverflowingOp(
- Instruction::Add, {A, B}, {false, false},
- RecWithFlags->getDebugLoc());
- New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
- RecWithFlags->replaceAllUsesWith(New);
- RecWithFlags->eraseFromParent();
+ Instruction::Add, {A, B}, {false, false}, CurRec->getDebugLoc());
+ New->setUnderlyingValue(OldValue->getUnderlyingValue());
+ OldValue->replaceAllUsesWith(New);
+ CurRec->eraseFromParent();
CurRec = New;
} else
- RecWithFlags->dropPoisonGeneratingFlags();
+ Flags->dropPoisonGeneratingFlags();
} else {
Instruction *Instr = dyn_cast_or_null<Instruction>(
CurRec->getVPSingleValue()->getUnderlyingValue());
(void)Instr;
assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
"found instruction with poison generating flags not covered by "
- "VPRecipeWithIRFlags");
+ "without VPRecipeIRFlags");
}
// Add new definitions to the worklist.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
index 6257d3325f9796..7585a005b1117f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll
@@ -69,7 +69,7 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]]
; CHECK-NEXT: [[VECP_IDX:vp.*]] = vector-pointer [[GEP_IDX]]
; CHECK-NEXT: WIDEN [[IDX:.*]] = load [[VECP_IDX]]
-; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext  [[IDX]] to i64
+; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64
; CHECK-NEXT: WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]]
; CHECK-NEXT: WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1>
; CHECK-NEXT: EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]]
From 0f551b02f6f48761677e6c28e434a9c978477a17 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 15 Oct 2024 15:50:03 +0000
Subject: [PATCH 2/3] Initial changes picked over from #109833
---
llvm/include/llvm/Analysis/VectorUtils.h | 15 +---
llvm/include/llvm/IR/VectorUtils.h | 53 ++++++++++++++
llvm/lib/Analysis/VectorUtils.cpp | 14 ++++
llvm/lib/IR/CMakeLists.txt | 1 +
llvm/lib/IR/VFABIDemangler.cpp | 18 +++--
llvm/lib/IR/VectorUtils.cpp | 69 +++++++++++++++++++
.../Vectorize/LoopVectorizationLegality.cpp | 4 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 46 +++++++------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
9 files changed, 180 insertions(+), 42 deletions(-)
create mode 100644 llvm/include/llvm/IR/VectorUtils.h
create mode 100644 llvm/lib/IR/VectorUtils.cpp
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 467d5932cacf91..1f8c583d020d15 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -18,6 +18,7 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/VFABIDemangler.h"
+#include "llvm/IR/VectorUtils.h"
#include "llvm/Support/CheckedArithmetic.h"
namespace llvm {
@@ -127,18 +128,8 @@ namespace Intrinsic {
typedef unsigned ID;
}
-/// A helper function for converting Scalar types to vector types. If
-/// the incoming type is void, we return void. If the EC represents a
-/// scalar, we return the scalar type.
-inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
- if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
- return Scalar;
- return VectorType::get(Scalar, EC);
-}
-
-inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
- return ToVectorTy(Scalar, ElementCount::getFixed(VF));
-}
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool canWidenType(Type *Ty);
/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all scalars
diff --git a/llvm/include/llvm/IR/VectorUtils.h b/llvm/include/llvm/IR/VectorUtils.h
new file mode 100644
index 00000000000000..e8e838d8287c42
--- /dev/null
+++ b/llvm/include/llvm/IR/VectorUtils.h
@@ -0,0 +1,53 @@
+//===----------- VectorUtils.h - Vector type utility functions -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DerivedTypes.h"
+
+namespace llvm {
+
+/// A helper function for converting Scalar types to vector types. If
+/// the incoming type is void, we return void. If the EC represents a
+/// scalar, we return the scalar type.
+inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
+ if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
+ return Scalar;
+ return VectorType::get(Scalar, EC);
+}
+
+inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
+ return ToVectorTy(Scalar, ElementCount::getFixed(VF));
+}
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note: Only
+/// unpacked literal struct types are supported.
+Type *ToWideTy(Type *Ty, ElementCount EC);
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *ToNarrowTy(Type *Ty);
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> getContainedTypes(Type *Ty);
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+bool isWideTy(Type *Ty);
+
+/// Returns the vectorization factor for a widened type.
+inline ElementCount getWideTypeVF(Type *Ty) {
+ assert(isWideTy(Ty) && "expected widened type!");
+ return cast<VectorType>(getContainedTypes(Ty).front())->getElementCount();
+}
+
+} // namespace llvm
+
+#endif // LLVM_IR_VECTORUTILS_H
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 15e325a0fffca5..5d59de6a907169 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -39,6 +39,20 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
cl::desc("Maximum factor for an interleaved access group (default = 8)"),
cl::init(8));
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool llvm::canWidenType(Type *Ty) {
+ Type *ElTy = Ty;
+ // For now, only allow widening non-packed literal structs where all
+ // element types are the same. This simplifies the cost model and
+ // conversion between scalar and wide types.
+ if (auto *StructTy = dyn_cast<StructType>(Ty);
+ StructTy && !StructTy->isPacked() && StructTy->isLiteral() &&
+ StructTy->containsHomogeneousTypes()) {
+ ElTy = StructTy->elements().front();
+ }
+ return VectorType::isValidElementType(ElTy);
+}
+
/// Return true if all of the intrinsic's arguments and return type are scalars
/// for the scalar form of the intrinsic, and vectors for the vector form of the
/// intrinsic (except operands that are marked as always being scalar by
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 544f4ea9223d0e..7eaf35e10ebc67 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -73,6 +73,7 @@ add_llvm_component_library(LLVMCore
Value.cpp
ValueSymbolTable.cpp
VectorBuilder.cpp
+ VectorUtils.cpp
Verifier.cpp
VFABIDemangler.cpp
RuntimeLibcalls.cpp
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index cdfb9fbfaa084d..6ccd77fd23793a 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/VectorUtils.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <limits>
@@ -346,12 +347,15 @@ getScalableECFromSignature(const FunctionType *Signature, const VFISAKind ISA,
// Also check the return type if not void.
Type *RetTy = Signature->getReturnType();
if (!RetTy->isVoidTy()) {
- std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
- // If we have an unknown scalar element type we can't find a reasonable VF.
- if (!ReturnEC)
- return std::nullopt;
- if (ElementCount::isKnownLT(*ReturnEC, MinEC))
- MinEC = *ReturnEC;
+ for (Type *RetTy : getContainedTypes(RetTy)) {
+ std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
+ // If we have an unknown scalar element type we can't find a reasonable
+ // VF.
+ if (!ReturnEC)
+ return std::nullopt;
+ if (ElementCount::isKnownLT(*ReturnEC, MinEC))
+ MinEC = *ReturnEC;
+ }
}
// The SVE Vector function call ABI bases the VF on the widest element types
@@ -566,7 +570,7 @@ FunctionType *VFABI::createFunctionType(const VFInfo &Info,
auto *RetTy = ScalarFTy->getReturnType();
if (!RetTy->isVoidTy())
- RetTy = VectorType::get(RetTy, VF);
+ RetTy = ToWideTy(RetTy, VF);
return FunctionType::get(RetTy, VecTypes, false);
}
diff --git a/llvm/lib/IR/VectorUtils.cpp b/llvm/lib/IR/VectorUtils.cpp
new file mode 100644
index 00000000000000..c89a8eaf2ad1e0
--- /dev/null
+++ b/llvm/lib/IR/VectorUtils.cpp
@@ -0,0 +1,69 @@
+//===----------- VectorUtils.cpp - Vector type utility functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/VectorUtils.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+
+using namespace llvm;
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note: Only
+/// unpacked literal struct types are supported.
+Type *llvm::ToWideTy(Type *Ty, ElementCount EC) {
+ if (EC.isScalar())
+ return Ty;
+ auto *StructTy = dyn_cast<StructType>(Ty);
+ if (!StructTy)
+ return ToVectorTy(Ty, EC);
+ assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+ "expected unpacked struct literal");
+ return StructType::get(
+ Ty->getContext(),
+ map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+ return VectorType::get(ElTy, EC);
+ }));
+}
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *llvm::ToNarrowTy(Type *Ty) {
+ auto *StructTy = dyn_cast<StructType>(Ty);
+ if (!StructTy)
+ return Ty->getScalarType();
+ assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+ "expected unpacked struct literal");
+ return StructType::get(
+ Ty->getContext(),
+ map_to_vector(StructTy->elements(), [](Type *ElTy) -> Type * {
+ return ElTy->getScalarType();
+ }));
+}
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> llvm::getContainedTypes(Type *Ty) {
+ auto *StructTy = dyn_cast<StructType>(Ty);
+ if (StructTy)
+ return to_vector<2>(StructTy->elements());
+ return {Ty};
+}
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+bool llvm::isWideTy(Type *Ty) {
+ auto ContainedTys = getContainedTypes(Ty);
+ if (ContainedTys.empty() || !ContainedTys.front()->isVectorTy())
+ return false;
+ ElementCount VF = cast<VectorType>(ContainedTys.front())->getElementCount();
+ return all_of(ContainedTys, [&](Type *Ty) {
+ return Ty->isVectorTy() && cast<VectorType>(Ty)->getElementCount() == VF;
+ });
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index f1568781252c06..0ad4766b1a6243 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -946,8 +946,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Check that the instruction return type is vectorizable.
// We can't vectorize casts from vector type to scalar type.
// Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(I.getType()) &&
- !I.getType()->isVoidTy()) ||
+ Type *InstTy = I.getType();
+ if (!(InstTy->isVoidTy() || canWidenType(InstTy)) ||
(isa<CastInst>(I) &&
!VectorType::isValidElementType(I.getOperand(0)->getType())) ||
isa<ExtractElementInst>(I)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ebc62f9843905..ac252f61d5e40b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2908,10 +2908,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
return ScalarCallCost;
}
-static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
- if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
- return Elt;
- return VectorType::get(Elt, VF);
+static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
+ if (VF.isScalar() || !canWidenType(Ty))
+ return Ty;
+ return ToWideTy(Ty, VF);
}
InstructionCost
@@ -3675,9 +3675,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// ExtractValue instructions must be uniform, because the operands are
// known to be loop-invariant.
- if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
- assert(IsOutOfScope(EVI->getAggregateOperand()) &&
- "Expected aggregate value to be loop invariant");
+ if (auto *EVI = dyn_cast<ExtractValueInst>(&I);
+ EVI && IsOutOfScope(EVI->getAggregateOperand())) {
AddToWorklistIfAllowed(EVI);
continue;
}
@@ -5504,10 +5503,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// and phi nodes.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
- ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ Type *WideTy = ToWideTy(I->getType(), VF);
+ for (Type *VectorTy : getContainedTypes(WideTy)) {
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ }
ScalarCost +=
VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
@@ -5998,13 +6000,17 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
return 0;
InstructionCost Cost = 0;
- Type *RetTy = ToVectorTy(I->getType(), VF);
+ Type *RetTy = ToWideTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
- (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
- Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
- /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
+
+ for (Type *VectorTy : getContainedTypes(RetTy)) {
+ Cost += TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+ /*Insert*/ true,
+ /*Extract*/ false, CostKind);
+ }
+ }
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6264,9 +6270,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
bool MaskRequired = Legal->isMaskRequired(CI);
// Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ Type *RetTy = ToWideTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
+ Tys.push_back(ToWideTy(ScalarTy, VF));
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
@@ -6456,7 +6462,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
HasSingleCopyAfterVectorization(I, VF));
VectorTy = RetTy;
} else
- VectorTy = ToVectorTy(RetTy, VF);
+ VectorTy = ToWideTy(RetTy, VF);
if (VF.isVector() && VectorTy->isVectorTy() &&
!TTI.getNumberOfParts(VectorTy))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0a84166093be08..79b8dee27c7551 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1003,7 +1003,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF);
+ Type *RetTy = ToWideTy(Ctx.Types.inferScalarType(this), VF);
SmallVector<Type *> ParamTys;
for (unsigned I = 0; I != getNumOperands(); ++I)
ParamTys.push_back(
From 727d62d5d854a913703a4a55d835731fe7145e27 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 12 Nov 2024 18:37:38 +0000
Subject: [PATCH 3/3] [LV] Add initial support for vectorizing literal struct
return values
This patch adds initial support for vectorizing literal struct return
values. Currently, this is limited to the case where the struct is
homogeneous (all elements have the same type) and not packed.
Additionally, all users of the struct must be `extractvalue` operations.
The intended use case for this is vectorizing intrinsics such as:
```
declare { float, float } @llvm.sincos.f32(float %x)
```
Mapping them to structure-returning library calls such as:
```
declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>)
```
It could also be possible to vectorize the intrinsic (without a libcall)
and then later lower the intrinsic to a library call. This may be
desired if the only library calls available take output pointers rather
than return multiple values. Note that this is not yet supported, as
`VPWidenIntrinsicRecipe` needs to be updated to support struct types
too (though this should be a relatively small change after this patch).
To represent struct return types in VPlan, the `VPWidenCallRecipe` and
`VPReplicateRecipe` have been updated to support multiple definitions.
This allows these recipes to create a new value per element of the
struct value, which may help when costing the extractions.
Before this change, there were quite a few assumptions (especially for
replication) that recipes that map to LLVM-IR instructions are
single-def. Where possible, these have been fixed, but several
transforms do not yet support multiple definitions.
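As a sketch of the replicated (scalarized) case (hypothetical `@foo` and value names; VF = 2): each lane's scalar call is cloned, its struct result is split with `extractvalue`, and, when packing is needed, each element is inserted into its own vector:
```
; Lane 0:
%r0 = call { float, float } @foo(float %x0)
%r0.0 = extractvalue { float, float } %r0, 0
%r0.1 = extractvalue { float, float } %r0, 1
%pack.0 = insertelement <2 x float> poison, float %r0.0, i32 0
%pack.1 = insertelement <2 x float> poison, float %r0.1, i32 0
; Lane 1 fills element 1 of %pack.0 and %pack.1 with its extracted values.
```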
---
.../Vectorize/LoopVectorizationLegality.cpp | 19 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 88 +++++--
.../Transforms/Vectorize/VPRecipeBuilder.h | 32 ++-
llvm/lib/Transforms/Vectorize/VPlan.h | 61 +++--
.../Transforms/Vectorize/VPlanAnalysis.cpp | 31 ++-
llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 4 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 43 ++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 59 +++--
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 23 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +-
.../AArch64/scalable-struct-return.ll | 109 ++++++++
.../Transforms/LoopVectorize/struct-return.ll | 244 ++++++++++++++++++
.../LoopVectorize/vplan-dot-printing.ll | 4 +-
.../vplan-widen-struct-return.ll | 112 ++++++++
.../Transforms/Vectorize/VPlanTest.cpp | 2 +-
15 files changed, 726 insertions(+), 107 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/struct-return.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0ad4766b1a6243..ccc99eada55674 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -943,11 +943,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (CI && !VFDatabase::getMappings(*CI).empty())
VecCallVariantsFound = true;
+ // TODO: Tidy up these checks.
+ auto canWidenInst = [](Instruction &I) {
+ Type *InstTy = I.getType();
+ if (isa<CallInst>(I) && isa<StructType>(InstTy) &&
+ canWidenType(InstTy)) {
+ // We can only widen struct calls where the users are extractvalues.
+ for (auto &U : I.uses()) {
+ if (!isa<ExtractValueInst>(U.getUser()))
+ return false;
+ }
+ return true;
+ }
+ return VectorType::isValidElementType(InstTy) || InstTy->isVoidTy();
+ };
+
// Check that the instruction return type is vectorizable.
// We can't vectorize casts from vector type to scalar type.
// Also, we can't vectorize extractelement instructions.
- Type *InstTy = I.getType();
- if (!(InstTy->isVoidTy() || canWidenType(InstTy)) ||
+ // TODO: Tidy up these checks.
+ if (!canWidenInst(I) ||
(isa<CastInst>(I) &&
!VectorType::isValidElementType(I.getOperand(0)->getType())) ||
isa<ExtractElementInst>(I)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac252f61d5e40b..6a21f976da3a47 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2325,7 +2325,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPLane &Lane,
VPTransformState &State) {
- assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ assert((!Instr->getType()->isAggregateType() ||
+ canWidenType(Instr->getType())) &&
+ "Expected widenable or non-aggregate type.");
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2336,8 +2338,18 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
#if !defined(NDEBUG)
// Verify that VPlan type inference results agree with the type of the
// generated values.
- assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
- "inferred type and type from generated instructions do not match");
+ if (auto *StructTy = dyn_cast<StructType>(Instr->getType())) {
+ for (auto [I, VPV] : enumerate(RepRecipe->definedValues())) {
+ assert(
+ State.TypeAnalysis.inferScalarType(VPV) ==
+ StructTy->getTypeAtIndex(I) &&
+ "inferred type and type from generated instructions do not match");
+ }
+ } else {
+ assert(State.TypeAnalysis.inferScalarType(
+ RepRecipe->getVPSingleValue()) == Cloned->getType() &&
+ "inferred type and type from generated instructions do not match");
+ }
#endif
}
@@ -2360,7 +2372,14 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
// Place the cloned scalar in the new loop.
State.Builder.Insert(Cloned);
- State.set(RepRecipe, Cloned, Lane);
+ if (isa<StructType>(Instr->getType())) {
+ for (auto [I, Def] : enumerate(RepRecipe->definedValues())) {
+ Value *AggV = State.Builder.CreateExtractValue(Cloned, I);
+ State.set(Def, AggV, Lane);
+ }
+ } else {
+ State.set(RepRecipe->getVPSingleValue(), Cloned, Lane);
+ }
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -7427,6 +7446,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
return &WidenMem->getIngredient();
+ if (auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R))
+ return WidenCall->getUnderlyingCallInstruction();
+ if (auto *Rep = dyn_cast<VPReplicateRecipe>(R))
+ return Rep->getUnderlyingInstr();
return nullptr;
};
@@ -8404,9 +8427,9 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
return new VPBlendRecipe(Phi, OperandsWithMask);
}
-VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
- ArrayRef<VPValue *> Operands,
- VFRange &Range) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
+ ArrayRef<VPValue *> Operands,
+ VFRange &Range) {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -9113,6 +9136,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
Instruction *Instr = &I;
+
+ // Special case: Handle extractvalues from struct ret calls.
+ if (auto *ExtractValue = dyn_cast<ExtractValueInst>(Instr)) {
+ if (auto *CI =
+ dyn_cast<CallInst>(ExtractValue->getAggregateOperand())) {
+          auto *R = RecipeBuilder.getRecipe(CI);
+          assert(R->getNumDefinedValues() ==
+                     cast<StructType>(CI->getType())->getNumElements() &&
+                 "expected one definition per struct element");
+ unsigned Idx = ExtractValue->getIndices()[0];
+ RecipeBuilder.setRecipe(Instr, R->getVPValue(Idx));
+ continue;
+ }
+ }
+
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == HeaderBB) {
@@ -9324,12 +9361,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
"AnyOf reductions are not allowed for in-loop reductions");
// Collect the chain of "link" recipes for the reduction starting at PhiR.
- SetVector<VPSingleDefRecipe *> Worklist;
+ SetVector<VPRecipeBase *> Worklist;
Worklist.insert(PhiR);
for (unsigned I = 0; I != Worklist.size(); ++I) {
- VPSingleDefRecipe *Cur = Worklist[I];
- for (VPUser *U : Cur->users()) {
- auto *UserRecipe = cast<VPSingleDefRecipe>(U);
+ VPRecipeBase *Cur = Worklist[I];
+ // TODO: This may need to handle recipes with multiple defs.
+ for (VPUser *U : Cur->getVPSingleValue()->users()) {
+ auto *UserRecipe = cast<VPRecipeBase>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
assert((UserRecipe->getParent() == MiddleVPBB ||
UserRecipe->getParent() == Plan->getScalarPreheader()) &&
@@ -9349,8 +9387,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// get folded to their non-phi operand, as the reduction recipe handles the
// condition directly.
VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
- for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
- Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
+ for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) {
+ VPValue *CurrentLinkV = CurrentLink->getVPSingleValue();
+ Instruction *CurrentLinkI =
+ cast<Instruction>(CurrentLinkV->getUnderlyingValue());
// Index of the first operand which holds a non-mask vector operand.
unsigned IndexOfFirstOperand;
@@ -9433,7 +9473,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// Note that this transformation may leave over dead recipes (including
// CurrentLink), which will be cleaned by a later VPlan transform.
LinkVPBB->appendRecipe(RedRecipe);
- CurrentLink->replaceAllUsesWith(RedRecipe);
+ CurrentLinkV->replaceAllUsesWith(RedRecipe);
PreviousLink = RedRecipe;
}
}
@@ -9584,16 +9624,18 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
"uniform recipe shouldn't be predicated");
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
State.ILV->scalarizeInstruction(UI, this, *State.Lane, State);
- // Insert scalar instance packing it into a vector.
- if (State.VF.isVector() && shouldPack()) {
- // If we're constructing lane 0, initialize to start from poison.
- if (State.Lane->isFirstLane()) {
- assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
- Value *Poison = PoisonValue::get(
- VectorType::get(UI->getType(), State.VF));
- State.set(this, Poison);
+ for (auto [I, Def] : enumerate(definedValues())) {
+ // Insert scalar instance packing it into a vector.
+ if (State.VF.isVector() && shouldPack(I)) {
+ // If we're constructing lane 0, initialize to start from poison.
+ if (State.Lane->isFirstLane()) {
+ assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
+ Value *Poison = PoisonValue::get(
+ VectorType::get(getContainedTypes(UI->getType())[I], State.VF));
+ State.set(Def, Poison);
+ }
+ State.packScalarIntoVectorValue(Def, *State.Lane);
}
- State.packScalarIntoVectorValue(this, *State.Lane);
}
return;
}
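When a predicated replicate feeds widened users, each defined value is now packed into its own vector, field by field. Roughly, at VF=2 (a sketch; the lane value names are hypothetical):
```
; Packing field 0 of two scalarized lanes into a single <2 x float>.
%a0 = extractvalue { float, float } %lane0.call, 0
%tmp = insertelement <2 x float> poison, float %a0, i64 0
%a1 = extractvalue { float, float } %lane1.call, 0
%vec.a = insertelement <2 x float> %tmp, float %a1, i64 1
; ...and likewise for field 1.
```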
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 5d4a3b555981ce..1e37386714d2ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -54,9 +54,11 @@ class VPRecipeBuilder {
EdgeMaskCacheTy EdgeMaskCache;
BlockMaskCacheTy BlockMaskCache;
+ using RecipeOrResult = PointerUnion<VPRecipeBase *, VPValue *>;
+
// VPlan construction support: Hold a mapping from ingredients to
// their recipe.
- DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
+ DenseMap<Instruction *, RecipeOrResult> Ingredient2Recipe;
/// Cross-iteration reduction & first-order recurrence phis for which we need
/// to add the incoming value from the backedge after all recipes have been
@@ -95,8 +97,8 @@ class VPRecipeBuilder {
/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
/// decreased to ensure same decision from \p Range.Start to \p Range.End.
- VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ VPRecipeBase *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
+ VFRange &Range);
/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
/// if it can. The function should only be called if the cost-model indicates
@@ -132,6 +134,15 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
+ /// Set the recipe result for a given ingredient.
+ void setRecipe(Instruction *I, VPValue *RecipeResult) {
+ assert(!Ingredient2Recipe.contains(I) &&
+ "Cannot reset recipe for instruction.");
+ assert(RecipeResult->getDefiningRecipe() &&
+ "Value must be defined by a recipe.");
+ Ingredient2Recipe[I] = RecipeResult;
+ }
+
/// Create the mask for the vector loop header block.
void createHeaderMask();
@@ -158,9 +169,11 @@ class VPRecipeBuilder {
VPRecipeBase *getRecipe(Instruction *I) {
assert(Ingredient2Recipe.count(I) &&
"Recording this ingredients recipe was not requested");
- assert(Ingredient2Recipe[I] != nullptr &&
- "Ingredient doesn't have a recipe");
- return Ingredient2Recipe[I];
+ assert(Ingredient2Recipe[I] && "Ingredient doesn't have a recipe");
+ auto RecipeInfo = Ingredient2Recipe[I];
+ if (auto *R = dyn_cast<VPRecipeBase *>(RecipeInfo))
+ return R;
+ return cast<VPValue *>(RecipeInfo)->getDefiningRecipe();
}
/// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as
@@ -179,8 +192,11 @@ class VPRecipeBuilder {
VPValue *getVPValueOrAddLiveIn(Value *V) {
if (auto *I = dyn_cast<Instruction>(V)) {
- if (auto *R = Ingredient2Recipe.lookup(I))
- return R->getVPSingleValue();
+ if (auto RecipeInfo = Ingredient2Recipe.lookup(I)) {
+ if (auto *R = dyn_cast<VPRecipeBase *>(RecipeInfo))
+ return R->getVPSingleValue();
+ return cast<VPValue *>(RecipeInfo);
+ }
}
return Plan.getOrAddLiveIn(V);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index b77b9e65ab69cc..ac34ce7cc73ad1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -62,6 +62,7 @@ class Type;
class VPBasicBlock;
class VPRegionBlock;
class VPlan;
+class VPRecipeIRFlags;
class VPReplicateRecipe;
class VPlanSlp;
class Value;
@@ -700,8 +701,6 @@ struct VPCostContext {
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
};
-class VPRecipeIRFlags;
-
/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
@@ -866,11 +865,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
- case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
case VPRecipeBase::VPReverseVectorPointerSC:
- case VPRecipeBase::VPWidenCallSC:
case VPRecipeBase::VPWidenCanonicalIVSC:
case VPRecipeBase::VPWidenCastSC:
case VPRecipeBase::VPWidenGEPSC:
@@ -892,6 +889,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenCallSC:
+ case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1740,28 +1739,58 @@ class VPWidenIntrinsicRecipe : public VPSingleDefRecipeWithIRFlags {
bool onlyFirstLaneUsed(const VPValue *Op) const override;
};
+/// A base class to define recipes with multiple results and IR flags.
+class VPMultipleDefRecipeWithIRFlags : public VPRecipeBase,
+ public VPRecipeIRFlags {
+public:
+ using VPRecipeBase::getIRFlags;
+
+ template <typename IterT>
+ VPMultipleDefRecipeWithIRFlags(const unsigned char SC, Instruction *I,
+ IterT Operands, unsigned NumDefs,
+ DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL), VPRecipeIRFlags(*I) {
+ assert(NumDefs >= 1 && "Expected at least one defined value.");
+ for (unsigned Def = 0; Def < NumDefs; ++Def)
+ new VPValue(I, this);
+ }
+
+ Instruction *getUnderlyingInstr() const {
+ return cast<Instruction>(getVPValue(0)->getUnderlyingValue());
+ }
+
+ const VPRecipeIRFlags *getIRFlags() const override {
+ return static_cast<const VPRecipeIRFlags *>(this);
+ }
+};
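The reason for keeping `VPRecipeIRFlags` reachable from a multi-def recipe is that calls returning a homogeneous struct of floats can still carry IR flags, for example fast-math flags:
```
%call = call fast { float, float } @llvm.sincos.f32(float %x)
```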
+
/// A recipe for widening Call instructions using library calls.
-class VPWidenCallRecipe : public VPSingleDefRecipeWithIRFlags {
+class VPWidenCallRecipe : public VPMultipleDefRecipeWithIRFlags {
/// Variant stores a pointer to the chosen function. There is a 1:1 mapping
/// between a given VF and the chosen vectorized variant, so there will be a
/// different VPlan for each VF with a valid variant.
Function *Variant;
public:
- VPWidenCallRecipe(Value *UV, Function *Variant,
+ VPWidenCallRecipe(CallInst *CI, Function *Variant,
ArrayRef<VPValue *> CallArguments, DebugLoc DL = {})
- : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments,
- *cast<Instruction>(UV)),
+ : VPMultipleDefRecipeWithIRFlags(
+ VPDef::VPWidenCallSC, CI, CallArguments,
+ /*NumDefs=*/getContainedTypes(CI->getType()).size(), DL),
Variant(Variant) {
assert(
isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
"last operand must be the called function");
}
+ CallInst *getUnderlyingCallInstruction() const {
+ return cast<CallInst>(getUnderlyingInstr());
+ }
+
~VPWidenCallRecipe() override = default;
VPWidenCallRecipe *clone() override {
- return new VPWidenCallRecipe(getUnderlyingValue(), Variant,
+ return new VPWidenCallRecipe(getUnderlyingCallInstruction(), Variant,
{op_begin(), op_end()}, getDebugLoc());
}
@@ -2679,7 +2708,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
/// uniform only one copy, per lane zero, will be generated.
-class VPReplicateRecipe : public VPSingleDefRecipeWithIRFlags {
+class VPReplicateRecipe : public VPMultipleDefRecipeWithIRFlags {
/// Indicator if only a single replica per lane is needed.
bool IsUniform;
@@ -2690,7 +2719,9 @@ class VPReplicateRecipe : public VPSingleDefRecipeWithIRFlags {
template <typename IterT>
VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
bool IsUniform, VPValue *Mask = nullptr)
- : VPSingleDefRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I),
+ : VPMultipleDefRecipeWithIRFlags(
+ VPDef::VPReplicateSC, I, Operands,
+ /*NumDefs=*/getContainedTypes(I->getType()).size()),
IsUniform(IsUniform), IsPredicated(Mask) {
if (Mask)
addOperand(Mask);
@@ -2741,10 +2772,10 @@ class VPReplicateRecipe : public VPSingleDefRecipeWithIRFlags {
return true;
}
- /// Returns true if the recipe is used by a widened recipe via an intervening
- /// VPPredInstPHIRecipe. In this case, the scalar values should also be packed
- /// in a vector.
- bool shouldPack() const;
+ /// Returns true if the recipe value at index \p I is used by a widened recipe
+ /// via an intervening VPPredInstPHIRecipe. In this case, the scalar values
+ /// should also be packed in a vector.
+ bool shouldPack(unsigned I) const;
/// Return the mask of a predicated VPReplicateRecipe.
VPValue *getMask() {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8b8ab6be99b0d5..a2ba2fe19ae89a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -136,9 +136,21 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
llvm_unreachable("Unhandled opcode!");
}
-Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
- auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
- return CI.getType();
+/// Maps a VPlan value to an IR type. Note: This handles mapping struct types,
+/// which are represented as multiple VPlan values.
+static Type *getTypeForVPValue(const VPRecipeBase *R, const VPValue *V,
+ Type *STy) {
+ for (auto [I, Ty] : enumerate(getContainedTypes(STy))) {
+ if (R->getVPValue(I) == V)
+ return Ty;
+ }
+ llvm_unreachable("Unexpected value!");
+}
+
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R,
+ const VPValue *V) {
+ auto &CI = *cast<CallInst>(R->getUnderlyingCallInstruction());
+ return getTypeForVPValue(R, V, CI.getType());
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
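A worked illustration of `getTypeForVPValue` for the homogeneous case this patch supports:
```
; For a recipe defining the two results of
;   %call = call { float, float } @foo(float %x)
; getContainedTypes({ float, float }) yields (float, float), so inferScalarType
; returns float for result 0 (ir<%call>) and float for result 1 (ir<%call>.1).
```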
@@ -156,12 +168,14 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenSelectRecipe *R) {
return ResTy;
}
-Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R,
+ const VPValue *V) {
switch (R->getUnderlyingInstr()->getOpcode()) {
case Instruction::Call: {
unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 2 : 1);
- return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
- ->getReturnType();
+ Type *RetTy = cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
+ ->getReturnType();
+ return getTypeForVPValue(R, V, RetTy);
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -268,12 +282,13 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
- VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
- VPWidenSelectRecipe>(
+ VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPWidenIntrinsicRecipe>([](const VPWidenIntrinsicRecipe *R) {
return R->getResultType();
})
+ .Case<VPWidenCallRecipe, VPReplicateRecipe>(
+ [&](const auto *R) { return inferScalarTypeForRecipe(R, V); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index cc21870bee2e3b..1aed9fb885ecf8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -47,12 +47,12 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPBlendRecipe *R);
Type *inferScalarTypeForRecipe(const VPInstruction *R);
- Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R, const VPValue *V);
Type *inferScalarTypeForRecipe(const VPWidenRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
- Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R, const VPValue *V);
public:
VPTypeAnalysis(Type *CanonicalIVTy)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 79b8dee27c7551..277deda00100c4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -268,6 +268,10 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
+ else if (auto *WidenCall = dyn_cast<VPWidenCallRecipe>(this))
+ UI = WidenCall->getUnderlyingCallInstruction();
+ else if (auto *Rep = dyn_cast<VPReplicateRecipe>(this))
+ UI = Rep->getUnderlyingInstr();
InstructionCost RecipeCost;
if (UI && Ctx.skipCostComputation(UI, VF.isVector())) {
@@ -889,7 +893,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(Variant != nullptr && "Can't create vector function.");
- auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
+ auto *CI = getUnderlyingCallInstruction();
SmallVector<OperandBundleDef, 1> OpBundles;
if (CI)
CI->getOperandBundlesAsDefs(OpBundles);
@@ -897,8 +901,16 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles);
setFlags(V);
- if (!V->getType()->isVoidTy())
- State.set(this, V);
+ if (!V->getType()->isVoidTy()) {
+ if (isa<StructType>(V->getType())) {
+ for (auto [I, Def] : enumerate(definedValues())) {
+ Value *AggV = State.Builder.CreateExtractValue(V, I);
+ State.set(Def, AggV);
+ }
+ } else {
+ State.set(getVPSingleValue(), V);
+ }
+ }
State.addMetadata(V, CI);
}
@@ -919,7 +931,9 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
if (CalledFn->getReturnType()->isVoidTy())
O << "void ";
else {
- printAsOperand(O, SlotTracker);
+ interleaveComma(definedValues(), O, [&O, &SlotTracker](VPValue *Def) {
+ Def->printAsOperand(O, SlotTracker);
+ });
O << " = ";
}
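With multiple definitions, the printer now emits a comma-separated result list, e.g. (from the new vplan-widen-struct-return.ll test below):
```
WIDEN-CALL ir<%call>, ir<%call>.1 = call @foo(ir<%in_val>) (using library function: fixed_vec_foo)
```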
@@ -2270,10 +2284,10 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-bool VPReplicateRecipe::shouldPack() const {
+bool VPReplicateRecipe::shouldPack(unsigned I) const {
// Find if the recipe is used by a widened recipe via an intervening
// VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
- return any_of(users(), [](const VPUser *U) {
+ return any_of(getVPValue(I)->users(), [](const VPUser *U) {
if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
return any_of(PredR->users(), [PredR](const VPUser *U) {
return !U->usesScalars(PredR);
@@ -2284,7 +2298,7 @@ bool VPReplicateRecipe::shouldPack() const {
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- Instruction *UI = cast<Instruction>(getUnderlyingValue());
+ Instruction *UI = getUnderlyingInstr();
// VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
// transform, avoid computing their cost multiple times for now.
Ctx.SkipCostComputation.insert(UI);
@@ -2297,7 +2311,9 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
- printAsOperand(O, SlotTracker);
+ interleaveComma(definedValues(), O, [&O, &SlotTracker](VPValue *Def) {
+ Def->printAsOperand(O, SlotTracker);
+ });
O << " = ";
}
if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
@@ -2318,8 +2334,15 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
}
}
- if (shouldPack())
- O << " (S->V)";
+ for (unsigned I = 0, E = getNumDefinedValues(); I != E; ++I) {
+ if (I > 0)
+ O << ',';
+ O << ' ';
+ if (shouldPack(I))
+ O << "(S->V)";
+ else
+ O << "(S->S)";
+ }
}
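Each defined value of a replicate recipe now gets its own packing marker, e.g. (from the updated tests below):
```
REPLICATE ir<%call>, ir<%call>.1 = call @foo(ir<%in_val>) (S->S), (S->S)
CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<%3> (S->S)
```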
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9b30cfa17a3bca..bb46b87dfb63cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -111,7 +111,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
bool Changed = false;
// First, collect the operands of all recipes in replicate blocks as seeds for
// sinking.
- SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
+ SetVector<std::pair<VPBasicBlock *, VPRecipeBase *>> WorkList;
for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
@@ -121,8 +121,9 @@ static bool sinkScalarOperands(VPlan &Plan) {
continue;
for (auto &Recipe : *VPBB) {
for (VPValue *Op : Recipe.operands())
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
+ // TODO: Add support for multiple-def recipes.
+ if (auto *Def = Op->getDefiningRecipe();
+ Def && Def->getNumDefinedValues() == 1)
WorkList.insert(std::make_pair(VPBB, Def));
}
}
@@ -131,15 +132,18 @@ static bool sinkScalarOperands(VPlan &Plan) {
// Try to sink each replicate or scalar IV steps recipe in the worklist.
for (unsigned I = 0; I != WorkList.size(); ++I) {
VPBasicBlock *SinkTo;
- VPSingleDefRecipe *SinkCandidate;
+ VPRecipeBase *SinkCandidate;
std::tie(SinkTo, SinkCandidate) = WorkList[I];
if (SinkCandidate->getParent() == SinkTo ||
SinkCandidate->mayHaveSideEffects() ||
SinkCandidate->mayReadOrWriteMemory())
continue;
+
+ Instruction *UI = nullptr;
if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
if (!ScalarVFOnly && RepR->isUniform())
continue;
+ UI = RepR->getUnderlyingInstr();
} else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
continue;
@@ -150,32 +154,33 @@ static bool sinkScalarOperands(VPlan &Plan) {
// SinkCandidate.
auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
SinkCandidate](VPUser *U) {
- auto *UI = cast<VPRecipeBase>(U);
- if (UI->getParent() == SinkTo)
+ auto *UR = cast<VPRecipeBase>(U);
+ if (UR->getParent() == SinkTo)
return true;
- NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
+ NeedsDuplicating =
+ UR->onlyFirstLaneUsed(SinkCandidate->getVPSingleValue());
// We only know how to duplicate VPReplicateRecipes for now.
return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
};
- if (!all_of(SinkCandidate->users(), CanSinkWithUser))
+ if (!all_of(SinkCandidate->getVPSingleValue()->users(), CanSinkWithUser))
continue;
if (NeedsDuplicating) {
if (ScalarVFOnly)
continue;
- Instruction *I = SinkCandidate->getUnderlyingInstr();
- auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
+ auto *Clone = new VPReplicateRecipe(UI, SinkCandidate->operands(), true);
// TODO: add ".cloned" suffix to name of Clone's VPValue.
Clone->insertBefore(SinkCandidate);
- SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
- return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
- });
+ SinkCandidate->getVPSingleValue()->replaceUsesWithIf(
+ Clone->getVPSingleValue(), [SinkTo](VPUser &U, unsigned) {
+ return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
+ });
}
SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
for (VPValue *Op : SinkCandidate->operands())
- if (auto *Def =
- dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
+ if (auto *Def = Op->getDefiningRecipe();
+ Def && Def->getNumDefinedValues() == 1)
WorkList.insert(std::make_pair(SinkTo, Def));
Changed = true;
}
@@ -321,10 +326,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
VPPredInstPHIRecipe *PHIRecipe = nullptr;
- if (PredRecipe->getNumUsers() != 0) {
- PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
- PredRecipe->replaceAllUsesWith(PHIRecipe);
- PHIRecipe->setOperand(0, RecipeWithoutMask);
+ if (PredRecipe->getVPSingleValue()->getNumUsers() != 0) {
+ PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask->getVPSingleValue());
+ PredRecipe->getVPSingleValue()->replaceAllUsesWith(PHIRecipe);
+ PHIRecipe->setOperand(0, RecipeWithoutMask->getVPSingleValue());
}
PredRecipe->eraseFromParent();
auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
@@ -344,7 +349,8 @@ static void addReplicateRegions(VPlan &Plan) {
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : *VPBB)
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
- if (RepR->isPredicated())
+ // TODO: Support for multiple-def replications.
+ if (RepR->isPredicated() && RepR->getNumDefinedValues() == 1)
WorkList.push_back(RepR);
}
}
@@ -434,11 +440,12 @@ static void removeRedundantInductionCasts(VPlan &Plan) {
auto &Casts = IV->getInductionDescriptor().getCastInsts();
VPValue *FindMyCast = IV;
for (Instruction *IRCast : reverse(Casts)) {
- VPSingleDefRecipe *FoundUserCast = nullptr;
+ VPValue *FoundUserCast = nullptr;
for (auto *U : FindMyCast->users()) {
- auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
- if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
- FoundUserCast = UserCast;
+ auto *UserCast = dyn_cast<VPRecipeBase>(U);
+ if (UserCast && UserCast->getNumDefinedValues() == 1 &&
+ UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) {
+ FoundUserCast = UserCast->getVPSingleValue();
break;
}
}
@@ -1130,6 +1137,10 @@ void VPlanTransforms::truncateToMinimalBitwidths(
VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
continue;
+ // TODO: Figure out how to handle multiple defs.
+ if (R.getNumDefinedValues() > 1)
+ continue;
+
VPValue *ResultVPV = R.getVPSingleValue();
auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
unsigned NewResSizeInBits = MinBWs.lookup(UI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f653269713b30b..78550f7762b6be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -97,12 +97,13 @@ class UnrollState {
}
}
- /// Given a uniform recipe \p R, add it for all parts.
- void addUniformForAllParts(VPSingleDefRecipe *R) {
- auto Ins = VPV2Parts.insert({R, {}});
- assert(Ins.second && "uniform value already added");
- for (unsigned Part = 0; Part != UF; ++Part)
- Ins.first->second.push_back(R);
+ /// Given a uniform def \p Def, add it for all parts.
+ void addUniformForAllParts(VPDef *Def) {
+ for (VPValue *VPV : Def->definedValues()) {
+ auto [_, Inserted] =
+ VPV2Parts.insert({VPV, SmallVector<VPValue *>(UF, VPV)});
+ assert(Inserted && "uniform value already added");
+ }
}
bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
@@ -265,13 +266,13 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
}
}
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
- if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
+ if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
// Stores to an invariant address only need to store the last part.
remapOperands(&R, UF - 1);
return;
}
- if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
+ if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingInstr())) {
if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
addUniformForAllParts(RepR);
return;
@@ -373,9 +374,9 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
- auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
- if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
- addUniformForAllParts(SingleDef);
+ if (R.getNumDefinedValues() >= 1 &&
+ all_of(R.definedValues(), vputils::isUniformAcrossVFsAndUFs)) {
+ addUniformForAllParts(&R);
continue;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 4621c28b051298..83621d20ff7f10 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -102,7 +102,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
// all their operands are invariant.
// TODO: Further relax the restrictions.
return R->isUniform() &&
- (isa<LoadInst, StoreInst>(R->getUnderlyingValue())) &&
+ (isa<LoadInst, StoreInst>(R->getUnderlyingInstr())) &&
all_of(R->operands(),
[](VPValue *Op) { return isUniformAcrossVFsAndUFs(Op); });
})
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
new file mode 100644
index 00000000000000..5a2d710375064a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests basic vectorization of scalable homogeneous struct literal returns.
+
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen
+; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK: vector.body:
+; CHECK: [[WIDE_CALL:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; CHECK: [[WIDE_A:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 0
+; CHECK: [[WIDE_B:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 1
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_A]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_B]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @foo(float %in_val) #0
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f64_widen
+; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK: vector.body:
+; CHECK: [[WIDE_CALL:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; CHECK: [[WIDE_A:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 0
+; CHECK: [[WIDE_B:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 1
+; CHECK: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_A]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_B]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+ %in_val = load double, ptr %arrayidx, align 8
+ %call = tail call { double, double } @bar(double %in_val) #1
+ %extract_a = extractvalue { double, double } %call, 0
+ %extract_b = extractvalue { double, double } %call, 1
+ %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+ store double %extract_a, ptr %arrayidx2, align 8
+ %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+ store double %extract_b, ptr %arrayidx4, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
+; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
+; CHECK: entry:
+; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck
+; CHECK: vector.memcheck:
+; CHECK: vector.body:
+; CHECK: call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; CHECK: for.body:
+; CHECK: call { float, float } @foo(float [[LOAD:%.*]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @foo(float %in_val) #0
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+declare { float, float } @foo(float)
+declare { double, double } @bar(double)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double>, <vscale x 2 x i1>)
+
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_bar(scalable_vec_masked_bar)" }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
new file mode 100644
index 00000000000000..94c9e0c11463e9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -0,0 +1,244 @@
+; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Tests basic vectorization of homogeneous struct literal returns.
+
+; CHECK-REMARKS-COUNT-4: remark: {{.*}} vectorized loop
+; CHECK-REMARKS-COUNT-2: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: call instruction cannot be vectorized
+
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen
+; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK: vector.body:
+; CHECK: [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0
+; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1
+; CHECK: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4
+; CHECK: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @foo(float %in_val) #0
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f64_widen
+; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK: vector.body:
+; CHECK: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]])
+; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0
+; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1
+; CHECK: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8
+; CHECK: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+ %in_val = load double, ptr %arrayidx, align 8
+ %call = tail call { double, double } @bar(double %in_val) #1
+ %extract_a = extractvalue { double, double } %call, 0
+ %extract_b = extractvalue { double, double } %call, 1
+ %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+ store double %extract_a, ptr %arrayidx2, align 8
+ %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+ store double %extract_b, ptr %arrayidx4, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_replicate
+; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; CHECK: vector.body:
+; CHECK-DAG: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}})
+; CHECK-DAG: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
+; CHECK-DAG: [[LANE_0_A:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
+; CHECK-DAG: [[TMP_A:%.*]] = insertelement <2 x float> poison, float [[LANE_0_A]], i64 0
+; CHECK-DAG: [[LANE_0_B:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
+; CHECK-DAG: [[TMP_B:%.*]] = insertelement <2 x float> poison, float [[LANE_0_B]], i64 0
+; CHECK-DAG: [[LANE_1_A:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
+; CHECK-DAG: [[WIDE_A:%.*]] = insertelement <2 x float> [[TMP_A]], float [[LANE_1_A]], i64 1
+; CHECK-DAG: [[LANE_1_B:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
+; CHECK-DAG: [[WIDE_B:%.*]] = insertelement <2 x float> [[TMP_B]], float [[LANE_1_B]], i64 1
+; CHECK-DAG: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4
+; CHECK-DAG: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ ; #3 does not have a fixed-size vector mapping (so replication is used)
+ %call = tail call { float, float } @foo(float %in_val) #3
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
+; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
+; CHECK: entry:
+; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck
+; CHECK: vector.memcheck:
+; CHECK: vector.body:
+; CHECK: call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]])
+; CHECK: for.body:
+; CHECK: call { float, float } @foo(float [[LOAD:%.*]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @foo(float %in_val) #0
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; Negative test. Widening structs with mixed element types is not supported.
+define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_mixed_element_type_struct_return
+; CHECK-NOT: vector.body:
+; CHECK-NOT: call {{.*}} @fixed_vec_baz
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, i32 } @baz(float %in_val) #2
+ %extract_a = extractvalue { float, i32 } %call, 0
+ %extract_b = extractvalue { float, i32 } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %out_b, i64 %iv
+ store i32 %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+%named_struct = type { double, double }
+
+; Negative test. Widening non-literal structs is not supported.
+define void @test_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @test_named_struct_return
+; CHECK-NOT: vector.body:
+; CHECK-NOT: call {{.*}} @fixed_vec_bar
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+ %in_val = load double, ptr %arrayidx, align 8
+ %call = tail call %named_struct @bar_named(double %in_val) #4
+ %extract_a = extractvalue %named_struct %call, 0
+ %extract_b = extractvalue %named_struct %call, 1
+ %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+ store double %extract_a, ptr %arrayidx2, align 8
+ %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+ store double %extract_b, ptr %arrayidx4, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; TODO: Allow vectorization of structs with mixed element types and mark overflow intrinsics as trivially vectorizable.
+define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @test_overflow_intrinsic
+; CHECK-NOT: vector.body:
+; CHECK-NOT: @llvm.sadd.with.overflow.v{{.+}}i32
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load i32, ptr %arrayidx, align 4
+ %call = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %in_val, i32 %in_val)
+ %extract_ret = extractvalue { i32, i1 } %call, 0
+ %extract_overflow = extractvalue { i32, i1 } %call, 1
+ %zext_overflow = zext i1 %extract_overflow to i8
+ %arrayidx2 = getelementptr inbounds i32, ptr %out_a, i64 %iv
+ store i32 %extract_ret, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i8, ptr %out_b, i64 %iv
+ store i8 %zext_overflow, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+declare { float, float } @foo(float)
+declare { double, double } @bar(double)
+declare { float, i32 } @baz(float)
+declare %named_struct @bar_named(double)
+
+declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
+declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>)
+declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar)" }
+attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec_baz)" }
+attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" }
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
index 68dd47537fdfdf..5777740864d016 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
@@ -28,11 +28,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: "vector.body:\l" +
; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" +
; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>\l" +
-; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" +
+; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\> (S-\>S)\l" +
; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" +
; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" +
-; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
+; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\> (S-\>S)\l" +
; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" +
; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" +
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
new file mode 100644
index 00000000000000..6f85575cb725c9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll
@@ -0,0 +1,112 @@
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s
+
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_widen'
+; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<%3>
+; CHECK-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%in_val> = load vp<%4>
+; CHECK-NEXT: WIDEN-CALL ir<%call>, ir<%call>.1 = call @foo(ir<%in_val>) (using library function: fixed_vec_foo)
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<%3>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN store vp<%5>, ir<%call>
+; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<%3>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%arrayidx4>
+; CHECK-NEXT: WIDEN store vp<%6>, ir<%call>.1
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ %call = tail call { float, float } @foo(float %in_val) #0
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_replicate'
+; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<%3>
+; CHECK-NEXT: vp<%4> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%in_val> = load vp<%4>
+; CHECK-NEXT: REPLICATE ir<%call>, ir<%call>.1 = call @foo(ir<%in_val>) (S->S), (S->S)
+; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<%3>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN store vp<%5>, ir<%call>
+; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<%3>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%arrayidx4>
+; CHECK-NEXT: WIDEN store vp<%6>, ir<%call>.1
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+ %in_val = load float, ptr %arrayidx, align 4
+ ; #1 does not have a fixed-size vector mapping (so replication is used)
+ %call = tail call { float, float } @foo(float %in_val) #1
+ %extract_a = extractvalue { float, float } %call, 0
+ %extract_b = extractvalue { float, float } %call, 1
+ %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+ store float %extract_a, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+ store float %extract_b, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+
+declare { float, float } @foo(float)
+
+declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 3179cfc676ab67..95092a2354391e 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -953,7 +953,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
EXPECT_TRUE(isa<VPUser>(BaseR));
EXPECT_EQ(&Recipe, BaseR);
- VPValue *VPV = &Recipe;
+ VPValue *VPV = Recipe.getVPSingleValue();
EXPECT_TRUE(VPV->getDefiningRecipe());
EXPECT_EQ(&Recipe, VPV->getDefiningRecipe());