[llvm] [LV] Teach LoopVectorizationLegality about struct vector calls (PR #119221)

Fri Dec 20 03:06:17 PST 2024

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/119221

>From d5b8cfcf23493ac4a545bfe9db0791dffd0eb4c2 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 9 Dec 2024 15:05:00 +0000
Subject: [PATCH 1/6] [LV] Teach LoopVectorizationLegality about struct vector
 calls

This is a split-off from #109833 and only adds code relating to checking
if a struct-returning call can be vectorized.

This initial patch only allows the case where all users of the struct
return are `extractvalue` operations that can be widened.

```
%call = tail call { float, float } @foo(float %in_val) #0
%extract_a = extractvalue { float, float } %call, 0
%extract_b = extractvalue { float, float } %call, 1
```

Note: The tests require the VFABI changes from #119000 to pass.
---
 llvm/include/llvm/IR/VectorTypeUtils.h        |  16 ++
 .../Vectorize/LoopVectorizationLegality.h     |  10 +
 llvm/lib/IR/VectorTypeUtils.cpp               |   9 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  29 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |   9 +
 .../AArch64/scalable-struct-return.ll         |  97 +++++++
 .../Transforms/LoopVectorize/struct-return.ll | 268 ++++++++++++++++++
 7 files changed, 436 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/struct-return.ll

diff --git a/llvm/include/llvm/IR/VectorTypeUtils.h b/llvm/include/llvm/IR/VectorTypeUtils.h
index f30bf9ee9240b0..d4236b193bc5b1 100644
--- a/llvm/include/llvm/IR/VectorTypeUtils.h
+++ b/llvm/include/llvm/IR/VectorTypeUtils.h
@@ -40,6 +40,10 @@ Type *toScalarizedStructTy(StructType *StructTy);
 /// are vectors of matching element count. This does not include empty structs.
 bool isVectorizedStructTy(StructType *StructTy);
 
+/// Returns true if `StructTy` is an unpacked literal struct where all elements
+/// are scalars that can be used as vector element types.
+bool canVectorizeStructTy(StructType *StructTy);
+
 /// A helper for converting to vectorized types. For scalar types, this is
 /// equivalent to calling `ToVectorTy`. For struct types, this returns a new
 /// struct where each element type has been widened to a vector type.
@@ -71,6 +75,18 @@ inline bool isVectorizedTy(Type *Ty) {
   return Ty->isVectorTy();
 }
 
+/// Returns true if `Ty` is a valid vector element type, void, or an unpacked
+/// literal struct where all elements are valid vector element types.
+/// Note: Even if a type can be vectorized that does not mean it is valid to do
+/// so in all cases. For example, a vectorized struct (as returned by
+/// toVectorizedTy) does not perform (de)interleaving, so it can't be used for
+/// vectorizing loads/stores.
+inline bool canVectorizeTy(Type *Ty) {
+  if (StructType *StructTy = dyn_cast<StructType>(Ty))
+    return canVectorizeStructTy(StructTy);
+  return Ty->isVoidTy() || VectorType::isValidElementType(Ty);
+}
+
 /// Returns the types contained in `Ty`. For struct types, it returns the
 /// elements, all other types are returned directly.
 inline ArrayRef<Type *> getContainedTypes(Type *const &Ty) {
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index fbe80eddbae07a..cd64b13a034b79 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -422,6 +422,10 @@ class LoopVectorizationLegality {
   /// has a vectorized variant available.
   bool hasVectorCallVariants() const { return VecCallVariantsFound; }
 
+  /// Returns true if there is at least one function call in the loop which
+  /// returns a struct type and needs to be vectorized.
+  bool hasStructVectorCall() const { return StructVecVecCallFound; }
+
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -644,6 +648,12 @@ class LoopVectorizationLegality {
   /// the use of those function variants.
   bool VecCallVariantsFound = false;
 
+  /// If we find a call (to be vectorized) that returns a struct type, record
+  /// that so we can bail out until this is supported.
+  /// TODO: Remove this flag once vectorizing calls with struct returns is
+  /// supported.
+  bool StructVecVecCallFound = false;
+
   /// Indicates whether this loop has an uncountable early exit, i.e. an
   /// uncountable exiting block that is not the latch.
   bool HasUncountableEarlyExit = false;
diff --git a/llvm/lib/IR/VectorTypeUtils.cpp b/llvm/lib/IR/VectorTypeUtils.cpp
index e6e265414a2b8e..2f05e3b8619fb4 100644
--- a/llvm/lib/IR/VectorTypeUtils.cpp
+++ b/llvm/lib/IR/VectorTypeUtils.cpp
@@ -52,3 +52,12 @@ bool llvm::isVectorizedStructTy(StructType *StructTy) {
     return Ty->isVectorTy() && cast<VectorType>(Ty)->getElementCount() == VF;
   });
 }
+
+/// Returns true if `StructTy` is a non-empty, unpacked literal struct whose
+/// element types are all valid vector element types.
+bool llvm::canVectorizeStructTy(StructType *StructTy) {
+  ArrayRef<Type *> Elements = StructTy->elements();
+  const bool IsCandidate =
+      !Elements.empty() && isUnpackedStructLiteral(StructTy);
+  return IsCandidate && all_of(Elements, VectorType::isValidElementType);
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 555c8435dd330d..c68e95a42be579 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -779,6 +779,18 @@ static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
   return Scalarize;
 }
 
+/// Returns true if the call return type `Ty` can be widened by the loop
+/// vectorizer. Accepts any type; non-struct types defer to canVectorizeTy.
+static bool canWidenCallReturnType(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  // TODO: Remove the homogeneous types restriction. This is just an initial
+  // simplification. When we want to support things like the overflow intrinsics
+  // we will have to lift this restriction.
+  if (StructTy && !StructTy->containsHomogeneousTypes())
+    return false;
+  return canVectorizeTy(Ty);
+}
+
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *Header = TheLoop->getHeader();
 
@@ -943,11 +955,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI && !VFDatabase::getMappings(*CI).empty())
         VecCallVariantsFound = true;
 
+      auto canWidenInstruction = [this](Instruction const &Inst) {
+        Type *InstTy = Inst.getType();
+        if (isa<CallInst>(Inst) && isa<StructType>(InstTy) &&
+            canWidenCallReturnType(InstTy)) {
+          StructVecVecCallFound = true;
+          // For now, we can only widen struct values returned from calls where
+          // all users are extractvalue instructions.
+          return llvm::all_of(Inst.uses(), [](auto &Use) {
+            return isa<ExtractValueInst>(Use.getUser());
+          });
+        }
+        return VectorType::isValidElementType(InstTy) || InstTy->isVoidTy();
+      };
+
       // Check that the instruction return type is vectorizable.
       // We can't vectorize casts from vector type to scalar type.
       // Also, we can't vectorize extractelement instructions.
-      if ((!VectorType::isValidElementType(I.getType()) &&
-           !I.getType()->isVoidTy()) ||
+      if (!canWidenInstruction(I) ||
           (isa<CastInst>(I) &&
            !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
           isa<ExtractElementInst>(I)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a6acc710a34c89..c3e607eb5cf90f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10258,6 +10258,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     }
   }
 
+  if (LVL.hasStructVectorCall()) {
+    constexpr StringLiteral FailureMessage(
+        "Auto-vectorization of calls that return struct types is not yet "
+        "supported");
+    reportVectorizationFailure(FailureMessage, FailureMessage,
+                               "StructCallVectorizationUnsupported", ORE, L);
+    return false;
+  }
+
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
new file mode 100644
index 00000000000000..0454272d3f3dd6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
@@ -0,0 +1,97 @@
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -pass-remarks-analysis=loop-vectorize -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests basic vectorization of scalable homogeneous struct literal returns.
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %in_val) #0
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f64_widen
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+  %in_val = load double, ptr %arrayidx, align 8
+  %call = tail call { double, double } @bar(double %in_val) #1
+  %extract_a = extractvalue { double, double } %call, 0
+  %extract_b = extractvalue { double, double } %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+  store double %extract_a, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+  store double %extract_b, ptr %arrayidx4, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %in_val) #0
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+declare { float, float } @foo(float)
+declare { double, double } @bar(double)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double>, <vscale x 2 x i1>)
+
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_bar(scalable_vec_masked_bar)" }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
new file mode 100644
index 00000000000000..1ac0c1670b9dc3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -0,0 +1,268 @@
+; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -pass-remarks-analysis=loop-vectorize  -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Tests basic vectorization of homogeneous struct literal returns.
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %in_val) #0
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f64_widen
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+  %in_val = load double, ptr %arrayidx, align 8
+  %call = tail call { double, double } @bar(double %in_val) #1
+  %extract_a = extractvalue { double, double } %call, 0
+  %extract_b = extractvalue { double, double } %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+  store double %extract_a, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+  store double %extract_b, ptr %arrayidx4, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_replicate
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  ; #3 does not have a fixed-size vector mapping (so replication is used)
+  %call = tail call { float, float } @foo(float %in_val) #3
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
+; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %in_val) #0
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; Negative test. Widening structs with mixed element types is not supported.
+; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_mixed_element_type_struct_return
+; CHECK-NOT:   vector.body:
+; CHECK-NOT:   call {{.*}} @fixed_vec_baz
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, i32 } @baz(float %in_val) #2
+  %extract_a = extractvalue { float, i32 } %call, 0
+  %extract_b = extractvalue { float, i32 } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %out_b, i64 %iv
+  store i32 %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+%named_struct = type { double, double }
+
+; Negative test. Widening non-literal structs is not supported.
+; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @test_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @test_named_struct_return
+; CHECK-NOT:   vector.body:
+; CHECK-NOT:   call {{.*}} @fixed_vec_bar
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv
+  %in_val = load double, ptr %arrayidx, align 8
+  %call = tail call %named_struct @bar_named(double %in_val) #4
+  %extract_a = extractvalue %named_struct %call, 0
+  %extract_b = extractvalue %named_struct %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv
+  store double %extract_a, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv
+  store double %extract_b, ptr %arrayidx4, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; TODO: Allow mixed-struct type vectorization and mark overflow intrinsics as trivially vectorizable.
+; CHECK-REMARKS:         remark: {{.*}} loop not vectorized: call instruction cannot be vectorized
+define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @test_overflow_intrinsic
+; CHECK-NOT:   vector.body:
+; CHECK-NOT:   @llvm.sadd.with.overflow.v{{.+}}i32
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load i32, ptr %arrayidx, align 4
+  %call = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %in_val, i32 %in_val)
+  %extract_ret = extractvalue { i32, i1 } %call, 0
+  %extract_overflow = extractvalue { i32, i1 } %call, 1
+  %zext_overflow = zext i1 %extract_overflow to i8
+  %arrayidx2 = getelementptr inbounds i32, ptr %out_a, i64 %iv
+  store i32 %extract_ret, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i8, ptr %out_b, i64 %iv
+  store i8 %zext_overflow, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; Negative test. Widening struct loads is not supported.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_struct_load(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_struct_load
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds { float, float }, ptr %in, i64 %iv
+  %call = load { float, float }, ptr %arrayidx, align 8
+  %extract_a = extractvalue { float, float } %call, 0
+  %extract_b = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; Negative test. Widening struct stores is not supported.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_struct_return_store_struct(ptr noalias %in, ptr noalias writeonly %out) {
+; CHECK-LABEL: define void @negative_struct_return_store_struct
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds { float, float }, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %in_val) #0
+  %out_ptr = getelementptr inbounds { float, float }, ptr %out, i64 %iv
+  store { float, float } %call, ptr %out_ptr, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+declare { float, float } @foo(float)
+declare { double, double } @bar(double)
+declare { float, i32 } @baz(float)
+declare %named_struct @bar_named(double)
+
+declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
+declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>)
+declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar)" }
+attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec_baz)" }
+attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" }

>From 129bdbd845ca4a6c64f11a64fbf5614878f437a7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 18 Dec 2024 14:22:45 +0000
Subject: [PATCH 2/6] Rename test

---
 llvm/test/Transforms/LoopVectorize/struct-return.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 1ac0c1670b9dc3..0a06484c85e32c 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -146,8 +146,8 @@ exit:
 
 ; Negative test. Widening non-literal structs is not supported.
 ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
-define void @test_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
-; CHECK-LABEL: define void @test_named_struct_return
+define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_named_struct_return
 ; CHECK-NOT:   vector.body:
 ; CHECK-NOT:   call {{.*}} @fixed_vec_bar
 entry:

>From 419e516d07f1962a0770572ae03eb85c181636b5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 18 Dec 2024 15:23:28 +0000
Subject: [PATCH 3/6] Fixups

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 +-
 .../Transforms/LoopVectorize/struct-return.ll | 86 ++++++++++++++++---
 2 files changed, 74 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c3e607eb5cf90f..b18f1d1472a0b0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10259,10 +10259,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   }
 
   if (LVL.hasStructVectorCall()) {
-    constexpr StringLiteral FailureMessage(
-        "Auto-vectorization of calls that return struct types is not yet "
-        "supported");
-    reportVectorizationFailure(FailureMessage, FailureMessage,
+    reportVectorizationFailure("Auto-vectorization of calls that return struct "
+                               "types is not yet supported",
                                "StructCallVectorizationUnsupported", ORE, L);
     return false;
   }
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 0a06484c85e32c..78e05d55d1c9da 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -114,6 +114,35 @@ exit:
   ret void
 }
 
+; TODO: Allow mixed-struct type vectorization and mark overflow intrinsics as trivially vectorizable.
+; CHECK-REMARKS:         remark: {{.*}} loop not vectorized: call instruction cannot be vectorized
+define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @test_overflow_intrinsic
+; CHECK-NOT:   vector.body:
+; CHECK-NOT:   @llvm.sadd.with.overflow.v{{.+}}i32
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load i32, ptr %arrayidx, align 4
+  %call = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %in_val, i32 %in_val)
+  %extract_ret = extractvalue { i32, i1 } %call, 0
+  %extract_overflow = extractvalue { i32, i1 } %call, 1
+  %zext_overflow = zext i1 %extract_overflow to i8
+  %arrayidx2 = getelementptr inbounds i32, ptr %out_a, i64 %iv
+  store i32 %extract_ret, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i8, ptr %out_b, i64 %iv
+  store i8 %zext_overflow, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 ; Negative test. Widening structs with mixed element types is not supported.
 ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
 define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
@@ -172,27 +201,54 @@ exit:
   ret void
 }
 
-; TODO: Allow mixed-struct type vectorization and mark overflow intrinsics as trivially vectorizable.
-; CHECK-REMARKS:         remark: {{.*}} loop not vectorized: call instruction cannot be vectorized
-define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
-; CHECK-LABEL: define void @test_overflow_intrinsic
+; Negative test. Nested homogeneous structs are not supported.
+; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_nested_struct(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_nested_struct
 ; CHECK-NOT:   vector.body:
-; CHECK-NOT:   @llvm.sadd.with.overflow.v{{.+}}i32
 entry:
   br label %for.body
 
 for.body:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
   %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
-  %in_val = load i32, ptr %arrayidx, align 4
-  %call = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %in_val, i32 %in_val)
-  %extract_ret = extractvalue { i32, i1 } %call, 0
-  %extract_overflow = extractvalue { i32, i1 } %call, 1
-  %zext_overflow = zext i1 %extract_overflow to i8
-  %arrayidx2 = getelementptr inbounds i32, ptr %out_a, i64 %iv
-  store i32 %extract_ret, ptr %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds i8, ptr %out_b, i64 %iv
-  store i8 %zext_overflow, ptr %arrayidx4, align 4
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { { float, float } } @foo_nested_struct(float %in_val) #0
+  %extract_inner = extractvalue { { float, float } } %call, 0
+  %extract_a = extractvalue { float, float } %extract_inner, 0
+  %extract_b = extractvalue { float, float } %extract_inner, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; Negative test. Homogeneous structs of arrays are not supported.
+; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_struct_array_elements(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_struct_array_elements
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { [2 x float] } @foo_arrays(float %in_val) #0
+  %extract_inner = extractvalue { [2 x float] } %call, 0
+  %extract_a = extractvalue [2 x float] %extract_inner, 0
+  %extract_b = extractvalue [2 x float] %extract_inner, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store float %extract_b, ptr %arrayidx4, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %exit, label %for.body
@@ -254,6 +310,8 @@ declare { float, float } @foo(float)
 declare { double, double } @bar(double)
 declare { float, i32 } @baz(float)
 declare %named_struct @bar_named(double)
+declare { { float, float } } @foo_nested_struct(float)
+declare { [2 x float] } @foo_arrays(float)
 
 declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
 declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>)

>From 684d42048d251dfcb6525592594d9279ec5a8b25 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 18 Dec 2024 16:25:00 +0000
Subject: [PATCH 4/6] Fix typo

---
 .../llvm/Transforms/Vectorize/LoopVectorizationLegality.h     | 4 ++--
 llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index cd64b13a034b79..72fda911962ad2 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -424,7 +424,7 @@ class LoopVectorizationLegality {
 
   /// Returns true if there is at least one function call in the loop which
   /// returns a struct type and needs to be vectorized.
-  bool hasStructVectorCall() const { return StructVecVecCallFound; }
+  bool hasStructVectorCall() const { return StructVecCallFound; }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
@@ -652,7 +652,7 @@ class LoopVectorizationLegality {
   /// that so we can bail out until this is supported.
   /// TODO: Remove this flag once vectorizing calls with struct returns is
   /// supported.
-  bool StructVecVecCallFound = false;
+  bool StructVecCallFound = false;
 
   /// Indicates whether this loop has an uncountable early exit, i.e. an
   /// uncountable exiting block that is not the latch.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index c68e95a42be579..572d66d2b0c3e2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -959,7 +959,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         Type *InstTy = Inst.getType();
         if (isa<CallInst>(Inst) && isa<StructType>(InstTy) &&
             canWidenCallReturnType(InstTy)) {
-          StructVecVecCallFound = true;
+          StructVecCallFound = true;
           // For now, we can only widen struct values returned from calls where
           // all users are extractvalue instructions.
           return llvm::all_of(Inst.uses(), [](auto &Use) {

>From c9b6df8d253871f512ffa3df037a6434e7d5c621 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 19 Dec 2024 12:01:54 +0000
Subject: [PATCH 5/6] Fixups

---
 .../Vectorize/LoopVectorizationLegality.cpp   |  8 ++-
 .../Transforms/LoopVectorize/struct-return.ll | 52 +++++++++++++++++++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 572d66d2b0c3e2..fc6443f9f07255 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -955,16 +955,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI && !VFDatabase::getMappings(*CI).empty())
         VecCallVariantsFound = true;
 
-      auto canWidenInstruction = [this](Instruction const &Inst) {
+      auto CanWidenInstruction = [this](Instruction const &Inst) {
         Type *InstTy = Inst.getType();
         if (isa<CallInst>(Inst) && isa<StructType>(InstTy) &&
             canWidenCallReturnType(InstTy)) {
           StructVecCallFound = true;
           // For now, we can only widen struct values returned from calls where
           // all users are extractvalue instructions.
-          return llvm::all_of(Inst.uses(), [](auto &Use) {
-            return isa<ExtractValueInst>(Use.getUser());
-          });
+          return llvm::all_of(Inst.users(), IsaPred<ExtractValueInst>);
         }
         return VectorType::isValidElementType(InstTy) || InstTy->isVoidTy();
       };
@@ -972,7 +970,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Check that the instruction return type is vectorizable.
       // We can't vectorize casts from vector type to scalar type.
       // Also, we can't vectorize extractelement instructions.
-      if (!canWidenInstruction(I) ||
+      if (!CanWidenInstruction(I) ||
           (isa<CastInst>(I) &&
            !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
           isa<ExtractElementInst>(I)) {
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 78e05d55d1c9da..8b6cbc8c49d508 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -143,6 +143,30 @@ exit:
   ret void
 }
 
+; TODO: Support vectorization in this case.
+; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
+define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias writeonly %out_a) {
+; CHECK-LABEL: define void @struct_return_i32_three_results_widen
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 %iv
+  %in_val = load i32, ptr %arrayidx, align 4
+  %call = tail call { i32, i32, i32 } @qux(i32 %in_val) #5
+  %extract_a = extractvalue { i32, i32, i32 } %call, 0
+  %arrayidx2 = getelementptr inbounds i32, ptr %out_a, i64 %iv
+  store i32 %extract_a, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 ; Negative test. Widening structs with mixed element types is not supported.
 ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
 define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
@@ -229,6 +253,30 @@ exit:
   ret void
 }
 
+; Negative test. The second element of the struct cannot be widened.
+; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_non_widenable_element(ptr noalias %in, ptr noalias writeonly %out_a) {
+; CHECK-LABEL: define void @negative_non_widenable_element
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load float, ptr %arrayidx, align 4
+  %call = tail call { float, [1 x float] } @foo_one_non_widenable_element(float %in_val) #0
+  %extract_a = extractvalue { float, [1 x float] } %call, 0
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store float %extract_a, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 ; Negative test. Homogeneous structs of arrays are not supported.
 ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
 define void @negative_struct_array_elements(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
@@ -312,10 +360,13 @@ declare { float, i32 } @baz(float)
 declare %named_struct @bar_named(double)
 declare { { float, float } } @foo_nested_struct(float)
 declare { [2 x float] } @foo_arrays(float)
+declare { float, [1 x float] } @foo_one_non_widenable_element(float)
+declare { i32, i32, i32 } @qux(i32)
 
 declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)
 declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>)
 declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32>)
 
 declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
 
@@ -324,3 +375,4 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec
 attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec_baz)" }
 attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
 attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" }
+attributes #5 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_qux(fixed_vec_qux)" }

>From f1b0fccd45ec64d62b03c220f6ae960c19070327 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 20 Dec 2024 10:24:52 +0000
Subject: [PATCH 6/6] Fixups

---
 llvm/include/llvm/IR/VectorTypeUtils.h        | 12 ++++----
 .../Vectorize/LoopVectorizationLegality.cpp   |  8 ++++--
 .../Transforms/LoopVectorize/struct-return.ll | 28 +++++++++++++++++++
 3 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/IR/VectorTypeUtils.h b/llvm/include/llvm/IR/VectorTypeUtils.h
index d4236b193bc5b1..eff2b161afbe90 100644
--- a/llvm/include/llvm/IR/VectorTypeUtils.h
+++ b/llvm/include/llvm/IR/VectorTypeUtils.h
@@ -75,12 +75,12 @@ inline bool isVectorizedTy(Type *Ty) {
   return Ty->isVectorTy();
 }
 
-// Returns true if `Ty` is a valid vector element type, void, or an unpacked
-// literal struct where all elements are valid vector element types.
-// Note: Even if a type can be vectorized that does not mean it is valid to do
-// so in all cases. For example, a vectorized struct (as returned by
-// toVectorizedTy) does not perform (de)interleaving, so it can't be used for
-// vectorizing loads/stores.
+/// Returns true if `Ty` is a valid vector element type, void, or an unpacked
+/// literal struct where all elements are valid vector element types.
+/// Note: Even if a type can be vectorized that does not mean it is valid to do
+/// so in all cases. For example, a vectorized struct (as returned by
+/// toVectorizedTy) does not perform (de)interleaving, so it can't be used for
+/// vectorizing loads/stores.
 inline bool canVectorizeTy(Type *Ty) {
   if (StructType *StructTy = dyn_cast<StructType>(Ty))
     return canVectorizeStructTy(StructTy);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index fc6443f9f07255..a36799a95012a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -959,10 +959,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         Type *InstTy = Inst.getType();
         if (isa<CallInst>(Inst) && isa<StructType>(InstTy) &&
             canWidenCallReturnType(InstTy)) {
+          // TODO: Remove the `StructVecCallFound` flag once vectorizing calls
+          // with struct returns is supported.
           StructVecCallFound = true;
-          // For now, we can only widen struct values returned from calls where
-          // all users are extractvalue instructions.
-          return llvm::all_of(Inst.users(), IsaPred<ExtractValueInst>);
+          // For now, struct values returned from calls are only recognized as
+          // vectorizable when all of their users are extractvalue instructions.
+          return all_of(Inst.users(), IsaPred<ExtractValueInst>);
         }
         return VectorType::isValidElementType(InstTy) || InstTy->isVoidTy();
       };
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll
index 8b6cbc8c49d508..d239c6b3e741b3 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll
@@ -167,6 +167,33 @@ exit:
   ret void
 }
 
+; Negative test. Widening structs of vectors is not supported.
+; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
+define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; CHECK-LABEL: define void @negative_struct_of_vectors
+; CHECK-NOT:   vector.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv
+  %in_val = load <1 x float>, ptr %arrayidx, align 4
+  %call = tail call { <1 x float>, <1 x float> } @foo_vectors(<1 x float> %in_val) #0
+  %extract_a = extractvalue { <1 x float>, <1 x float> } %call, 0
+  %extract_b = extractvalue { <1 x float>, <1 x float> } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv
+  store <1 x float> %extract_a, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv
+  store <1 x float> %extract_b, ptr %arrayidx4, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 ; Negative test. Widening structs with mixed element types is not supported.
 ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized
 define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
@@ -361,6 +388,7 @@ declare %named_struct @bar_named(double)
 declare { { float, float } } @foo_nested_struct(float)
 declare { [2 x float] } @foo_arrays(float)
 declare { float, [1 x float] } @foo_one_non_widenable_element(float)
+declare { <1 x float>, <1 x float> } @foo_vectors(<1 x float>)
 declare { i32, i32, i32 } @qux(i32)
 
 declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>)