[llvm] [VPlan] Add the cost of spills when considering register pressure (PR #179646)
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 4 05:44:32 PST 2026
https://github.com/john-brawn-arm updated https://github.com/llvm/llvm-project/pull/179646
From fd106f3b0c45e3872c8797364ec23f62bf706d84 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Thu, 8 Jan 2026 16:53:30 +0000
Subject: [PATCH 1/3] [VPlan] Add the cost of spills when considering register
pressure
Currently, when considering register pressure is enabled, we reject any VF
whose register pressure exceeds the number of registers available on the
target. However, this can result in failing to vectorize in cases where
vectorization is beneficial, because the cost of the extra spills is less than
the benefit we get from vectorizing.

Deal with this by instead calculating the cost of the spills and adding it to
the rest of the cost, so that we can detect this kind of situation and still
vectorize, while avoiding vectorization in cases where the extra cost makes it
not worth it.
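In rough terms, the per-register-class heuristic added here looks like the
following (a minimal sketch of the VPRegisterUsage::spillCost logic in the
patch below; MaxLocalUsers and LargestType are fields of VPRegisterUsage, and
the TTI/DL/CostKind setup is elided):

  // Any usage beyond the available registers is assumed to cause one spill
  // (store) and one reload (load) of the largest type seen in that class.
  InstructionCost Cost;
  for (const auto &[ClassID, NumLive] : MaxLocalUsers) {
    unsigned Available = TTI.getNumberOfRegisters(ClassID);
    if (NumLive <= Available)
      continue;
    unsigned Spills = NumLive - Available;
    Type *Ty = LargestType.at(ClassID);
    Align Alignment = DL.getPrefTypeAlign(Ty);
    InstructionCost SpillCost =
        TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, 0, CostKind) +
        TTI.getMemoryOpCost(Instruction::Store, Ty, Alignment, 0, CostKind);
    Cost += SpillCost * Spills;
  }

For example, if a register class has 8 registers available, 10 values in that
class are live at once, and a load plus store of the largest type costs 4,
then the VF is charged an extra 2 * 4 = 8 on top of its normal cost.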
---
.../Vectorize/LoopVectorizationPlanner.h | 3 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 44 ++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 70 ++++-
llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 12 +-
.../AArch64/maxbandwidth-regpressure.ll | 94 ++++++-
.../ARM/mve-reg-pressure-spills.ll | 266 ++++++++++++++++++
.../LoopVectorize/LoongArch/reg-usage.ll | 4 +-
7 files changed, 442 insertions(+), 51 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 44d4d92d4a7e2..06e8efef20c03 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -45,6 +45,7 @@ class OptimizationRemarkEmitter;
class TargetTransformInfo;
class TargetLibraryInfo;
class VPRecipeBuilder;
+class VPRegisterUsage;
struct VFRange;
extern cl::opt<bool> EnableVPlanNativePath;
@@ -497,7 +498,7 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
- InstructionCost cost(VPlan &Plan, ElementCount VF) const;
+ InstructionCost cost(VPlan &Plan, ElementCount VF, VPRegisterUsage *RU) const;
/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index abac45b265d10..492e716fd6ad2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4247,13 +4247,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (VF.isScalar())
continue;
- /// If the register pressure needs to be considered for VF,
- /// don't consider the VF as valid if it exceeds the number
- /// of registers for the target.
- if (CM.shouldConsiderRegPressureForVF(VF) &&
- RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
- continue;
-
InstructionCost C = CM.expectedCost(VF);
// Add on other costs that are modelled in VPlan, but not in the legacy
@@ -4302,6 +4295,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}
+ // Add the cost of any spills due to excess register usage.
+ if (CM.shouldConsiderRegPressureForVF(VF))
+ C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs);
+
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
@@ -4687,13 +4684,16 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (hasFindLastReductionPhi(Plan))
return 1;
+ VPRegisterUsage R =
+ calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
if (VF.isScalar())
LoopCost = CM.expectedCost(VF);
else
- LoopCost = cost(Plan, VF);
+ LoopCost = cost(Plan, VF, &R);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
@@ -4701,8 +4701,6 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
return 1;
}
- VPRegisterUsage R =
- calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
for (auto &Pair : R.MaxLocalUsers) {
@@ -7027,13 +7025,18 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}
-InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
- ElementCount VF) const {
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
+ VPRegisterUsage *RU) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
+
+ // Add the cost of spills due to excess register usage.
+ if (CM.shouldConsiderRegPressureForVF(VF))
+ Cost += RU->spillCost(CostCtx, ForceTargetNumVectorRegs);
+
#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
@@ -7233,9 +7236,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
P->vectorFactors().end());
SmallVector<VPRegisterUsage, 8> RUs;
- if (any_of(VFs, [this](ElementCount VF) {
- return CM.shouldConsiderRegPressureForVF(VF);
- }))
+ bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
+ return CM.shouldConsiderRegPressureForVF(VF);
+ });
+ if (ConsiderRegPressure)
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
for (unsigned I = 0; I < VFs.size(); I++) {
@@ -7258,16 +7262,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}
- InstructionCost Cost = cost(*P, VF);
+ InstructionCost Cost =
+ cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
- if (CM.shouldConsiderRegPressureForVF(VF) &&
- RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
- LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
- << VF << " because it uses too many registers\n");
- continue;
- }
-
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8fbe7d93e6f45..b8be1be79831e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
@@ -389,13 +390,33 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
return Base::properlyDominates(ParentA, ParentB);
}
-bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
- unsigned OverrideMaxNumRegs) const {
- return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
- return LU.second > (OverrideMaxNumRegs > 0
- ? OverrideMaxNumRegs
- : TTI.getNumberOfRegisters(LU.first));
- });
+InstructionCost VPRegisterUsage::spillCost(VPCostContext &Ctx,
+ unsigned OverrideMaxNumRegs) const {
+ InstructionCost Cost;
+ DataLayout DL = Ctx.PSE.getSE()->getDataLayout();
+ for (const auto &Pair : MaxLocalUsers) {
+ unsigned AvailableRegs = OverrideMaxNumRegs > 0
+ ? OverrideMaxNumRegs
+ : Ctx.TTI.getNumberOfRegisters(Pair.first);
+ if (Pair.second > AvailableRegs) {
+ // Assume that for each register used past what's available we get one
+ // spill and reload of the largest type seen for that register class.
+ unsigned Spills = Pair.second - AvailableRegs;
+ Type *SpillType = LargestType.at(Pair.first);
+ Align Alignment = DL.getPrefTypeAlign(SpillType);
+ InstructionCost SpillCost =
+ Ctx.TTI.getMemoryOpCost(Instruction::Load, SpillType, Alignment, 0,
+ Ctx.CostKind) +
+ Ctx.TTI.getMemoryOpCost(Instruction::Store, SpillType, Alignment, 0,
+ Ctx.CostKind);
+ InstructionCost TotalCost = SpillCost * Spills;
+ LLVM_DEBUG(dbgs() << "LV(REG): Cost of " << TotalCost << " from "
+ << Spills << " spills of "
+ << Ctx.TTI.getRegisterClassName(Pair.first) << "\n");
+ Cost += TotalCost;
+ }
+ }
+ return Cost;
}
SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
@@ -479,6 +500,15 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
SmallPtrSet<VPValue *, 8> OpenIntervals;
SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
+ SmallVector<SmallMapVector<unsigned, Type *, 4>, 8> LargestTypes(VFs.size());
+ auto MaxType = [](Type *CurMax, Type *T) {
+ if (!CurMax)
+ return T;
+ if (TypeSize::isKnownGT(T->getPrimitiveSizeInBits(),
+ CurMax->getPrimitiveSizeInBits()))
+ return T;
+ return CurMax;
+ };
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
@@ -540,17 +570,19 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
match(VPV, m_ExtractLastPart(m_VPValue())))
continue;
+ Type *ScalarTy = TypeInfo.inferScalarType(VPV);
if (VFs[J].isScalar() ||
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(VPV) ||
(isa<VPInstruction>(VPV) && vputils::onlyScalarValuesUsed(VPV)) ||
(isa<VPReductionPHIRecipe>(VPV) &&
(cast<VPReductionPHIRecipe>(VPV))->isInLoop())) {
- unsigned ClassID =
- TTI.getRegisterClassForType(false, TypeInfo.inferScalarType(VPV));
+ unsigned ClassID = TTI.getRegisterClassForType(false, ScalarTy);
// FIXME: The target might use more than one register for the type
// even in the scalar case.
RegUsage[ClassID] += 1;
+ LargestTypes[J][ClassID] =
+ MaxType(LargestTypes[J][ClassID], ScalarTy);
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
@@ -562,10 +594,12 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
<< " to " << VF << " for " << *R << "\n";);
}
-
- Type *ScalarTy = TypeInfo.inferScalarType(VPV);
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
+ if (VectorType::isValidElementType(ScalarTy)) {
+ Type *T = VectorType::get(ScalarTy, VF);
+ LargestTypes[J][ClassID] = MaxType(LargestTypes[J][ClassID], T);
+ }
}
}
@@ -602,9 +636,11 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
bool IsScalar = vputils::onlyScalarValuesUsed(In);
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
- unsigned ClassID = TTI.getRegisterClassForType(
- VF.isVector(), TypeInfo.inferScalarType(In));
- Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
+ Type *ScalarTy = TypeInfo.inferScalarType(In);
+ unsigned ClassID = TTI.getRegisterClassForType(VF.isVector(), ScalarTy);
+ Invariant[ClassID] += GetRegUsage(ScalarTy, VF);
+ Type *SpillTy = IsScalar ? ScalarTy : VectorType::get(ScalarTy, VF);
+ LargestTypes[Idx][ClassID] = MaxType(LargestTypes[Idx][ClassID], SpillTy);
}
LLVM_DEBUG({
@@ -623,10 +659,16 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
<< " registers\n";
}
+ for (const auto &pair : LargestTypes[Idx]) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << *pair.second
+ << " is largest type potentially spilled\n";
+ }
});
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[Idx];
+ RU.LargestType = LargestTypes[Idx];
RUs[Idx] = RU;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index dc4be4270f7f1..3affa211dd140 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -19,6 +19,7 @@ namespace llvm {
class LLVMContext;
class VPValue;
class VPBlendRecipe;
+class VPCostContext;
class VPInstruction;
class VPWidenRecipe;
class VPWidenCallRecipe;
@@ -30,6 +31,7 @@ class VPlan;
class Value;
class TargetTransformInfo;
class Type;
+class InstructionCost;
/// An analysis for type-inference for VPValues.
/// It infers the scalar type for a given VPValue by bottom-up traversing
@@ -78,12 +80,14 @@ struct VPRegisterUsage {
/// Holds the maximum number of concurrent live intervals in the loop.
/// The key is ClassID of target-provided register class.
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
+ /// Holds the largest type used in each register class.
+ SmallMapVector<unsigned, Type *, 4> LargestType;
- /// Check if any of the tracked live intervals exceeds the number of
- /// available registers for the target. If non-zero, OverrideMaxNumRegs
+ /// Calculate the estimated cost of any spills due to using more registers
+ /// than the number available for the target. If non-zero, OverrideMaxNumRegs
/// is used in place of the target's number of registers.
- bool exceedsMaxNumRegs(const TargetTransformInfo &TTI,
- unsigned OverrideMaxNumRegs = 0) const;
+ InstructionCost spillCost(VPCostContext &Ctx,
+ unsigned OverrideMaxNumRegs = 0) const;
};
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
index 8109d0683fe71..2a4d16979e0d8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
@@ -1,16 +1,31 @@
; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
+; The use of the dotp instruction means we never have an i32 vector, so we don't
+; get any spills normally, and with a reduced number of registers the number of
+; spills is small enough that it doesn't prevent use of a larger VF.
define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'dotp'
+;
+; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
+;
+; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-REGS-VP: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
+; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.3)
; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
;
-; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
-; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
-; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
+; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 14 (Estimated cost per lane: 1.8)
+; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.8)
+; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
entry:
br label %for.body
@@ -24,8 +39,7 @@ for.body: ; preds = %for.body, %entry
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%mul = mul i32 %ext.b, %ext.a
- %sub = sub i32 0, %mul
- %add = add i32 %accum, %sub
+ %add = add i32 %accum, %mul
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.exit, label %for.body
@@ -34,4 +48,70 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+; The largest type used in the loop is small enough that we already consider all
+; VFs, so maximize-bandwidth does nothing.
+define void @type_too_small(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
+; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
+; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.4)
+; CHECK: LV: Selecting VF: vscale x 16.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.a = getelementptr i8, ptr %a, i64 %iv
+ %load.a = load i8, ptr %gep.a, align 1
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %add = add i8 %load.a, %load.b
+ store i8 %add, ptr %gep.a, align 1
+ %iv.next = add i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; With a reduced number of registers, the spills from high pressure are costly
+; enough that we use the same VF as if we hadn't maximized the bandwidth.
+define void @high_pressure(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
+;
+; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
+;
+; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.2)
+; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.3)
+; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
+
+; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOREGS-VP: LV(REG): Cost of 12 from 3 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 26 (Estimated cost per lane: 3.2)
+; CHECK-NOREGS-VP: LV(REG): Cost of 56 from 7 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 81 (Estimated cost per lane: 5.1)
+; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.a = getelementptr i32, ptr %a, i64 %iv
+ %load.a = load i32, ptr %gep.a, align 4
+ %gep.b = getelementptr i8, ptr %b, i64 %iv
+ %load.b = load i8, ptr %gep.b, align 1
+ %ext.b = zext i8 %load.b to i32
+ %add = add i32 %load.a, %ext.b
+ store i32 %add, ptr %gep.a, align 4
+ %iv.next = add i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll
new file mode 100644
index 0000000000000..6e9fec23195bc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll
@@ -0,0 +1,266 @@
+; RUN: opt -mcpu=cortex-m55 -passes=loop-vectorize -disable-output -debug-only=loop-vectorize,vplan -vectorizer-consider-reg-pressure=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOPRESSURE
+; RUN: opt -mcpu=cortex-m55 -passes=loop-vectorize -disable-output -debug-only=loop-vectorize,vplan -vectorizer-consider-reg-pressure=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PRESSURE
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+; In this function the spills make it unprofitable to vectorize when considering
+; register pressure.
+define void @spills_not_profitable(ptr %in1, ptr %in2, ptr %out, i32 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'spills_not_profitable'
+; CHECK: LV: Scalar loop costs: 86
+; CHECK-NOPRESSURE: Cost for VF 2: 394 (Estimated cost per lane: 197.0)
+; CHECK-NOPRESSURE: Cost for VF 4: 338 (Estimated cost per lane: 84.5)
+; CHECK-NOPRESSURE: LV: Selecting VF: 4
+; CHECK-PRESSURE: LV(REG): Cost of 300 from 25 spills of Generic::VectorRC
+; CHECK-PRESSURE-NEXT: Cost for VF 2: 694 (Estimated cost per lane: 347.0)
+; CHECK-PRESSURE: LV(REG): Cost of 100 from 25 spills of Generic::VectorRC
+; CHECK-PRESSURE-NEXT: Cost for VF 4: 438 (Estimated cost per lane: 109.5)
+; CHECK-PRESSURE: LV: Selecting VF: 1
+entry:
+ %cmp = icmp eq i32 %n, 0
+ br i1 %cmp, label %exit, label %for.body
+
+for.body:
+ %i = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %x4 = phi float [ %x4.next, %for.body ], [ 0.000000e+00, %entry ]
+ %x3 = phi float [ %x3.next, %for.body ], [ 0.000000e+00, %entry ]
+ %x2 = phi float [ %x2.next, %for.body ], [ 0.000000e+00, %entry ]
+ %x1 = phi float [ %x1.next, %for.body ], [ 0.000000e+00, %entry ]
+ %x0 = phi float [ %x0.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc7 = phi float [ %acc7.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc6 = phi float [ %acc6.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc5 = phi float [ %acc5.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc4 = phi float [ %acc4.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc3 = phi float [ %acc3.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc2 = phi float [ %acc2.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc1 = phi float [ %acc1.next, %for.body ], [ 0.000000e+00, %entry ]
+ %acc0 = phi float [ %acc0.next, %for.body ], [ 0.000000e+00, %entry ]
+ %in1.addr = phi ptr [ %in1.addr.next, %for.body ], [ %in1, %entry ]
+ %in2.addr = phi ptr [ %in2.addr.next, %for.body ], [ %in2, %entry ]
+ %incdec.ptr = getelementptr inbounds nuw i8, ptr %in1.addr, i32 4
+ %0 = load float, ptr %in1.addr, align 4
+ %incdec.ptr1 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 4
+ %1 = load float, ptr %in2.addr, align 4
+ %mul = fmul fast float %0, %x0
+ %add = fadd fast float %mul, %acc0
+ %mul2 = fmul fast float %0, %x1
+ %add3 = fadd fast float %mul2, %acc1
+ %mul4 = fmul fast float %0, %x2
+ %add5 = fadd fast float %mul4, %acc2
+ %mul6 = fmul fast float %0, %x3
+ %add7 = fadd fast float %mul6, %acc3
+ %mul8 = fmul fast float %0, %x4
+ %add9 = fadd fast float %mul8, %acc4
+ %mul10 = fmul fast float %1, %0
+ %add11 = fadd fast float %mul10, %acc7
+ %incdec.ptr12 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 8
+ %2 = load float, ptr %incdec.ptr, align 4
+ %incdec.ptr13 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 8
+ %x0.next = load float, ptr %incdec.ptr1, align 4
+ %mul14 = fmul fast float %2, %x1
+ %add15 = fadd fast float %add, %mul14
+ %mul16 = fmul fast float %2, %x2
+ %add17 = fadd fast float %add3, %mul16
+ %mul18 = fmul fast float %2, %x3
+ %add19 = fadd fast float %add5, %mul18
+ %mul20 = fmul fast float %2, %x4
+ %add21 = fadd fast float %add7, %mul20
+ %mul22 = fmul fast float %2, %1
+ %add23 = fadd fast float %mul22, %acc6
+ %mul24 = fmul fast float %x0.next, %2
+ %add25 = fadd fast float %add11, %mul24
+ %incdec.ptr26 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 12
+ %4 = load float, ptr %incdec.ptr12, align 4
+ %incdec.ptr27 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 12
+ %x1.next = load float, ptr %incdec.ptr13, align 4
+ %mul28 = fmul fast float %4, %x2
+ %add29 = fadd fast float %add15, %mul28
+ %mul30 = fmul fast float %4, %x3
+ %add31 = fadd fast float %add17, %mul30
+ %mul32 = fmul fast float %4, %x4
+ %add33 = fadd fast float %add19, %mul32
+ %mul34 = fmul fast float %4, %1
+ %add35 = fadd fast float %mul34, %acc5
+ %mul36 = fmul fast float %4, %x0.next
+ %add37 = fadd fast float %add23, %mul36
+ %mul38 = fmul fast float %x1.next, %4
+ %add39 = fadd fast float %add25, %mul38
+ %incdec.ptr40 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 16
+ %6 = load float, ptr %incdec.ptr26, align 4
+ %incdec.ptr41 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 16
+ %x2.next = load float, ptr %incdec.ptr27, align 4
+ %mul42 = fmul fast float %6, %x3
+ %add43 = fadd fast float %add29, %mul42
+ %mul44 = fmul fast float %6, %x4
+ %acc1.next = fadd fast float %add31, %mul44
+ %mul46 = fmul fast float %6, %1
+ %add47 = fadd fast float %add9, %mul46
+ %mul48 = fmul fast float %6, %x0.next
+ %add49 = fadd fast float %add35, %mul48
+ %mul50 = fmul fast float %6, %x1.next
+ %add51 = fadd fast float %add37, %mul50
+ %mul52 = fmul fast float %x2.next, %6
+ %add53 = fadd fast float %add39, %mul52
+ %incdec.ptr54 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 20
+ %8 = load float, ptr %incdec.ptr40, align 4
+ %incdec.ptr55 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 20
+ %x3.next = load float, ptr %incdec.ptr41, align 4
+ %mul56 = fmul fast float %8, %x4
+ %acc0.next = fadd fast float %add43, %mul56
+ %mul58 = fmul fast float %8, %1
+ %add59 = fadd fast float %add21, %mul58
+ %mul60 = fmul fast float %8, %x0.next
+ %add61 = fadd fast float %add47, %mul60
+ %mul62 = fmul fast float %8, %x1.next
+ %add63 = fadd fast float %add49, %mul62
+ %mul64 = fmul fast float %8, %x2.next
+ %add65 = fadd fast float %add51, %mul64
+ %mul66 = fmul fast float %x3.next, %8
+ %add67 = fadd fast float %add53, %mul66
+ %in1.addr.next = getelementptr inbounds nuw i8, ptr %in1.addr, i32 24
+ %10 = load float, ptr %incdec.ptr54, align 4
+ %in2.addr.next = getelementptr inbounds nuw i8, ptr %in2.addr, i32 24
+ %x4.next = load float, ptr %incdec.ptr55, align 4
+ %mul70 = fmul fast float %10, %1
+ %acc2.next = fadd fast float %add33, %mul70
+ %mul72 = fmul fast float %10, %x0.next
+ %acc3.next = fadd fast float %add59, %mul72
+ %mul74 = fmul fast float %10, %x1.next
+ %acc4.next = fadd fast float %add61, %mul74
+ %mul76 = fmul fast float %10, %x2.next
+ %acc5.next = fadd fast float %add63, %mul76
+ %mul78 = fmul fast float %10, %x3.next
+ %acc6.next = fadd fast float %add65, %mul78
+ %mul80 = fmul fast float %x4.next, %10
+ %acc7.next = fadd fast float %add67, %mul80
+ %inc = add nuw i32 %i, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ %acc0.exit = phi float [ 0.000000e+00, %entry ], [ %acc0.next, %for.body ]
+ %acc1.exit = phi float [ 0.000000e+00, %entry ], [ %acc1.next, %for.body ]
+ %acc2.exit = phi float [ 0.000000e+00, %entry ], [ %acc2.next, %for.body ]
+ %acc3.exit = phi float [ 0.000000e+00, %entry ], [ %acc3.next, %for.body ]
+ %acc4.exit = phi float [ 0.000000e+00, %entry ], [ %acc4.next, %for.body ]
+ %acc5.exit = phi float [ 0.000000e+00, %entry ], [ %acc5.next, %for.body ]
+ %acc6.exit = phi float [ 0.000000e+00, %entry ], [ %acc6.next, %for.body ]
+ %acc7.exit = phi float [ 0.000000e+00, %entry ], [ %acc7.next, %for.body ]
+ store float %acc0.exit, ptr %out, align 4
+ %arrayidx82 = getelementptr inbounds nuw i8, ptr %out, i32 4
+ store float %acc1.exit, ptr %arrayidx82, align 4
+ %arrayidx83 = getelementptr inbounds nuw i8, ptr %out, i32 8
+ store float %acc2.exit, ptr %arrayidx83, align 4
+ %arrayidx84 = getelementptr inbounds nuw i8, ptr %out, i32 12
+ store float %acc3.exit, ptr %arrayidx84, align 4
+ %arrayidx85 = getelementptr inbounds nuw i8, ptr %out, i32 16
+ store float %acc4.exit, ptr %arrayidx85, align 4
+ %arrayidx86 = getelementptr inbounds nuw i8, ptr %out, i32 20
+ store float %acc5.exit, ptr %arrayidx86, align 4
+ %arrayidx87 = getelementptr inbounds nuw i8, ptr %out, i32 24
+ store float %acc6.exit, ptr %arrayidx87, align 4
+ %arrayidx88 = getelementptr inbounds nuw i8, ptr %out, i32 28
+ store float %acc7.exit, ptr %arrayidx88, align 4
+ ret void
+}
+
+; In this function we have spills, but it is still profitable to vectorize when
+; considering register pressure.
+define void @spills_profitable(ptr %in1, ptr %in2, ptr %out, i32 %n, i32 %m) {
+; CHECK-LABEL: LV: Checking a loop in 'spills_profitable'
+; CHECK: LV: Scalar loop costs: 54
+; CHECK-NOPRESSURE: Cost for VF 2: 1530 (Estimated cost per lane: 765.0)
+; CHECK-NOPRESSURE: Cost for VF 4: 38 (Estimated cost per lane: 9.5)
+; CHECK-PRESSURE: LV(REG): Cost of 8 from 2 spills of Generic::ScalarRC
+; CHECK-PRESSURE-NEXT: Cost for VF 2: 1538 (Estimated cost per lane: 769.0)
+; CHECK-PRESSURE: LV(REG): Cost of 24 from 3 spills of Generic::VectorRC
+; CHECK-PRESSURE-NEXT: Cost for VF 4: 62 (Estimated cost per lane: 15.5)
+; CHECK: LV: Selecting VF: 4
+entry:
+ %cmp = icmp eq i32 %n, 0
+ br i1 %cmp, label %exit, label %for.body.preheader
+
+for.body.preheader:
+ %add.ptr3.idx = mul i32 %m, 12
+ %add.ptr3 = getelementptr inbounds nuw i8, ptr %in1, i32 %add.ptr3.idx
+ %add.ptr1.idx = shl i32 %m, 3
+ %add.ptr1 = getelementptr inbounds nuw i8, ptr %in1, i32 %add.ptr1.idx
+ %add.ptr = getelementptr inbounds nuw i32, ptr %in1, i32 %m
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %acc3 = phi i64 [ %acc3.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc2 = phi i64 [ %acc2.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc1 = phi i64 [ %acc1.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc0 = phi i64 [ %acc0.next, %for.body ], [ 0, %for.body.preheader ]
+ %in2.addr = phi ptr [ %in2.addr.next, %for.body ], [ %in2, %for.body.preheader ]
+ %px3 = phi ptr [ %px3.next, %for.body ], [ %add.ptr3, %for.body.preheader ]
+ %px2 = phi ptr [ %px2.next, %for.body ], [ %add.ptr1, %for.body.preheader ]
+ %px1 = phi ptr [ %px1.next, %for.body ], [ %add.ptr, %for.body.preheader ]
+ %px0 = phi ptr [ %px0.next, %for.body ], [ %in1, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds nuw i8, ptr %in2.addr, i32 4
+ %0 = load i32, ptr %in2.addr, align 4
+ %incdec.ptr4 = getelementptr inbounds nuw i8, ptr %px0, i32 4
+ %1 = load i32, ptr %px0, align 4
+ %incdec.ptr5 = getelementptr inbounds nuw i8, ptr %px1, i32 4
+ %2 = load i32, ptr %px1, align 4
+ %incdec.ptr6 = getelementptr inbounds nuw i8, ptr %px2, i32 4
+ %3 = load i32, ptr %px2, align 4
+ %incdec.ptr7 = getelementptr inbounds nuw i8, ptr %px3, i32 4
+ %4 = load i32, ptr %px3, align 4
+ %conv = sext i32 %1 to i64
+ %conv8 = sext i32 %0 to i64
+ %mul9 = mul nsw i64 %conv, %conv8
+ %add = add nsw i64 %mul9, %acc0
+ %conv10 = sext i32 %2 to i64
+ %mul12 = mul nsw i64 %conv10, %conv8
+ %add13 = add nsw i64 %mul12, %acc1
+ %conv14 = sext i32 %3 to i64
+ %mul16 = mul nsw i64 %conv14, %conv8
+ %add17 = add nsw i64 %mul16, %acc2
+ %conv18 = sext i32 %4 to i64
+ %mul20 = mul nsw i64 %conv18, %conv8
+ %add21 = add nsw i64 %mul20, %acc3
+ %in2.addr.next = getelementptr inbounds nuw i8, ptr %in2.addr, i32 8
+ %5 = load i32, ptr %incdec.ptr, align 4
+ %px0.next = getelementptr inbounds nuw i8, ptr %px0, i32 8
+ %6 = load i32, ptr %incdec.ptr4, align 4
+ %px1.next = getelementptr inbounds nuw i8, ptr %px1, i32 8
+ %7 = load i32, ptr %incdec.ptr5, align 4
+ %px2.next = getelementptr inbounds nuw i8, ptr %px2, i32 8
+ %8 = load i32, ptr %incdec.ptr6, align 4
+ %px3.next = getelementptr inbounds nuw i8, ptr %px3, i32 8
+ %9 = load i32, ptr %incdec.ptr7, align 4
+ %conv27 = sext i32 %6 to i64
+ %conv28 = sext i32 %5 to i64
+ %mul29 = mul nsw i64 %conv27, %conv28
+ %acc0.next = add nsw i64 %add, %mul29
+ %conv31 = sext i32 %7 to i64
+ %mul33 = mul nsw i64 %conv31, %conv28
+ %acc1.next = add nsw i64 %add13, %mul33
+ %conv35 = sext i32 %8 to i64
+ %mul37 = mul nsw i64 %conv35, %conv28
+ %acc2.next = add nsw i64 %add17, %mul37
+ %conv39 = sext i32 %9 to i64
+ %mul41 = mul nsw i64 %conv39, %conv28
+ %acc3.next = add nsw i64 %add21, %mul41
+ %inc = add nuw nsw i32 %i, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ %acc0.exit = phi i64 [ 0, %entry ], [ %acc0.next, %for.body ]
+ %acc1.exit = phi i64 [ 0, %entry ], [ %acc1.next, %for.body ]
+ %acc2.exit = phi i64 [ 0, %entry ], [ %acc2.next, %for.body ]
+ %acc3.exit = phi i64 [ 0, %entry ], [ %acc3.next, %for.body ]
+ store i64 %acc0.exit, ptr %out, align 8
+ %arrayidx43 = getelementptr inbounds nuw i8, ptr %out, i32 8
+ store i64 %acc1.exit, ptr %arrayidx43, align 8
+ %arrayidx44 = getelementptr inbounds nuw i8, ptr %out, i32 16
+ store i64 %acc2.exit, ptr %arrayidx44, align 8
+ %arrayidx45 = getelementptr inbounds nuw i8, ptr %out, i32 24
+ store i64 %acc3.exit, ptr %arrayidx45, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
index de49337c185ac..3654e82423317 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
@@ -13,14 +13,14 @@ define void @bar(ptr %A, i32 signext %n) {
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::FPRRC, 1 registers
; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
-; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
+; CHECK-SCALAR: LV: The target has 30 registers of LoongArch::GPRRC register class
; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class
; CHECK-VECTOR: LV(REG): Found max usage: 2 item
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 2 registers
; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
-; CHECK-VECTOR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
+; CHECK-VECTOR: LV: The target has 30 registers of LoongArch::GPRRC register class
; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class
entry:
From 05c8f73dc00648a65d7b686877f14172f57c8124 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Wed, 4 Feb 2026 13:07:32 +0000
Subject: [PATCH 2/3] Change VPCostContext declaration to struct
---
llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 3affa211dd140..c4b8018b5f545 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -19,7 +19,6 @@ namespace llvm {
class LLVMContext;
class VPValue;
class VPBlendRecipe;
-class VPCostContext;
class VPInstruction;
class VPWidenRecipe;
class VPWidenCallRecipe;
@@ -33,6 +32,8 @@ class TargetTransformInfo;
class Type;
class InstructionCost;
+struct VPCostContext;
+
/// An analysis for type-inference for VPValues.
/// It infers the scalar type for a given VPValue by bottom-up traversing
/// through defining recipes until root nodes with known types are reached (e.g.
From eccba25d095860a8d4cf41c6575d484d2e3aa958 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Wed, 4 Feb 2026 13:43:57 +0000
Subject: [PATCH 3/3] Fix the other VPRegisterUsage declaration
---
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 06e8efef20c03..56e94782abd07 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -45,7 +45,7 @@ class OptimizationRemarkEmitter;
class TargetTransformInfo;
class TargetLibraryInfo;
class VPRecipeBuilder;
-class VPRegisterUsage;
+struct VPRegisterUsage;
struct VFRange;
extern cl::opt<bool> EnableVPlanNativePath;