[llvm] Revert "[VPlan] Extract reverse mask from reverse accesses" (PR #189637)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 04:50:46 PDT 2026
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
Changes:
Reverts llvm/llvm-project#155579
The assertion added by that patch triggers on some buildbots:
clang: /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/llvm/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp:3840: virtual InstructionCost llvm::VPWidenMemoryRecipe::computeCost(ElementCount, VPCostContext &) const: Assertion `!IsReverse() && "Inconsecutive memory access should not have reverse order"' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0. Program arguments: /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/stage1.install/bin/clang -DNDEBUG -mcpu=neoverse-v2 -mllvm -scalable-vectorization=preferred -O3 -std=gnu17 -fcommon -Wno-error=incompatible-pointer-types -MD -MT MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/CMakeFiles/timberwolfmc.dir/finalpin.c.o -MF CMakeFiles/timberwolfmc.dir/finalpin.c.o.d -o CMakeFiles/timberwolfmc.dir/finalpin.c.o -c /home/tcwg-buildbot/worker/clang-aarch64-sve2-vla/test/test-suite/MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/finalpin.c
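For context, the pattern involved is a consecutive memory access whose addresses run in decreasing order (a "reverse" access) under a condition that becomes a vector mask. The C loop below is a hypothetical minimal sketch of such a pattern; it is not the actual reproducer from finalpin.c, only an illustration of the kind of input the reverted change affected.

```c
// Hypothetical sketch of a masked reverse access; NOT the finalpin.c
// reproducer. The vectorizer sees a consecutive access in decreasing address
// order ("reverse") plus a mask coming from the branch condition.
void scale_nonzero_backwards(int *a, const int *b, int n) {
  for (int i = n - 1; i >= 0; --i)   // reverse (descending) induction
    if (b[i] != 0)                   // predication -> masked access
      a[i] = 2 * b[i];               // consecutive store in reverse order
}
```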
---
Patch is 57.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189637.diff
14 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+21-18)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+27-12)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+42-50)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+30-34)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll (+2-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/dbg-tail-folding-by-evl.ll (+3-3)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/predicated-reverse-store.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll (+8-8)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll (+8-6)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/RISCV/vplan-riscv-vector-reverse.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll (+6-6)
- (modified) llvm/unittests/Transforms/Vectorize/VPlanTest.cpp (+8-6)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b268c81e550cf..ac94d0dbcc4cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7137,18 +7137,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
return nullptr;
};
+ // Check if a select for a safe divisor was hoisted to the pre-header. If so,
+ // the select doesn't need to be considered for the vector loop cost; go with
+ // the more accurate VPlan-based cost model.
for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
- // Reverse operations for reverse memory accesses may be hoisted to the
- // preheader by LICM if the reversed value is loop invariant. In this case,
- // the VPlan-based cost model diverges from the legacy cost model.
- if (match(&R,
- m_CombineOr(m_Reverse(m_VPValue()),
- m_Intrinsic<Intrinsic::experimental_vp_reverse>())))
- return true;
-
- // Check if a select for a safe divisor was hoisted to the pre-header. If
- // so, the select doesn't need to be considered for the vector loop cost; go
- // with the more accurate VPlan-based cost model.
auto *VPI = dyn_cast<VPInstruction>(&R);
if (!VPI || VPI->getOpcode() != Instruction::Select)
continue;
@@ -7201,6 +7193,20 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
return true;
+
+ if (WidenMemR->isReverse()) {
+ // If the stored value of a reverse store is invariant, LICM will
+ // hoist the reverse operation to the preheader. In this case, the
+ // result of the VPlan-based cost model will diverge from that of
+ // the legacy model.
+ if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
+ if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+ return true;
+
+ if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
+ if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
+ return true;
+ }
}
// The legacy cost model costs non-header phis with a scalar VF as a phi,
@@ -7748,13 +7754,10 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Ptr = VectorPtr;
}
- if (Reverse && Mask)
- Mask = Builder.createNaryOp(VPInstruction::Reverse, Mask, I->getDebugLoc());
-
if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
- auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, *VPI,
- Load->getDebugLoc());
+ auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
+ *VPI, Load->getDebugLoc());
if (Reverse) {
Builder.insert(LoadR);
return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
@@ -7768,8 +7771,8 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
if (Reverse)
StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
Store->getDebugLoc());
- return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive, *VPI,
- Store->getDebugLoc());
+ return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
+ Reverse, *VPI, Store->getDebugLoc());
}
VPWidenIntOrFpInductionRecipe *
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5bac173262468..ab47d927942db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3535,6 +3535,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Whether the accessed addresses are consecutive.
bool Consecutive;
+ /// Whether the consecutive accessed addresses are in reverse order.
+ bool Reverse;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -3548,10 +3551,15 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, const VPIRMetadata &Metadata,
- DebugLoc DL)
+ bool Consecutive, bool Reverse,
+ const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
- Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive) {}
+ Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
+ Reverse(Reverse) {
+ assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ assert((isa<VPVectorEndPointerRecipe>(getAddr()) || !Reverse) &&
+ "Reversed acccess without VPVectorEndPointerRecipe address?");
+ }
public:
VPWidenMemoryRecipe *clone() override {
@@ -3573,6 +3581,10 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Return whether the loaded-from / stored-to addresses are consecutive.
bool isConsecutive() const { return Consecutive; }
+ /// Return whether the consecutive loaded/stored addresses are in reverse
+ /// order.
+ bool isReverse() const { return Reverse; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -3606,16 +3618,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPRecipeValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, const VPIRMetadata &Metadata, DebugLoc DL)
+ bool Consecutive, bool Reverse,
+ const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenLoadSC, Load, {Addr},
- Consecutive, Metadata, DL),
+ Consecutive, Reverse, Metadata, DL),
VPRecipeValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, *this, getDebugLoc());
+ getMask(), Consecutive, Reverse, *this,
+ getDebugLoc());
}
VP_CLASSOF_IMPL(VPRecipeBase::VPWidenLoadSC);
@@ -3648,7 +3662,7 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe,
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L,
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
L.getDebugLoc()),
VPRecipeValue(this, &getIngredient()) {
setMask(Mask);
@@ -3687,17 +3701,18 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe,
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
- VPValue *Mask, bool Consecutive,
+ VPValue *Mask, bool Consecutive, bool Reverse,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenStoreSC, Store,
- {Addr, StoredVal}, Consecutive, Metadata, DL) {
+ {Addr, StoredVal}, Consecutive, Reverse, Metadata,
+ DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- *this, getDebugLoc());
+ Reverse, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPRecipeBase::VPWidenStoreSC);
@@ -3732,8 +3747,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
: VPWidenMemoryRecipe(VPRecipeBase::VPWidenStoreEVLSC, S.getIngredient(),
- {Addr, StoredVal, &EVL}, S.isConsecutive(), S,
- S.getDebugLoc()) {
+ {Addr, StoredVal, &EVL}, S.isConsecutive(),
+ S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a53ebd375ca2..7eefd77045050 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1029,6 +1029,8 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
return TTI::CastContextHint::Normal;
if (!WidenMemoryRecipe->isConsecutive())
return TTI::CastContextHint::GatherScatter;
+ if (WidenMemoryRecipe->isReverse())
+ return TTI::CastContextHint::Reversed;
if (WidenMemoryRecipe->isMasked())
return TTI::CastContextHint::Masked;
return TTI::CastContextHint::Normal;
@@ -1036,7 +1038,6 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
- bool IsReverse = false;
// For Trunc/FPTrunc, get the context from the only user.
if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
@@ -1045,10 +1046,8 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
return dyn_cast<VPRecipeBase>(*R->user_begin());
};
if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
- if (match(Recipe, m_Reverse(m_VPValue()))) {
+ if (match(Recipe, m_Reverse(m_VPValue())))
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
- IsReverse = true;
- }
if (Recipe)
CCH = ComputeCCH(Recipe);
}
@@ -1058,16 +1057,12 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
Opcode == Instruction::FPExt) {
if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
- if (match(Recipe, m_Reverse(m_VPValue(ReverseOp)))) {
+ if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
Recipe = ReverseOp->getDefiningRecipe();
- IsReverse = true;
- }
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}
- if (IsReverse && CCH != TTI::CastContextHint::None)
- CCH = TTI::CastContextHint::Reversed;
auto *ScalarSrcTy = Ctx.Types.inferScalarType(Operand);
Type *SrcTy = VF.isVector() ? toVectorTy(ScalarSrcTy, VF) : ScalarSrcTy;
@@ -1249,13 +1244,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
- Type *EltTy = Ctx.Types.inferScalarType(this);
- // Skip the reverse operation cost for the mask.
- // FIXME: Remove this once redundant mask reverse operations can be
- // eliminated by VPlanTransforms::cse before cost computation.
- if (EltTy->isIntegerTy(1))
- return 0;
- auto *VectorTy = cast<VectorType>(toVectorTy(EltTy, VF));
+ auto *VectorTy = cast<VectorType>(
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, /*Mask=*/{}, Ctx.CostKind,
/*Index=*/0);
@@ -1943,13 +1933,6 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
const VPRecipeWithIRFlags &R,
ElementCount VF,
VPCostContext &Ctx) {
- Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
- // Skip the reverse operation cost for the mask.
- // FIXME: Remove this once redundant mask reverse operations can be eliminated
- // by VPlanTransforms::cse before cost computation.
- if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(1))
- return InstructionCost(0);
-
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
@@ -1969,6 +1952,7 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
Arguments.push_back(V);
}
+ Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
for (const VPValue *Op : Operands) {
@@ -3817,27 +3801,9 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
- [[maybe_unused]] auto IsReverse = [this]() {
- // Check if mask is reversed.
- if (VPValue *Mask = getMask())
- if (match(Mask, m_Reverse(m_VPValue())))
- return true;
-
- // For loads, check if the single user is a reverse operation.
- if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)) {
- auto *U = getVPSingleValue()->getSingleUser();
- return U && match(cast<VPRecipeBase>(U), m_Reverse(m_VPValue()));
- }
+ assert(!Reverse &&
+ "Inconsecutive memory access should not have the order.");
- // For stores, check if the stored value is reversed.
- VPValue *StoredVal =
- isa<VPWidenStoreRecipe>(this)
- ? cast<VPWidenStoreRecipe>(this)->getStoredValue()
- : cast<VPWidenStoreEVLRecipe>(this)->getStoredValue();
- return match(StoredVal, m_Reverse(m_VPValue()));
- };
- assert(!IsReverse() &&
- "Inconsecutive memory access should not have reverse order");
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
Type *PtrTy = Ptr->getType();
@@ -3881,8 +3847,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
Value *Mask = nullptr;
- if (auto *VPMask = getMask())
+ if (auto *VPMask = getMask()) {
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse
+ // of a null all-one mask is a null mask.
Mask = State.get(VPMask);
+ if (isReverse())
+ Mask = Builder.CreateVectorReverse(Mask, "reverse");
+ }
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateGather);
Value *NewLI;
@@ -3910,6 +3881,17 @@ void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
}
#endif
+/// Use all-true mask for reverse rather than actual mask, as it avoids a
+/// dependence w/o affecting the result.
+static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
+ Value *EVL, const Twine &Name) {
+ VectorType *ValTy = cast<VectorType>(Operand->getType());
+ Value *AllTrueMask =
+ Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
+ return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
+ {Operand, AllTrueMask, EVL}, nullptr, Name);
+}
+
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
@@ -3920,10 +3902,13 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask())
+ if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
- else
+ if (isReverse())
+ Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+ } else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ }
if (CreateGather) {
NewLI =
@@ -3975,8 +3960,13 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
Value *Mask = nullptr;
- if (auto *VPMask = getMask())
+ if (auto *VPMask = getMask()) {
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse
+ // of a null all-one mask is a null mask.
Mask = State.get(VPMask);
+ if (isReverse())
+ Mask = Builder.CreateVectorReverse(Mask, "reverse");
+ }
Value *StoredVal = State.get(StoredVPValue);
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
@@ -4008,11 +3998,13 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Mask = nullptr;
- if (VPValue *VPMask = getMask())
+ if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
- else
+ if (isReverse())
+ Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
+ } else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-
+ }
Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 78cc39522e8bc..d6e014e71e1f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -80,11 +80,12 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, *VPI, Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, *VPI,
+ Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
- nullptr /*Mask*/, false /*Consecutive*/, *VPI,
+ nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
@@ -1784,6 +1785,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
!WidenStoreR->isConsecutive()) {
+ assert(!WidenStoreR->isReverse() &&
+ "Not consecutive memory recipes shouldn't be reversed");
VPValue *Mask = WidenStoreR->getMask();
// Only convert the scatter to a scalar store if it is unmasked.
@@ -3057,32 +3060,20 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return EVLEndPtr;
};
- auto GetVPReverse = [&CurRecipe, &EVL, &TypeInfo, Plan,
- DL](VPValue *V) -> VPWidenIntrinsicRecipe * {
- if (!V)
- return nullptr;
- auto *Reverse = new VPWidenIntrinsicRecipe(
- Intrinsic::experimental_vp_reverse, {V, Plan->getTrue(), &EVL},
- TypeInfo.inferScalarType(V), {}, {}, DL);
- Reverse->insertBefore(&CurRecipe);
- return Reverse;
- };
-
if (match(&CurRecipe,
- m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))))
+ m_Ma...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/189637
More information about the llvm-commits mailing list