[llvm] [VPlan] Sink predicated stores with complementary masks. (PR #168771)
Julian Nagele via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 08:10:16 PST 2025
================
@@ -4127,119 +4143,217 @@ void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
}
}
-// Returns the intersection of metadata from a group of loads.
-static VPIRMetadata getCommonLoadMetadata(ArrayRef<VPReplicateRecipe *> Loads) {
- VPIRMetadata CommonMetadata = *Loads.front();
- for (VPReplicateRecipe *Load : drop_begin(Loads))
- CommonMetadata.intersect(*Load);
+// Collect common metadata from a group of replicate recipes by intersecting
+// metadata from all recipes in the group.
+static VPIRMetadata getCommonMetadata(ArrayRef<VPReplicateRecipe *> Recipes) {
+ VPIRMetadata CommonMetadata = *Recipes.front();
+ for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
+ CommonMetadata.intersect(*Recipe);
return CommonMetadata;
}
-void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
- const Loop *L) {
+template <unsigned Opcode>
+static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
+collectComplementaryPredicatedMemOps(VPlan &Plan, ScalarEvolution &SE,
+ const Loop *L) {
+ static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
+ "Only Load and Store opcodes supported");
+ constexpr bool IsLoad = (Opcode == Instruction::Load);
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPTypeAnalysis TypeInfo(Plan);
- VPDominatorTree VPDT(Plan);
- // Group predicated loads by their address SCEV.
- DenseMap<const SCEV *, SmallVector<VPReplicateRecipe *>> LoadsByAddress;
+ // Group predicated operations by their address SCEV.
+ DenseMap<const SCEV *, SmallVector<VPReplicateRecipe *>> RecipesByAddress;
for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
auto *VPBB = cast<VPBasicBlock>(Block);
for (VPRecipeBase &R : *VPBB) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
- if (!RepR || RepR->getOpcode() != Instruction::Load ||
- !RepR->isPredicated())
+ if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
continue;
- VPValue *Addr = RepR->getOperand(0);
+ // For loads, the address is operand 0; for stores, it is operand 1.
+ VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
if (!isa<SCEVCouldNotCompute>(AddrSCEV))
- LoadsByAddress[AddrSCEV].push_back(RepR);
+ RecipesByAddress[AddrSCEV].push_back(RepR);
}
}
- // For each address, collect loads with complementary masks, sort by
- // dominance, and use the earliest load.
- for (auto &[Addr, Loads] : LoadsByAddress) {
- if (Loads.size() < 2)
+ // For each address, collect operations with the same or complementary masks.
+ SmallVector<SmallVector<VPReplicateRecipe *, 4>> AllGroups;
+ auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
+ return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
+ };
+ for (auto &[Addr, Recipes] : RecipesByAddress) {
+ if (Recipes.size() < 2)
continue;
- // Collect groups of loads with complementary masks.
- SmallVector<SmallVector<VPReplicateRecipe *, 4>> LoadGroups;
- for (VPReplicateRecipe *&LoadI : Loads) {
- if (!LoadI)
+ // Collect groups with the same or complementary masks.
+ for (VPReplicateRecipe *&RecipeI : Recipes) {
+ if (!RecipeI)
continue;
- VPValue *MaskI = LoadI->getMask();
- Type *TypeI = TypeInfo.inferScalarType(LoadI);
+ VPValue *MaskI = RecipeI->getMask();
+ Type *TypeI = GetLoadStoreValueType(RecipeI);
SmallVector<VPReplicateRecipe *, 4> Group;
- Group.push_back(LoadI);
- LoadI = nullptr;
+ Group.push_back(RecipeI);
+ RecipeI = nullptr;
- // Find all loads with the same type.
- for (VPReplicateRecipe *&LoadJ : Loads) {
- if (!LoadJ)
+ // Find all other operations of the same type, recording whether any
+ // pair of masks is complementary.
+ bool HasComplementaryMask = false;
+ for (VPReplicateRecipe *&RecipeJ : Recipes) {
+ if (!RecipeJ)
continue;
- Type *TypeJ = TypeInfo.inferScalarType(LoadJ);
+ VPValue *MaskJ = RecipeJ->getMask();
+ Type *TypeJ = GetLoadStoreValueType(RecipeJ);
if (TypeI == TypeJ) {
- Group.push_back(LoadJ);
- LoadJ = nullptr;
+ // Check if any operation in the group has a complementary mask with
+ // another, that is M1 == NOT(M2) or M2 == NOT(M1).
+ HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
+ match(MaskJ, m_Not(m_Specific(MaskI)));
+ Group.push_back(RecipeJ);
+ RecipeJ = nullptr;
}
}
- // Check if any load in the group has a complementary mask with another,
- // that is M1 == NOT(M2) or M2 == NOT(M1).
- bool HasComplementaryMask =
- any_of(drop_begin(Group), [MaskI](VPReplicateRecipe *Load) {
- VPValue *MaskJ = Load->getMask();
- return match(MaskI, m_Not(m_Specific(MaskJ))) ||
- match(MaskJ, m_Not(m_Specific(MaskI)));
- });
+ if (HasComplementaryMask) {
+ assert(Group.size() >= 2 && "must have at least 2 entries");
+ AllGroups.push_back(std::move(Group));
+ }
+ }
+ }
+
+ return AllGroups;
+}
+
+// Find the recipe with minimum alignment in the group.
+template <typename InstType>
+static VPReplicateRecipe *
+findRecipeWithMinAlign(ArrayRef<VPReplicateRecipe *> Group) {
+ return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
+ return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
+ cast<InstType>(B->getUnderlyingInstr())->getAlign();
+ });
+}
+
+void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
+ const Loop *L) {
+ auto Groups =
+ collectComplementaryPredicatedMemOps<Instruction::Load>(Plan, SE, L);
+ if (Groups.empty())
+ return;
+
+ VPDominatorTree VPDT(Plan);
- if (HasComplementaryMask)
- LoadGroups.push_back(std::move(Group));
+ // Process each group of loads.
+ for (auto &Group : Groups) {
+ // Sort loads by dominance order, with earliest (most dominating) first.
+ sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
+ return VPDT.properlyDominates(A, B);
+ });
+
+ // Try to use the earliest (most dominating) load to replace all others.
+ VPReplicateRecipe *EarliestLoad = Group[0];
+ VPBasicBlock *FirstBB = EarliestLoad->getParent();
+ VPBasicBlock *LastBB = Group.back()->getParent();
+
+ // Check that the load doesn't alias with stores between first and last.
+ auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
+ if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB,
+ /*CheckReads=*/false))
+ continue;
+
+ // Collect common metadata from all loads in the group.
+ VPIRMetadata CommonMetadata = getCommonMetadata(Group);
+
+ // Find the load with minimum alignment to use.
+ auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
+
+ // Create an unpredicated version of the earliest load with common
+ // metadata.
+ auto *UnpredicatedLoad = new VPReplicateRecipe(
+ LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
+ /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
+ CommonMetadata);
+
+ UnpredicatedLoad->insertBefore(EarliestLoad);
+
+ // Replace all loads in the group with the unpredicated load.
+ for (VPReplicateRecipe *Load : Group) {
+ Load->replaceAllUsesWith(UnpredicatedLoad);
+ Load->eraseFromParent();
}
+ }
+}
- // For each group, check memory dependencies and hoist the earliest load.
- for (auto &Group : LoadGroups) {
- // Sort loads by dominance order, with earliest (most dominating) first.
- sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
- return VPDT.properlyDominates(A, B);
- });
+static bool
+canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink) {
+ auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
+ if (!StoreLoc || !StoreLoc->AATags.Scope)
+ return false;
- VPReplicateRecipe *EarliestLoad = Group.front();
- VPBasicBlock *FirstBB = EarliestLoad->getParent();
- VPBasicBlock *LastBB = Group.back()->getParent();
+ // When sinking a group of stores, all members of the group alias each other.
+ // Skip them during the alias checks.
----------------
juliannagele wrote:
```suggestion
// Skip them during the alias checks.
```
https://github.com/llvm/llvm-project/pull/168771
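For readers following the transform, below is a minimal standalone model of the grouping step in collectComplementaryPredicatedMemOps. All names here (MemOp, collectGroups) are hypothetical stand-ins: plain integers take the place of the address SCEVs and mask VPValues, and a boolean flag models the m_Not(m_Specific(...)) pattern match from the patch. The value-type check, and the same-mask members the real code also admits into a group, are omitted for brevity.

```cpp
#include <cstdint>
#include <map>
#include <vector>

// Stand-in for a predicated VPReplicateRecipe: an address key (the real code
// uses the address SCEV) and a mask, where MaskIsNegated models NOT(Mask).
struct MemOp {
  uint64_t Addr;
  int Mask;
  bool MaskIsNegated;
};

// Group ops by address and keep only groups in which at least one pair of
// masks is complementary, i.e. M and NOT(M) of the same underlying mask.
static std::vector<std::vector<MemOp>>
collectGroups(const std::vector<MemOp> &Ops) {
  std::map<uint64_t, std::vector<MemOp>> ByAddr;
  for (const MemOp &Op : Ops)
    ByAddr[Op.Addr].push_back(Op);

  std::vector<std::vector<MemOp>> Groups;
  for (auto &Entry : ByAddr) {
    std::vector<MemOp> &Group = Entry.second;
    if (Group.size() < 2)
      continue;
    bool HasComplementaryMask = false;
    for (size_t I = 0; I < Group.size() && !HasComplementaryMask; ++I)
      for (size_t J = I + 1; J < Group.size(); ++J)
        if (Group[I].Mask == Group[J].Mask &&
            Group[I].MaskIsNegated != Group[J].MaskIsNegated) {
          HasComplementaryMask = true;
          break;
        }
    if (HasComplementaryMask)
      Groups.push_back(Group);
  }
  return Groups;
}

int main() {
  // Two ops at the same address under masks M and NOT(M) form one group;
  // the lone op at 0x20 has no complementary partner and is dropped.
  std::vector<MemOp> Ops = {{0x10, 1, false}, {0x10, 1, true}, {0x20, 2, false}};
  return collectGroups(Ops).size() == 1 ? 0 : 1;
}
```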
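A second sketch covers why the patch's findRecipeWithMinAlign picks the group member with the smallest alignment: the merged, unpredicated access executes unconditionally, so it may only claim an alignment that every member of the group guaranteed. LoadInfo and mergedAlignment below are illustrative names, not the VPlan API.

```cpp
#include <algorithm>
#include <vector>

struct LoadInfo {
  unsigned Align; // alignment in bytes annotated on one predicated load
};

// The conservative alignment for the merged load is the group's minimum:
// over-claiming alignment on the unconditional replacement would be UB.
static unsigned mergedAlignment(const std::vector<LoadInfo> &Group) {
  return std::min_element(Group.begin(), Group.end(),
                          [](const LoadInfo &A, const LoadInfo &B) {
                            return A.Align < B.Align;
                          })
      ->Align;
}

int main() {
  // Merging loads annotated with alignments {16, 4, 8} yields 4.
  std::vector<LoadInfo> Group = {{16}, {4}, {8}};
  return mergedAlignment(Group) == 4 ? 0 : 1;
}
```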
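Finally, a sketch of the hoist step in hoistPredicatedLoads: after sorting a group so the most-dominating load comes first, one unpredicated load replaces every member. Again, the names (Load, hoistGroup) are invented for illustration; the real code builds a mask-less VPReplicateRecipe, inserts it before the earliest load, and calls replaceAllUsesWith on each member.

```cpp
#include <algorithm>
#include <map>
#include <vector>

// Stand-in for a predicated load: a program position (models dominance as a
// total order, which the patch derives from the VPDominatorTree) and an id.
struct Load {
  unsigned Pos;
  int Id;
};

// Replace every load in the group with one unpredicated load placed at the
// earliest (most dominating) position; record the rewrites in Replacements.
static Load hoistGroup(std::vector<Load> Group,
                       std::map<int, int> &Replacements) {
  std::sort(Group.begin(), Group.end(),
            [](const Load &A, const Load &B) { return A.Pos < B.Pos; });
  Load Merged{Group.front().Pos, /*Id=*/-1}; // the new unpredicated load
  for (const Load &L : Group)
    Replacements[L.Id] = Merged.Id; // models replaceAllUsesWith
  return Merged;
}

int main() {
  std::map<int, int> Replacements;
  Load Merged = hoistGroup({{7, 2}, {3, 1}}, Replacements);
  // The merged load lands at position 3 and both originals point to it.
  return (Merged.Pos == 3 && Replacements.at(1) == -1 &&
          Replacements.at(2) == -1)
             ? 0
             : 1;
}
```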