[llvm-branch-commits] [llvm] [VPlan] Implement compressed widening of memory instructions (PR #166956)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Nov 7 07:24:02 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-analysis
Author: Sergey Kachkov (skachkov-sc)
Changes:
RFC link: https://discourse.llvm.org/t/rfc-loop-vectorization-of-compress-store-expand-load-patterns/86442
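For readers new to the pattern, here is a minimal scalar sketch of the kind of compress-store loop the RFC targets (the function and variable names are illustrative, not taken from the patch): consecutive stores guarded by a loop-varying condition, where the store index advances only for the active iterations; the expand-load pattern is the mirrored read side.

```cpp
// Illustrative compress-store loop: dst is written consecutively, but only
// on iterations where the condition holds, so the store index lags behind
// the loop induction variable.
void compress_positive(const int *src, int *dst, int n) {
  int k = 0;
  for (int i = 0; i < n; ++i)
    if (src[i] > 0)
      dst[k++] = src[i];
}
```

The patch adds a `CM_Compressed` widening decision so that such accesses can be widened into `llvm.masked.compressstore` / `llvm.masked.expandload` calls, as shown in the diff below.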
---
Full diff: https://github.com/llvm/llvm-project/pull/166956.diff
5 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+1)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+17-7)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+20-12)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+17-6)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+6-5)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..e8769f5860c77 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1442,6 +1442,7 @@ class TargetTransformInfo {
Normal, ///< The cast is used with a normal load/store.
Masked, ///< The cast is used with a masked load/store.
GatherScatter, ///< The cast is used with a gather/scatter.
+ Compressed, ///< The cast is used with an expand load/compress store.
Interleave, ///< The cast is used with an interleaved load/store.
Reversed, ///< The cast is used with a reversed load/store.
};
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 914018591d832..25e8a63eae9cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1027,6 +1027,7 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
+ CM_Compressed,
CM_Scalarize,
CM_VectorCall,
CM_IntrinsicCall
@@ -3108,9 +3109,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (IsUniformMemOpUse(I))
return true;
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
+ return (
+ WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave || WideningDecision == CM_Compressed);
};
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5191,12 +5192,17 @@ InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost(
Instruction *I, ElementCount VF, InstWidening Decision) {
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (Decision == CM_Compressed)
+ return TTI.getExpandCompressMemoryOpCost(I->getOpcode(), VectorTy,
+ /*VariableMask*/ true, Alignment,
+ CostKind, I);
+
assert((Decision == CM_Widen || Decision == CM_Widen_Reverse) &&
"Expected widen decision.");
- const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
if (Legal->isMaskRequired(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6299,6 +6305,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
switch (getWideningDecision(I, VF)) {
case LoopVectorizationCostModel::CM_GatherScatter:
return TTI::CastContextHint::GatherScatter;
+ case LoopVectorizationCostModel::CM_Compressed:
+ return TTI::CastContextHint::Compressed;
case LoopVectorizationCostModel::CM_Interleave:
return TTI::CastContextHint::Interleave;
case LoopVectorizationCostModel::CM_Scalarize:
@@ -7514,8 +7522,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
+ bool Compressed = Decision == LoopVectorizationCostModel::CM_Compressed;
bool Consecutive =
- Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ Reverse || Compressed || Decision == LoopVectorizationCostModel::CM_Widen;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -7545,11 +7554,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
- VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ Compressed, VPIRMetadata(*Load, LVer),
+ I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, VPIRMetadata(*Store, LVer),
+ Reverse, Compressed, VPIRMetadata(*Store, LVer),
I->getDebugLoc());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbb03fbdff7a2..26256951a9c6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3193,6 +3193,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// Whether the consecutive accessed addresses are in reverse order.
bool Reverse;
+ /// Whether the consecutive accessed addresses are compressed with mask value.
+ bool Compressed;
+
/// Whether the memory access is masked.
bool IsMasked = false;
@@ -3206,12 +3209,13 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
VPWidenMemoryRecipe(const char unsigned SC, Instruction &I,
std::initializer_list<VPValue *> Operands,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, bool Compressed,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPRecipeBase(SC, Operands, DL), VPIRMetadata(Metadata), Ingredient(I),
Alignment(getLoadStoreAlignment(&I)), Consecutive(Consecutive),
- Reverse(Reverse) {
+ Reverse(Reverse), Compressed(Compressed) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ assert((Consecutive || !Compressed) && "Compressed implies consecutive");
assert(isa<VPVectorEndPointerRecipe>(getAddr()) ||
!Reverse &&
"Reversed acccess without VPVectorEndPointerRecipe address?");
@@ -3241,6 +3245,9 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
/// order.
bool isReverse() const { return Reverse; }
+ /// Return whether the consecutive loaded/stored addresses are compressed.
+ bool isCompressed() const { return Compressed; }
+
/// Return the address accessed by this recipe.
VPValue *getAddr() const { return getOperand(0); }
@@ -3274,18 +3281,18 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
- bool Consecutive, bool Reverse,
+ bool Consecutive, bool Reverse, bool Compressed,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive,
- Reverse, Metadata, DL),
+ Reverse, Compressed, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}
VPWidenLoadRecipe *clone() override {
return new VPWidenLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
- getMask(), Consecutive, Reverse, *this,
- getDebugLoc());
+ getMask(), Consecutive, Reverse, Compressed,
+ *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC);
@@ -3316,8 +3323,8 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
- L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(),
+ L.isCompressed(), L, L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3355,16 +3362,16 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal,
VPValue *Mask, bool Consecutive, bool Reverse,
- const VPIRMetadata &Metadata, DebugLoc DL)
+ bool Compressed, const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal},
- Consecutive, Reverse, Metadata, DL) {
+ Consecutive, Reverse, Compressed, Metadata, DL) {
setMask(Mask);
}
VPWidenStoreRecipe *clone() override {
return new VPWidenStoreRecipe(cast<StoreInst>(Ingredient), getAddr(),
getStoredValue(), getMask(), Consecutive,
- Reverse, *this, getDebugLoc());
+ Reverse, Compressed, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC);
@@ -3399,7 +3406,8 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
- S.isReverse(), S, S.getDebugLoc()) {
+ S.isReverse(), S.isCompressed(), S,
+ S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 80cd112dbcd8a..0b0bd63ee2b28 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3565,8 +3565,12 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
InstructionCost Cost = 0;
if (IsMasked) {
- Cost +=
- Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind);
+ Cost += Compressed
+ ? Ctx.TTI.getExpandCompressMemoryOpCost(Opcode, Ty,
+ /*VariableMask*/ true,
+ Alignment, Ctx.CostKind)
+ : Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS,
+ Ctx.CostKind);
} else {
TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this) ? getOperand(0)
@@ -3603,9 +3607,13 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
"wide.masked.gather");
} else if (Mask) {
- NewLI =
- Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
- PoisonValue::get(DataTy), "wide.masked.load");
+ NewLI = Compressed
+ ? Builder.CreateMaskedExpandLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.expand.load")
+ : Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+ PoisonValue::get(DataTy),
+ "wide.masked.load");
} else {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
@@ -3732,7 +3740,10 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
if (CreateScatter)
NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
else if (Mask)
- NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+ NewSI = Compressed
+ ? Builder.CreateMaskedCompressStore(StoredVal, Addr, Alignment,
+ Mask)
+ : Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
else
NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
applyMetadata(*NewSI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697397f41..cdfbc531ebfa6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -91,13 +91,14 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
NewRecipe = new VPWidenLoadRecipe(
*Load, Ingredient.getOperand(0), nullptr /*Mask*/,
- false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
- Ingredient.getDebugLoc());
+ false /*Consecutive*/, false /*Reverse*/, false /*Compressed*/,
+ VPIRMetadata(*Load), Ingredient.getDebugLoc());
} else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
NewRecipe = new VPWidenStoreRecipe(
*Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
- VPIRMetadata(*Store), Ingredient.getDebugLoc());
+ false /*Compressed*/, VPIRMetadata(*Store),
+ Ingredient.getDebugLoc());
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
@@ -4207,7 +4208,7 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
auto *L = new VPWidenLoadRecipe(
*LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
- /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
+ /*Reverse=*/false, /*Compressed=*/false, {}, LoadGroup->getDebugLoc());
L->insertBefore(LoadGroup);
NarrowedOps.insert(L);
return L;
@@ -4344,7 +4345,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
auto *S = new VPWidenStoreRecipe(
*SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
- /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
+ /*Reverse=*/false, /*Compressed=*/false, {}, StoreGroup->getDebugLoc());
S->insertBefore(StoreGroup);
StoreGroup->eraseFromParent();
}
``````````
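For reference, a rough scalar model of the semantics of the two intrinsics emitted by the new `Compressed` path (the templates, fixed-size arrays, and names below are illustrative only, not part of the patch): unlike an ordinary masked load/store, the memory index advances only for enabled lanes, so the active elements occupy consecutive memory locations.

```cpp
#include <cstddef>

// Scalar reference for llvm.masked.expandload: reads consecutive elements
// from memory and scatters them into the active lanes; inactive lanes keep
// their previous contents (the passthru operand in IR).
template <typename T, size_t VF>
void expandload_ref(const T *ptr, const bool (&mask)[VF], T (&out)[VF]) {
  size_t k = 0;
  for (size_t lane = 0; lane < VF; ++lane)
    if (mask[lane])
      out[lane] = ptr[k++];
}

// Scalar reference for llvm.masked.compressstore: packs the active lanes of
// the vector into consecutive memory locations.
template <typename T, size_t VF>
void compressstore_ref(const T (&val)[VF], T *ptr, const bool (&mask)[VF]) {
  size_t k = 0;
  for (size_t lane = 0; lane < VF; ++lane)
    if (mask[lane])
      ptr[k++] = val[lane];
}
```

This lane packing is also why the recipe asserts `Consecutive || !Compressed` and why the cost of such accesses is queried via `getExpandCompressMemoryOpCost` rather than `getMaskedMemoryOpCost` in the diff above.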
https://github.com/llvm/llvm-project/pull/166956