[llvm] [VPlan] Implement interleaving as VPlan-to-VPlan transform. (PR #95842)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 18 13:11:16 PDT 2024


================
@@ -1582,3 +1583,434 @@ void VPlanTransforms::createInterleaveGroups(
       }
   }
 }
+
+namespace {
+
+/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
+/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
+/// the unrolling transformation, where the original VPValues are retained for
+/// part zero.
+class UnrollState {
+  /// Plan to unroll.
+  VPlan &Plan;
+  /// Unroll factor to unroll by.
+  const unsigned UF;
+  /// Analysis for types.
+  VPTypeAnalysis TypeInfo;
+
+  /// Unrolling may create recipes that should not be unrolled themselves.
+  /// Those are tracked in ToSkip.
+  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
+
+  /// Associate with each VPValue of part 0 its unrolled instances of parts
+  /// 1, ..., UF-1.
+  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
+
+  void unrollReplicateRegion(VPRegionBlock *VPR);
+  void unrollRecipe(VPRecipeBase &R);
+  void unrollHeaderPHI(VPRecipeBase &R, VPBasicBlock::iterator InsertPtForPhi);
+  void unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+                            VPBasicBlock::iterator InsertPtForPhi);
+
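+  /// Return a live-in VPValue wrapping the constant \p Part, using the
+  /// canonical IV's scalar type.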
+  VPValue *getConstantVPV(unsigned Part) {
+    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
+    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+  }
+
+public:
+  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
+      : Plan(Plan), UF(UF),
+        TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx) {}
+
+  void unrollBlock(VPBlockBase *VPB);
+
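+  /// Return the VPValue mapped to \p V for part \p Part. The original \p V is
+  /// returned directly for part 0 and for live-ins; later parts must already
+  /// have been recorded in VPV2Parts.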
+  VPValue *getValueForPart(VPValue *V, unsigned Part) {
+    if (Part == 0 || V->isLiveIn())
+      return V;
+    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
+           "accessed value does not exist");
+    return VPV2Parts[V][Part - 1];
+  }
+
+  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
+  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
+  /// corresponding VPValue defined by \p CopyR.
+  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
+                        unsigned Part) {
+    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
+      auto Ins = VPV2Parts.insert({VPV, {}});
+      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
+      Ins.first->second.push_back(CopyR->getVPValue(Idx));
+    }
+  }
+
+  /// Given a uniform recipe \p R, add it for all parts.
+  void addUniformForAllParts(VPSingleDefRecipe *R) {
+    auto Ins = VPV2Parts.insert({R, {}});
+    assert(Ins.second && "uniform value already added");
+    for (unsigned Part = 0; Part != UF; ++Part)
+      Ins.first->second.push_back(R);
+  }
+
+  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
+
+  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for
+  /// part \p Part.
+  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
+    auto *Op = R->getOperand(OpIdx);
+    R->setOperand(OpIdx, getValueForPart(Op, Part));
+  }
+
+  /// Update \p R's operands with their corresponding VPValues for part
+  /// \p Part.
+  void remapOperands(VPRecipeBase *R, unsigned Part) {
+    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
+      R->setOperand(OpIdx, getValueForPart(Op, Part));
+  }
+};
+} // namespace
+
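+/// Unroll a replicate region by cloning it once for each part 1, ..., UF-1
+/// and inserting each clone before the region's single successor. Each clone
+/// mirrors part 0 block-for-block and recipe-for-recipe, so zipping the two
+/// shallow traversals pairs every copied recipe with its original for operand
+/// remapping.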
+void UnrollState::unrollReplicateRegion(VPRegionBlock *VPR) {
+  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    auto *Copy = VPR->clone();
+    VPBlockUtils::insertBlockBefore(Copy, InsertPt);
+
+    auto PartI = vp_depth_first_shallow(Copy->getEntry());
+    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
+    for (const auto &[PartIVPBB, Part0VPBB] :
+         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
+             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
+      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
+        remapOperands(&PartIR, Part);
+        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
+          ScalarIVSteps->addOperand(getConstantVPV(Part));
+        }
+
+        addRecipeForPart(&Part0R, &PartIR, Part);
+      }
+    }
+  }
+}
+
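+/// Unroll a widened induction recipe: materialize the per-part step
+/// VectorStep = VF * Step in the preheader (with casts to the IV type where
+/// needed), then create UF-1 "step.add" recipes that each add VectorStep to
+/// the previous part.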
+void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+                                       VPBasicBlock::iterator InsertPtForPhi) {
+  VPBasicBlock *PH = cast<VPBasicBlock>(
+      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
+  Type *IVTy = TypeInfo.inferScalarType(IV);
+  auto &ID = IV->getInductionDescriptor();
+  FastMathFlags FMFs;
+  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
+    FMFs = ID.getInductionBinOp()->getFastMathFlags();
+
+  VPValue *VectorStep = &Plan.getVF();
+  VPBuilder Builder(PH);
+  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
+    Instruction::CastOps CastOp =
+        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
+    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
+    ToSkip.insert(VectorStep->getDefiningRecipe());
+  }
+
+  VPValue *ScalarStep = IV->getStepValue();
+  auto *ConstStep = ScalarStep->isLiveIn()
+                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
+                        : nullptr;
+  if (!ConstStep || ConstStep->getZExtValue() != 1) {
+    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
+      ScalarStep =
+          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
+      ToSkip.insert(ScalarStep->getDefiningRecipe());
+    }
+
+    VPInstruction *Mul;
+    if (IVTy->isFloatingPointTy())
+      Mul = Builder.createFPOp(Instruction::FMul, {VectorStep, ScalarStep},
+                               FMFs, IV->getDebugLoc());
+    else
+      Mul = Builder.createNaryOp(Instruction::Mul, {VectorStep, ScalarStep},
+                                 IV->getDebugLoc());
+    VectorStep = Mul;
+    ToSkip.insert(Mul);
+  }
+
+  // Now create recipes to compute the induction steps for parts 1 .. UF-1.
+  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
+  // the previous part. The header phi recipe will get 2 new operands: the
+  // step value for a single part and the last part, used to compute the
+  // backedge value during VPWidenIntOrFpInductionRecipe::execute:
+  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
+  //             %VectorStep, %Part.3
+  //   %Part.1 = %Part.0 + %VectorStep
+  //   %Part.2 = %Part.1 + %VectorStep
+  //   %Part.3 = %Part.2 + %VectorStep
+  //
+  // The newly added recipes are added to ToSkip to avoid unrolling them
+  // again.
+  VPValue *Prev = IV;
+  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    VPInstruction *Add;
+    std::string Name =
+        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";
+
+    if (IVTy->isFloatingPointTy())
+      Add = Builder.createFPOp(ID.getInductionOpcode(),
+                               {
+                                   Prev,
+                                   VectorStep,
+                               },
+                               FMFs, IV->getDebugLoc(), Name);
+    else
+      Add = Builder.createNaryOp(Instruction::Add,
+                                 {
+                                     Prev,
+                                     VectorStep,
+                                 },
+                                 IV->getDebugLoc(), Name);
+    ToSkip.insert(Add);
+    addRecipeForPart(IV, Add, Part);
+    Prev = Add;
+  }
+  IV->addOperand(VectorStep);
+  IV->addOperand(Prev);
+}
+
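+/// Unroll a header phi recipe. Some header phis need no per-part copies:
+/// first-order recurrences and ordered reductions keep a single phi, and
+/// widened int/fp inductions are handled by materializing step vectors
+/// instead. The remaining header phis are cloned once for each part 1 ..
+/// UF-1.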
+void UnrollState::unrollHeaderPHI(VPRecipeBase &R,
+                                  VPBasicBlock::iterator InsertPtForPhi) {
+  // First-order recurrences pass a single vector or scalar through their header
+  // phis, irrespective of interleaving.
+  if (isa<VPFirstOrderRecurrencePHIRecipe>(&R))
+    return;
+
+  // Generate step vectors for each unrolled part.
+  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+    unrollWidenInduction(IV, InsertPtForPhi);
+    return;
+  }
+
+  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(&R);
+  if (RdxPhi && RdxPhi->isOrdered())
+    return;
+
+  auto InsertPt = std::next(R.getIterator());
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    VPRecipeBase *Copy = R.clone();
+    Copy->insertBefore(*R.getParent(), InsertPt);
+    addRecipeForPart(&R, Copy, Part);
+    if (isa<VPWidenPointerInductionRecipe>(&R)) {
+      Copy->addOperand(R.getVPSingleValue());
+      Copy->addOperand(getConstantVPV(Part));
+    } else if (RdxPhi) {
+      Copy->addOperand(getConstantVPV(Part));
+    } else {
+      assert(isa<VPActiveLaneMaskPHIRecipe>(&R) &&
+             "unexpected header phi recipe not needing unrolled part");
+    }
+  }
+}
+
+/// Handle non-header-phi recipes.
+void UnrollState::unrollRecipe(VPRecipeBase &R) {
+  using namespace llvm::VPlanPatternMatch;
+  if (match(&R, m_BranchOnCond(m_VPValue())) ||
+      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+    return;
+
+  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
+    if (vputils::onlyFirstPartUsed(VPI)) {
+      addUniformForAllParts(VPI);
+      return;
+    }
+    if (match(VPI, m_VPInstruction<VPInstruction::CalculateTripCountMinusVF>(
+                       m_VPValue()))) {
+      addUniformForAllParts(VPI);
+      return;
+    }
+
+    VPValue *Op0;
+    VPValue *Op1;
+    if (match(VPI, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+                       m_VPValue(Op0), m_VPValue(Op1)))) {
+      addUniformForAllParts(VPI);
+      if (Plan.hasScalarVFOnly()) {
+        // Extracting from end with VF = 1 implies retrieving the scalar
+        // part UF - Op1.
+        unsigned Offset =
+            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
+        VPI->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
+      } else {
+        // Otherwise we extract from the last part.
+        remapOperands(VPI, UF - 1);
+      }
+      return;
+    }
+  }
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
+        RepR->getOperand(1)->isDefinedOutsideVectorRegions()) {
+      // Stores to an invariant address only need to store the last part.
+      remapOperands(&R, UF - 1);
+      return;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
+      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
+        addUniformForAllParts(RepR);
+        return;
+      }
+    }
+  }
+
+  // Unroll non-uniform recipes.
+  auto InsertPt = std::next(R.getIterator());
+  VPBasicBlock &VPBB = *R.getParent();
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    VPRecipeBase *Copy = R.clone();
+    Copy->insertBefore(VPBB, InsertPt);
+    addRecipeForPart(&R, Copy, Part);
+
+    VPValue *Op;
+    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
+                      m_VPValue(), m_VPValue(Op)))) {
+      Copy->setOperand(0, getValueForPart(Op, Part - 1));
+      Copy->setOperand(1, getValueForPart(Op, Part));
+      continue;
+    }
+    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
+      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
+      if (Phi->isOrdered()) {
+        auto Ins = VPV2Parts.insert({Phi, {}});
+        if (Part == 1) {
+          Ins.first->second.clear();
+          Ins.first->second.push_back(Red);
+        }
+        Ins.first->second.push_back(Copy->getVPSingleValue());
+        Phi->setOperand(1, Copy->getVPSingleValue());
+      }
+    }
+    remapOperands(Copy, Part);
+
+    // For recipes that still require it, add an operand indicating the part
+    // to generate code for.
+    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
+            VPVectorPointerRecipe>(Copy) ||
+        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
+                        m_VPValue())))
+      Copy->addOperand(getConstantVPV(Part));
+
+    if (isa<VPVectorPointerRecipe>(R))
+      Copy->setOperand(0, R.getOperand(0));
+  }
+}
+
+using namespace llvm::VPlanPatternMatch;
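+/// Unroll \p VPB: replicate regions are cloned wholesale per part, other
+/// regions are traversed in RPO so defs are unrolled before uses across
+/// blocks, and the recipes of plain basic blocks are unrolled in place.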
+void UnrollState::unrollBlock(VPBlockBase *VPB) {
+  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
+  if (VPR) {
+    if (VPR->isReplicator())
+      return unrollReplicateRegion(VPR);
+
+    // Traverse blocks in region in RPO to ensure defs are visited before uses
+    // across blocks.
+    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
+        RPOT(VPR->getEntry());
+    for (VPBlockBase *VPB : RPOT)
+      unrollBlock(VPB);
+    return;
+  }
+
+  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
+  auto *VPBB = cast<VPBasicBlock>(VPB);
+  auto InsertPtForPhi = VPBB->getFirstNonPhi();
+  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
+      continue;
+
+    // Add the VPValues of all parts as operands to ComputeReductionResult,
+    // which combines the parts to compute the final reduction value.
+    VPValue *Op1;
+    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
+                      m_VPValue(), m_VPValue(Op1)))) {
+      addUniformForAllParts(cast<VPInstruction>(&R));
+      for (unsigned Part = 1; Part != UF; ++Part)
+        R.addOperand(getValueForPart(Op1, Part));
+      continue;
+    }
+
+    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
+    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
+      addUniformForAllParts(SingleDef);
+      continue;
+    }
+
+    if (isa<VPHeaderPHIRecipe>(&R)) {
+      unrollHeaderPHI(R, InsertPtForPhi);
+      continue;
+    }
+
+    unrollRecipe(R);
+  }
+}
+
+/// Remove recipes that are redundant after unrolling.
+static void cleanupRedundantRecipesAfterUnroll(VPlan &Plan) {
----------------
ayalz wrote:

Can be a lambda inside unrollByUF()?
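
Roughly, assuming unrollByUF() is the driver this patch adds and stays the
helper's only caller, the suggestion might look like this (a hypothetical
sketch, not the patch's actual code):

    void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF,
                                     LLVMContext &Ctx) {
      assert(UF > 0 && "unroll factor must be positive");
      if (UF == 1)
        return;

      UnrollState Unroller(Plan, UF, Ctx);
      ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
          RPOT(Plan.getEntry());
      for (VPBlockBase *VPB : RPOT)
        Unroller.unrollBlock(VPB);

      // Former file-level static helper, now local to its single caller.
      auto CleanupRedundantRecipes = [&Plan]() {
        // ... remove recipes made redundant by unrolling ...
      };
      CleanupRedundantRecipes();
    }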

https://github.com/llvm/llvm-project/pull/95842

