[llvm] [VPlan] Implement interleaving as VPlan-to-VPlan transform. (PR #95842)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 20 11:38:23 PDT 2024
================
@@ -1582,3 +1583,449 @@ void VPlanTransforms::createInterleaveGroups(
}
}
}
+
+namespace {
+
+/// State shared by the individual steps of unrolling a VPlan by a factor UF.
+/// The original VPValues keep serving as part zero; the copies created for the
+/// remaining UF-1 parts are recorded here so that operands of later recipes
+/// can be rewired to the value of the matching part.
+class UnrollState {
+  /// Plan to unroll.
+  VPlan &Plan;
+  /// Unroll factor to unroll by.
+  const unsigned UF;
+  /// Analysis for types.
+  VPTypeAnalysis TypeInfo;
+
+  /// Recipes created while unrolling that must not be unrolled themselves.
+  SmallPtrSet<VPRecipeBase *, 8> ToSkip;
+
+  /// Maps each VPValue of part 0 to its unrolled instances for parts 1, ...,
+  /// UF-1. The instance for part P is stored at index P - 1.
+  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
+
+  void unrollReplicateRegion(VPRegionBlock *VPR);
+  void unrollRecipe(VPRecipeBase &R);
+  void unrollHeaderPHI(VPRecipeBase &R, VPBasicBlock::iterator InsertPtForPhi);
+  void unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+                            VPBasicBlock::iterator InsertPtForPhi);
+
+  /// Return a live-in of \p Plan holding the constant \p Part, using the
+  /// scalar type of the canonical IV.
+  VPValue *getConstantVPV(unsigned Part) {
+    auto *PartC =
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), Part);
+    return Plan.getOrAddLiveIn(PartC);
+  }
+
+public:
+  /// Set up the state for unrolling \p Plan by \p UF. \p Ctx is not used by
+  /// this constructor.
+  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
+      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}
+
+  void unrollBlock(VPBlockBase *VPB);
+
+  /// Return the instance of \p V for \p Part. Part 0 and live-ins are
+  /// represented by \p V itself; any other value must have been registered
+  /// for \p Part beforehand.
+  VPValue *getValueForPart(VPValue *V, unsigned Part) {
+    if (Part == 0 || V->isLiveIn())
+      return V;
+    auto It = VPV2Parts.find(V);
+    assert((It != VPV2Parts.end() && It->second.size() >= Part) &&
+           "accessed value does not exist");
+    return It->second[Part - 1];
+  }
+
+  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
+  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
+  /// corresponding VPValue defined by \p CopyR. Parts have to be registered in
+  /// increasing order, starting at part 1.
+  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
+                        unsigned Part) {
+    for (const auto &[Idx, OrigV] : enumerate(OrigR->definedValues())) {
+      SmallVector<VPValue *> &Parts =
+          VPV2Parts.try_emplace(OrigV).first->second;
+      assert(Parts.size() == Part - 1 && "earlier parts not set");
+      Parts.push_back(CopyR->getVPValue(Idx));
+    }
+  }
+
+  /// Given a uniform recipe \p R, add it for all parts, i.e. every part maps
+  /// to \p R itself.
+  void addUniformForAllParts(VPSingleDefRecipe *R) {
+    auto [It, Inserted] = VPV2Parts.insert({R, {}});
+    assert(Inserted && "uniform value already added");
+    // NOTE(review): this records UF entries, one more than the UF-1 entries
+    // used for other values. getValueForPart only reads index Part - 1, so
+    // the extra entry looks harmless -- confirm before relying on the size.
+    It->second.append(UF, R);
+  }
+
+  /// Return true if unrolled instances have been recorded for \p VPV.
+  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }
+
+  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
+  /// \p Part.
+  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
+    VPValue *PartV = getValueForPart(R->getOperand(OpIdx), Part);
+    R->setOperand(OpIdx, PartV);
+  }
+
+  /// Update \p R's operands with their corresponding VPValues for part \p
+  /// Part.
+  void remapOperands(VPRecipeBase *R, unsigned Part) {
+    for (unsigned OpIdx = 0, NumOps = R->getNumOperands(); OpIdx != NumOps;
+         ++OpIdx)
+      remapOperand(R, OpIdx, Part);
+  }
+};
+} // namespace
+
+/// Unroll the replicate region \p VPR. For every part 1, ..., UF-1 a clone of
+/// \p VPR is inserted before the single successor of \p VPR, the operands of
+/// the cloned recipes are remapped to the values of that part, and the values
+/// defined by the clones are registered as that part's instances of the
+/// original values.
+void UnrollState::unrollReplicateRegion(VPRegionBlock *VPR) {
+  VPBlockBase *Successor = VPR->getSingleSuccessor();
+  for (unsigned Part = 1; Part != UF; ++Part) {
+    auto *ClonedRegion = VPR->clone();
+    VPBlockUtils::insertBlockBefore(ClonedRegion, Successor);
+
+    // Walk the clone and the original in lock-step, block by block and recipe
+    // by recipe, so that each cloned recipe is paired with its original.
+    auto ClonedBlocks = vp_depth_first_shallow(ClonedRegion->getEntry());
+    auto OrigBlocks = vp_depth_first_shallow(VPR->getEntry());
+    for (const auto &[ClonedVPBB, OrigVPBB] :
+         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(ClonedBlocks),
+             VPBlockUtils::blocksOnly<VPBasicBlock>(OrigBlocks))) {
+      for (const auto &[ClonedR, OrigR] : zip(*ClonedVPBB, *OrigVPBB)) {
+        remapOperands(&ClonedR, Part);
+
+        // Cloned scalar IV steps receive the part number as an additional
+        // operand.
+        if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&ClonedR))
+          Steps->addOperand(getConstantVPV(Part));
+
+        addRecipeForPart(&OrigR, &ClonedR, Part);
+      }
+    }
+  }
+}
+
+void UnrollState::unrollWidenInduction(VPWidenIntOrFpInductionRecipe *IV,
+ VPBasicBlock::iterator InsertPtForPhi) {
+ VPBasicBlock *PH = cast<VPBasicBlock>(
+ IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
+ Type *IVTy = TypeInfo.inferScalarType(IV);
+ auto &ID = IV->getInductionDescriptor();
+ FastMathFlags FMFs;
+ if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
+ FMFs = ID.getInductionBinOp()->getFastMathFlags();
+
+ VPValue *VectorStep = &Plan.getVF();
+ VPBuilder Builder(PH);
+ if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
+ Instruction::CastOps CastOp =
+ IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
+ VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
+ ToSkip.insert(VectorStep->getDefiningRecipe());
+ }
+
+ VPValue *ScalarStep = IV->getStepValue();
+ auto *ConstStep = ScalarStep->isLiveIn()
+ ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
+ : nullptr;
+ if (!ConstStep || ConstStep->getZExtValue() != 1) {
+ if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
+ ScalarStep =
+ Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
+ ToSkip.insert(ScalarStep->getDefiningRecipe());
+ }
+
+ VPInstruction *Mul;
+ if (IVTy->isFloatingPointTy())
+ Mul = Builder.createFPOp(Instruction::FMul, {VectorStep, ScalarStep},
+ FMFs, IV->getDebugLoc());
+ else
+ Mul = Builder.createNaryOp(Instruction::Mul, {VectorStep, ScalarStep},
+ IV->getDebugLoc());
+ VectorStep = Mul;
+ ToSkip.insert(Mul);
+ }
+
+ // Now create recipes to compute the induction steps for parts 1 .. UF-1.
+ // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
+ // the previous part. The header phi recipe will get 2 new operands: the step
+ // value for a single part and the last part, used to compute the backedge
+ // value during VPWidenIntOrFpInductionRecipe::execute. For example, for
+ // UF = 4:
+ //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
+ //             %VectorStep, %Part.3
+ //   %Part.1 = %Part.0 + %VectorStep
+ //   %Part.2 = %Part.1 + %VectorStep
+ //   %Part.3 = %Part.2 + %VectorStep
+ //
+ // The newly added recipes are added to ToSkip to avoid interleaving them
+ // again.
+ VPValue *Prev = IV;
+ Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
----------------
fhahn wrote:
Updated to replace createFPOp with createNaryOp, which takes optional fast-math flags, allowing the code to be unified.
> Could ID.getInductionOpcode() work for non-floats too?
Unfortunately no, as it may be nullptr in some cases for ints (e.g. extends)
https://github.com/llvm/llvm-project/pull/95842
More information about the llvm-commits
mailing list