[llvm] [LV][VPlan] Add initial support for CSA vectorization (PR #106560)
Michael Maitland via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 3 12:00:47 PDT 2024
================
@@ -8540,6 +8599,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, Operands, VPBB);
}
+/// Add the CSA recipes that can be created before each instruction in the input IR
+/// is processed and introduced into VPlan; registers one VPCSAState in \p Plan per entry of \p CSAs.
+static void
+addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
+ VPlan &Plan) {
+
+ // Don't build the full CSA for scalar VFs (VF.isScalar(), e.g. ElementCount::getFixed(1)).
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range);
+
+ for (const auto &CSA : CSAs) {
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn(
+ CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); // value incoming from the original loop's preheader
+
+ // Scalar VF builds the scalar version of the loop. In that case, neither
+ // maintenance of the mask nor extraction in the middle block is needed.
+ if (IsScalarVF) {
+ VPCSAState *S = new VPCSAState(VPInitScalar); // state built from the initial scalar only
+ Plan.addCSAState(CSA.first, S); // NOTE(review): presumably Plan takes ownership of S — confirm
+ continue;
+ }
+
+ auto *VPInitMask =
+ new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
+ auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
+ {VPInitScalar}, DL, "csa.init.data");
+ PreheaderVPBB->appendRecipe(VPInitMask); // both init recipes are appended to the preheader block
+ PreheaderVPBB->appendRecipe(VPInitData);
+
+ auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
+ DL, "csa.mask.phi");
+ HeaderVPBB->appendRecipe(VPMaskPhi); // the mask phi is appended to the header block
+
+ auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi); // state built from init scalar, init data and mask phi
+ Plan.addCSAState(CSA.first, S); // keyed by CSA.first, the same key the scalar path uses
+ }
+}
+
+/// Add CSA Recipes that must occur after each instruction in the input IR
+/// is processed and introduced into VPlan.
+static void
+addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
+ const LoopVectorizationLegality::CSAList &CSAs,
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
+ VPlan &Plan) {
+ // Don't build CSA for VF=ElementCount::getFixed(1)
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return;
+
+ for (const auto &CSA : CSAs) {
+ VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
+
+ assert(VPDataUpdate &&
+ "VPDataUpdate must have been introduced prior to postprocess");
+ assert(CSA.second.getCond() &&
+ "CSADescriptor must know how to describe the condition");
+ auto GetVPValue = [&](Value *I) {
+ return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
+ };
+ VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
+ VPValue *VPInitScalar = CSAState->getVPInitScalar();
+
+ // The CSA optimization wants to use a condition such that when it is
+ // true, a new value is assigned. However, it is possible that a true lane
+ // in WidenedCond corresponds to selection of the initial value instead.
+ // In that case, we must use the negation of WidenedCond.
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
+ VPValue *CondToUse = WidenedCond;
+ if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
+ CSA.first) {
+ auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
+ VPNotCond->insertBefore(
+ GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
----------------
michaelmaitland wrote:
Updated to use VPBuilder to create the VPInstructions, and used the builder to insert them where appropriate.
https://github.com/llvm/llvm-project/pull/106560
More information about the llvm-commits
mailing list