[llvm] [VPlan] Introduce ComputeReductionResult VPInstruction opcode. (PR #70253)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 24 13:35:07 PST 2023
================
@@ -397,6 +398,132 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
return CondBr;
}
+ case VPInstruction::ComputeReductionResult: {
+ if (Part != 0)
+ return State.get(
+ this, VPIteration(State.UF - 1, VPLane::getLastLaneForVF(State.VF)));
+
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(getOperand(0));
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ // Get it's reduction variable descriptor.
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
+ State.setDebugLocFrom(I->getDebugLoc());
+
+ VPValue *LoopExitInstDef = getOperand(1);
+ Type *PhiTy = OrigPhi->getType();
+ VectorParts RdxParts(State.UF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ RdxParts[Part] = State.get(LoopExitInstDef, Part);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ BasicBlock *LoopMiddleBlock = State.CFG.VPBB2IRBB[getParent()];
+ if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ Builder.SetInsertPoint(LoopMiddleBlock,
+ LoopMiddleBlock->getFirstInsertionPt());
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ }
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+
+ // The middle block terminator has already been assigned a DebugLoc here
+ // (the OrigLoop's single latch terminator). We want the whole middle block
+ // to appear to execute on this line because: (a) it is all compiler
+ // generated, (b) these instructions are always executed after evaluating
+ // the latch conditional branch, and (c) other passes may add new
+ // predecessors which terminate on this line. This is the easiest way to
+ // ensure we don't accidentally cause an extra step back into the loop while
+ // debugging.
+ State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
+ if (PhiR->isOrdered()) {
+ ReducedPartRdx = RdxParts[State.UF - 1];
+ } else {
+ // Floating-point operations should have some FMF to enable the reduction.
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+ for (unsigned Part = 1; Part < State.UF; ++Part) {
+ Value *RdxPart = RdxParts[Part];
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ ReducedPartRdx = Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+ else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
+ ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
+ else
+ ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+ }
+ }
+
+ // Create the reduction after the loop. Note that inloop reductions create
+ // the target reduction in the loop using a Reduction recipe.
+ if (State.VF.isVector() && !PhiR->isInLoop()) {
+ ReducedPartRdx =
+ createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (PhiTy != RdxDesc.getRecurrenceType())
+ ReducedPartRdx = RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
+ : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+ }
+
+ auto *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
+ auto *OrigLoop = State.LI->getLoopFor(OrigPhi->getParent());
+ // TODO: bc.merge.rdx should not be created here, instead it should be
+ // modeled in VPlan.
+ BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ auto *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
----------------
ayalz wrote:
Should ComputeReductionResult go ahead and record BCBlockPhi inside some `State.ReductionResumeValues[&RdxDesc] = BCBlockPhi;` when creating it here, similar to how VPExpandSCEVRecipe::execute() records `State.ExpandedSCEVs[Expr] = Res;`, rather than having `getMergePhiForReduction()` search for it later. The latter is arguably more encapsulated, the former seems more consistent?
https://github.com/llvm/llvm-project/pull/70253
More information about the llvm-commits
mailing list