[llvm] e4ea099 - Revert "[VPlan] Insert Trunc/Exts for reductions directly in VPlan."
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 18 23:25:36 PDT 2023
Author: Fangrui Song
Date: 2023-10-18T23:25:31-07:00
New Revision: e4ea0997486000b460c4875a00301b73b3c0d6a7
URL: https://github.com/llvm/llvm-project/commit/e4ea0997486000b460c4875a00301b73b3c0d6a7
DIFF: https://github.com/llvm/llvm-project/commit/e4ea0997486000b460c4875a00301b73b3c0d6a7.diff
LOG: Revert "[VPlan] Insert Trunc/Exts for reductions directly in VPlan."
This reverts commit fd311126349b8fe1684d62154a9fa5a7bbb0b713.
There are two different crash reports on https://github.com/llvm/llvm-project/commit/fd311126349b8fe1684d62154a9fa5a7bbb0b713
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 14c5c0d18a4db69..aa435b0d47aa599 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3792,6 +3792,8 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
State.setDebugLocFrom(I->getDebugLoc());
VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
+ // This is the vector-clone of the value that leaves the loop.
+ Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
@@ -3803,6 +3805,10 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
State.setDebugLocFrom(LoopExitInst->getDebugLoc());
Type *PhiTy = OrigPhi->getType();
+
+ VPBasicBlock *LatchVPBB =
+ PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
+ BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
@@ -3828,12 +3834,23 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
+ assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
+ if (U != Trunc) {
+ U->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
+ }
}
+ Builder.SetInsertPoint(LoopMiddleBlock,
+ LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned Part = 0; Part < UF; ++Part)
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
}
// Reduce all of the unrolled parts into a single vector.
@@ -9138,19 +9155,18 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
PreviousLink = RedRecipe;
}
}
+
+ // If tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the beginning of the
+ // dedicated latch block.
+ if (CM.foldTailByMasking()) {
Builder.setInsertPoint(&*LatchVPBB->begin());
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (!PhiR || PhiR->isInLoop())
- continue;
-
- const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
- // If tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the beginning of the
- // dedicated latch block.
- if (CM.foldTailByMasking()) {
+ VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || PhiR->isInLoop())
+ continue;
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
VPValue *Red = PhiR->getBackedgeValue();
@@ -9158,35 +9174,16 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
"reduction recipe must be defined before latch");
FastMathFlags FMFs = RdxDesc.getFastMathFlags();
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
- Result =
+ auto *Select =
PhiTy->isFloatingPointTy()
? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
: new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
- Result->insertBefore(&*Builder.getInsertPoint());
+ Select->insertBefore(&*Builder.getInsertPoint());
if (PreferPredicatedReductionSelect ||
TTI.preferPredicatedReductionSelect(
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags()))
- PhiR->setOperand(1, Result->getVPSingleValue());
- }
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
- if (PhiTy != RdxDesc.getRecurrenceType()) {
- assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
- Type *RdxTy = RdxDesc.getRecurrenceType();
- auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
- Result->getVPSingleValue(), RdxTy);
- auto *Extnd =
- RdxDesc.isSigned()
- ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
- : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
-
- Trunc->insertAfter(Result);
- Extnd->insertAfter(Trunc);
- Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
- Trunc->setOperand(0, Result->getVPSingleValue());
+ PhiR->setOperand(1, Select);
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 03903d80cfd6ecb..7a3c7d6fbfea71f 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -207,10 +207,10 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
-; CHECK-NEXT: [[TMP8]] = zext <4 x i16> [[TMP7]] to <4 x i32>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
+; CHECK-NEXT: [[TMP8]] = zext <4 x i16> [[TMP7]] to <4 x i32>
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i16>
@@ -234,10 +234,10 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i16>, ptr [[TMP16]], align 2
; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP14]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
-; CHECK-NEXT: [[TMP21]] = zext <4 x i16> [[TMP20]] to <4 x i32>
; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 4
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 256
+; CHECK-NEXT: [[TMP20:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
+; CHECK-NEXT: [[TMP21]] = zext <4 x i16> [[TMP20]] to <4 x i32>
; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP21]] to <4 x i16>
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index a4a075463b1b0be..837d663f4a9263a 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -22,10 +22,10 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 255, i32 255, i32 255, i32 255>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8>
-; CHECK-NEXT: [[TMP5]] = zext <4 x i8> [[TMP4]] to <4 x i32>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8>
+; CHECK-NEXT: [[TMP5]] = zext <4 x i8> [[TMP4]] to <4 x i32>
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8>
@@ -99,10 +99,10 @@ define i32 @PR35734(i32 %x, i32 %y) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i1>
-; CHECK-NEXT: [[TMP7]] = sext <4 x i1> [[TMP6]] to <4 x i32>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i1>
+; CHECK-NEXT: [[TMP7]] = sext <4 x i1> [[TMP6]] to <4 x i32>
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i1>
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
index afe16c71f7f9ca4..3cc6e5fa7b8d5fd 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -17,14 +17,14 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP28:%.*]] = add <vscale x 8 x i32> [[TMP14]], [[TMP26]]
; CHECK-NEXT: [[TMP29:%.*]] = add <vscale x 8 x i32> [[TMP15]], [[TMP27]]
-; CHECK-NEXT: [[TMP33:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP35:%.*]] = trunc <vscale x 8 x i32> [[TMP29]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP34]] = zext <vscale x 8 x i8> [[TMP33]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP36]] = zext <vscale x 8 x i8> [[TMP35]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP31]]
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT: [[TMP33:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP34]] = zext <vscale x 8 x i8> [[TMP33]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP35:%.*]] = trunc <vscale x 8 x i32> [[TMP29]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP36]] = zext <vscale x 8 x i8> [[TMP35]] to <vscale x 8 x i32>
; CHECK: middle.block:
; CHECK-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i32> [[TMP34]] to <vscale x 8 x i8>
; CHECK-NEXT: [[TMP38:%.*]] = trunc <vscale x 8 x i32> [[TMP36]] to <vscale x 8 x i8>
More information about the llvm-commits
mailing list