[llvm] 87936c7 - [LoopVectorize] Fix assertion failure in fixReduction when tail-folding

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed May 25 03:47:08 PDT 2022


Author: David Sherwood
Date: 2022-05-25T11:46:32+01:00
New Revision: 87936c7b131ee141a1309d5535f149ac48ff694e

URL: https://github.com/llvm/llvm-project/commit/87936c7b131ee141a1309d5535f149ac48ff694e
DIFF: https://github.com/llvm/llvm-project/commit/87936c7b131ee141a1309d5535f149ac48ff694e.diff

LOG: [LoopVectorize] Fix assertion failure in fixReduction when tail-folding

When compiling the attached new test in scalable-reductions-tf.ll, we
were hitting this assertion in fixReduction:

  Assertion `isa<PHINode>(U) && "Reduction exit must feed Phi's or select"'

The loop contains a reduction and an intermediate store of the reduction
value. When vectorising with tail-folding, the user 'U' in the assertion
above happened to be a scatter_store. It turns out that we were still
creating a widen recipe for the invariant store, despite knowing that we
can actually sink it. The simplest fix is to change
buildVPlanWithVPRecipes so that we look for invariant stores before
attempting to widen them.
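
For context, the pattern exercised by the new test corresponds roughly
to the following C++ loop (an illustrative sketch, not part of the
commit; the function name is made up):

  #include <cstddef>

  // A reduction whose running value is also stored to a loop-invariant
  // address on every iteration. With tail-folding, widening this store
  // would produce a masked scatter; instead it can be sunk out of the
  // loop entirely.
  void invariant_store_reduction(int *dst, const int *src, std::size_t n) {
    int sum = 0;
    for (std::size_t i = 0; i < n; ++i) {
      sum += src[i];
      *dst = sum; // store to an address that never changes in the loop
    }
  }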

Differential Revision: https://reviews.llvm.org/D126295

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 22993f2ec7682..beddc3ee25173 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8829,6 +8829,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
         auto OpRange = Plan->mapToVPValues(Instr->operands());
         Operands = {OpRange.begin(), OpRange.end()};
       }
+
+      // Invariant stores inside loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block
+      StoreInst *SI;
+      if ((SI = dyn_cast<StoreInst>(&I)) &&
+          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+        continue;
+
       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
               Instr, Operands, Range, Plan)) {
         // If Instr can be simplified to an existing VPValue, use it.
@@ -8864,13 +8872,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
         continue;
       }
 
-      // Invariant stores inside loop will be deleted and a single store
-      // with the final reduction value will be added to the exit block
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
-        continue;
-
       // Otherwise, if all widening options failed, Instruction is to be
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =

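As the comment in the moved block says, the invariant store is not
widened at all: the per-iteration store is deleted and a single store of
the final reduction value is placed in the exit block. In source terms
the effect of that sinking is equivalent to the following C++ sketch
(illustrative only, not part of the commit, and equivalent only for
loops that execute at least once):

  #include <cstddef>

  // Before sinking: the running reduction value is stored on every
  // iteration.
  int store_each_iteration(int *dst, const int *src, std::size_t n) {
    int sum = 0;
    for (std::size_t i = 0; i < n; ++i) {
      sum += src[i];
      *dst = sum;
    }
    return sum;
  }

  // After sinking: one store of the final value, matching the single
  // store checked for in middle.block in the tests below.
  int store_once(int *dst, const int *src, std::size_t n) {
    int sum = 0;
    for (std::size_t i = 0; i < n; ++i)
      sum += src[i];
    *dst = sum;
    return sum;
  }
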
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
new file mode 100644
index 0000000000000..1749519883ead
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S | FileCheck %s
+
+define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) {
+; CHECK-LABEL: @invariant_store_red_exit_is_phi(
+; CHECK: vector.body:
+; CHECK:      %[[VEC_PHI:.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ]
+; CHECK:      %[[ACTIVE_LANE_MASK:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n)
+; CHECK:      %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[VEC_PHI]], %[[LOAD]]
+; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32> %[[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %red = phi i32 [ 0, %entry ], [ %storemerge, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx6, align 4
+  %storemerge = add i32 %red, %load
+  store i32 %storemerge, i32* %dst, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 56d8a076a1142..86f1f553d8217 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -328,8 +328,6 @@ define void @invariant_store(i32* %dst, i32* readonly %src) {
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
 ; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
 ; CHECK: middle.block:
 ; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])

