[PATCH] D126295: [LoopVectorize] Fix assertion failure in fixReduction when tail-folding

Tue May 24 08:52:29 PDT 2022

david-arm updated this revision to Diff 431688.
david-arm added a comment.

- Address review comments on test.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126295/new/

https://reviews.llvm.org/D126295

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
  llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll


Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
===================================================================

--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -328,8 +328,6 @@
 ; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
 ; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
 ; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
-; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
 ; CHECK: middle.block:
 ; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN:   -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S | FileCheck %s
+
+define void @invariant_store_red_exit_is_phi(i32* %dst, i32* readonly %src, i64 %n) {
+; CHECK-LABEL: @invariant_store_red_exit_is_phi(
+; CHECK: vector.body:
+; CHECK:      %[[VEC_PHI:.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %[[PREDPHI:.*]], %vector.body ]
+; CHECK:      %[[ACTIVE_LANE_MASK:.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 {{%.*}}, i64 %n)
+; CHECK:      %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[VEC_PHI]], %[[LOAD]]
+; CHECK-NEXT: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32> %[[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[SELECT]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %dst, align 4
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %red = phi i32 [ 0, %entry ], [ %storemerge, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %load = load i32, i32* %arrayidx6, align 4
+  %storemerge = add i32 %red, %load
+  store i32 %storemerge, i32* %dst, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8830,6 +8830,14 @@
         auto OpRange = Plan->mapToVPValues(Instr->operands());
         Operands = {OpRange.begin(), OpRange.end()};
       }
+
+      // Invariant stores inside loop will be deleted and a single store
+      // with the final reduction value will be added to the exit block
+      StoreInst *SI;
+      if ((SI = dyn_cast<StoreInst>(&I)) &&
+          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+        continue;
+
       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
               Instr, Operands, Range, Plan)) {
         // If Instr can be simplified to an existing VPValue, use it.
@@ -8865,13 +8873,6 @@
         continue;
       }
 
-      // Invariant stores inside loop will be deleted and a single store
-      // with the final reduction value will be added to the exit block
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(&I)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
-        continue;
-
       // Otherwise, if all widening options failed, Instruction is to be
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D126295.431688.patch
Type: text/x-patch
Size: 4616 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220524/78a2ceb1/attachment.bin>