[llvm] [LoopVectorize] Don't discount instructions scalarized due to tail folding (PR #109289)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 14 07:28:08 PDT 2024


================
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "fold tail" --filter "estimated cost" --filter "costs" --filter "Selecting VF" --filter "loop costs" --version 5
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -S < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; These tests check that if the only way to vectorize is to tail fold a store by
+; masking then we properly account for the cost of creating a predicated block
+; for each vector element.
+
+define void @store_const_fixed_trip_count(ptr %dst) {
+; CHECK-LABEL: 'store_const_fixed_trip_count'
+; CHECK:  LV: can fold tail by masking.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 2 for VF 1 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Scalar loop costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 2 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 2 costs: 5.
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 16 for VF 4 For instruction: store i8 1, ptr %gep, align 1
+; CHECK:  LV: Found an estimated cost of 2 for VF 4 For instruction: %iv.next = add i64 %iv, 1
+; CHECK:  LV: Found an estimated cost of 1 for VF 4 For instruction: %ec = icmp eq i64 %iv.next, 7
+; CHECK:  LV: Found an estimated cost of 0 for VF 4 For instruction: br i1 %ec, label %exit, label %loop
+; CHECK:  LV: Vector loop of width 4 costs: 4.
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK:  LV: Found an estimated cost of 0 for VF 8 For instruction: %gep = getelementptr i8, ptr %dst, i64 %iv
+; CHECK:  LV: Found an estimated cost of 32 for VF 8 For instruction: store i8 1, ptr %gep, align 1
----------------
david-arm wrote:

Hi, I don't actually mean changing preferPredicateOverEpilogue because this is a top level decision that's used in general for all loops, regardless of the trip count, code size concerns due to -Os/-Oz. I meant the code where we decide to vectorise due to the low trip count, which can still happen even when preferPredicateOverEpilogue returns false. For example, if you look in the function LoopVectorizationCostModel::computeMaxVF at this code here:

```
  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
  if (foldTailByMasking()) {
```

I added this extra code just before:

```
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop) {
    for (BasicBlock *BB : TheLoop->getBlocks()) {
      for (Instruction &I : *BB) {
        if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
          auto *Ptr = getLoadStorePointerOperand(&I);
          auto *Ty = getLoadStoreType(&I);
          const Align Alignment = getLoadStoreAlignment(&I);
          if ((isa<LoadInst>(&I) && !isLegalMaskedLoad(Ty, Ptr, Alignment)) ||
              (isa<StoreInst>(&I) && !isLegalMaskedStore(Ty, Ptr, Alignment))) {
            LLVM_DEBUG(dbgs() << "LV: Not tail-folding due to lack of masked load/store support\n");
            // We could also just return FixedScalableVFPair::getNone() here.
            ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
            return MaxFactors;
          }
        }
      }
    }
  }
```

And now the code for `@store_const_fixed_trip_count` looks something like this:

```
define void @store_const_fixed_trip_count(ptr %dst) {
entry:
  br i1 false, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %entry
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = add i64 %index, 0
  %1 = getelementptr i8, ptr %dst, i64 %0
  %2 = getelementptr i8, ptr %1, i32 0
  store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, ptr %2, align 1
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 4
  br i1 %3, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:                                     ; preds = %vector.body
  br i1 false, label %exit, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %entry
  %bc.resume.val = phi i64 [ 4, %middle.block ], [ 0, %entry ]
  br label %loop

loop:                                             ; preds = %loop, %scalar.ph
  %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %loop ]
  %gep = getelementptr i8, ptr %dst, i64 %iv
  store i8 1, ptr %gep, align 1
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 7
  br i1 %ec, label %exit, label %loop, !llvm.loop !3

exit:                                             ; preds = %middle.block, %loop
  ret void
}
```

The fix you have for computePredInstDiscount could be used as a workaround for now, but ultimately I think the best solution would be to avoid tail-folding altogether.

https://github.com/llvm/llvm-project/pull/109289


More information about the llvm-commits mailing list