[llvm] 239d01f - Reland "[LV] Print remark when loop cannot be vectorized due to invalid costs."

Fri Jul 16 02:53:18 PDT 2021

Author: Sander de Smalen
Date: 2021-07-16T10:52:01+01:00
New Revision: 239d01fa884d8707ece2f2dbf0eafcbbf8714aa4

URL: https://github.com/llvm/llvm-project/commit/239d01fa884d8707ece2f2dbf0eafcbbf8714aa4
DIFF: https://github.com/llvm/llvm-project/commit/239d01fa884d8707ece2f2dbf0eafcbbf8714aa4.diff

LOG: Reland "[LV] Print remark when loop cannot be vectorized due to invalid costs."

The original patch was:
  https://reviews.llvm.org/D105806

There were some issues with nondeterministic behaviour of the sorting
function, which led to scalable-call.ll passing and/or failing. This
patch fixes the issue by numbering all instructions in the array first,
and using that number as the order, which should provide a consistent
ordering.

This reverts commit a607f64118240f70bf1b14ec121b65f49d63800d.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 71f16a86a0bf3..64dce08eacd2e 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1676,8 +1676,13 @@ class LoopVectorizationCostModel {
   /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
-  /// the factor width.
-  VectorizationCostTy expectedCost(ElementCount VF);
+  /// the factor width. If \p Invalid is not nullptr, this function
+  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
+  /// each instruction that has an Invalid cost for the given VF.
+  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
+  VectorizationCostTy
+  expectedCost(ElementCount VF,
+               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
@@ -6075,12 +6080,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
     ChosenFactor.Cost = InstructionCost::getMax();
   }
 
+  SmallVector<InstructionVFPair> InvalidCosts;
   for (const auto &i : VFCandidates) {
     // The cost for scalar VF=1 is already calculated, so ignore it.
     if (i.isScalar())
       continue;
 
-    VectorizationCostTy C = expectedCost(i);
+    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
     VectorizationFactor Candidate(i, C.first);
     LLVM_DEBUG(
         dbgs() << "LV: Vector loop of width " << i << " costs: "
@@ -6103,6 +6109,66 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
       ChosenFactor = Candidate;
   }
 
+  // Emit a report of VFs with invalid costs in the loop.
+  if (!InvalidCosts.empty()) {
+    // Group the remarks per instruction, keeping the instruction order from
+    // InvalidCosts.
+    std::map<Instruction *, unsigned> Numbering;
+    unsigned I = 0;
+    for (auto &Pair : InvalidCosts)
+      if (!Numbering.count(Pair.first))
+        Numbering[Pair.first] = I++;
+
+    // Sort the list, first on instruction(number) then on VF.
+    llvm::sort(InvalidCosts,
+               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
+                 if (Numbering[A.first] != Numbering[B.first])
+                   return Numbering[A.first] < Numbering[B.first];
+                 ElementCountComparator ECC;
+                 return ECC(A.second, B.second);
+               });
+
+    // For a list of ordered instruction-vf pairs:
+    //   [(load, vf1), (load, vf2), (store, vf1)]
+    // Group the instructions together to emit separate remarks for:
+    //   load  (vf1, vf2)
+    //   store (vf1)
+    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
+    auto Subset = ArrayRef<InstructionVFPair>();
+    do {
+      if (Subset.empty())
+        Subset = Tail.take_front(1);
+
+      Instruction *I = Subset.front().first;
+
+      // If the next instruction is different, or if there are no other pairs,
+      // emit a remark for the collated subset. e.g.
+      //   [(load, vf1), (load, vf2))]
+      // to emit:
+      //  remark: invalid costs for 'load' at VF=(vf, vf2)
+      if (Subset == Tail || Tail[Subset.size()].first != I) {
+        std::string OutString;
+        raw_string_ostream OS(OutString);
+        assert(!Subset.empty() && "Unexpected empty range");
+        OS << "Instruction with invalid costs prevented vectorization at VF=(";
+        for (auto &Pair : Subset)
+          OS << (Pair.second == Subset.front().second ? "" : ", ")
+             << Pair.second;
+        OS << "):";
+        if (auto *CI = dyn_cast<CallInst>(I))
+          OS << " call to " << CI->getCalledFunction()->getName();
+        else
+          OS << " " << I->getOpcodeName();
+        OS.flush();
+        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+        Tail = Tail.drop_front(Subset.size());
+        Subset = {};
+      } else
+        // Grow the subset by one element
+        Subset = Tail.take_front(Subset.size() + 1);
+    } while (!Tail.empty());
+  }
+
   if (!EnableCondStoresVectorization && NumPredStores) {
     reportVectorizationFailure("There are conditional stores.",
         "store that is conditionally executed prevents vectorization",
@@ -6884,7 +6950,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
 }
 
 LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(ElementCount VF) {
+LoopVectorizationCostModel::expectedCost(
+    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
   VectorizationCostTy Cost;
 
   // For each block.
@@ -6904,6 +6971,10 @@ LoopVectorizationCostModel::expectedCost(ElementCount VF) {
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
         C.first = InstructionCost(ForceTargetInstructionCost);
 
+      // Keep a list of instructions with invalid costs.
+      if (Invalid && !C.first.isValid())
+        Invalid->emplace_back(&I, VF);
+
       BlockCost.first += C.first;
       BlockCost.second |= C.second;
       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index 767aac9a31abd..d7a3f719e151e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on \
+; RUN:     -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 
 define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
 ; CHECK-LABEL: @vec_load
@@ -95,6 +97,10 @@ for.end:
   ret void
 }
 
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_no_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
@@ -105,10 +111,10 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %1 = tail call fast float @llvm.sin.f32(float %0)
+  %0 = load float, float* %arrayidx, align 4, !dbg !11
+  %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12
   %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
-  store float %1, float* %arrayidx1, align 4
+  store float %1, float* %arrayidx1, align 4, !dbg !13
   %inc = add nuw nsw i64 %i.07, 1
   %exitcond.not = icmp eq i64 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
@@ -117,6 +123,46 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 }
 
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+define void @vec_sin_no_mapping_ite(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_no_mapping_ite
+; CHECK-NOT: <vscale x
+; CHECK: ret
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %if.end
+  %i.07 = phi i64 [ %inc, %if.end ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+  %0 = load float, float* %arrayidx, align 4, !dbg !11
+  %cmp = fcmp ugt float %0, 0.0000
+  br i1 %cmp, label %if.then, label %if.else
+if.then:
+  %1 = tail call fast float @llvm.sin.f32(float %0), !dbg !12
+  br label %if.end
+if.else:
+  %2 = tail call fast float @llvm.sin.f32(float 0.0), !dbg !13
+  br label %if.end
+if.end:
+  %3 = phi float [%1, %if.then], [%2, %if.else]
+  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+  store float %3, float* %arrayidx1, align 4, !dbg !14
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
 define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
 ; CHECK: @vec_sin_fixed_mapping
 ; CHECK: call fast <2 x float> @llvm.sin.v2f32
@@ -127,10 +173,10 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %1 = tail call fast float @llvm.sin.f32(float %0) #3
+  %0 = load float, float* %arrayidx, align 4, !dbg !11
+  %1 = tail call fast float @llvm.sin.f32(float %0) #3, !dbg !12
   %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
-  store float %1, float* %arrayidx1, align 4
+  store float %1, float* %arrayidx1, align 4, !dbg !13
   %inc = add nuw nsw i64 %i.07, 1
   %exitcond.not = icmp eq i64 %inc, %n
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
@@ -183,3 +229,19 @@ attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 2}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+!llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!7}
+!llvm.ident = !{!8}
+
+!4 = distinct !DICompileUnit(language: DW_LANG_C99, file: !5, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !6, splitDebugInlining: false, nameTableKind: None)
+!5 = !DIFile(filename: "t.c", directory: "somedir")
+!6 = !{}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{!"clang"}
+!9 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 2, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !4, retainedNodes: !6)
+!10 = !DISubroutineType(types: !6)
+!11 = !DILocation(line: 3, column: 10, scope: !9)
+!12 = !DILocation(line: 3, column: 20, scope: !9)
+!13 = !DILocation(line: 3, column: 30, scope: !9)
+!14 = !DILocation(line: 3, column: 40, scope: !9)