[llvm] eddec9d - [Pseudo probe] Duplicate probes in vectorized loop body.

Hongtao Yu via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 15 10:18:15 PST 2023


Author: Hongtao Yu
Date: 2023-02-15T10:18:08-08:00
New Revision: eddec9de44cd6b1ccbe825eec725f27ae5a2b982

URL: https://github.com/llvm/llvm-project/commit/eddec9de44cd6b1ccbe825eec725f27ae5a2b982
DIFF: https://github.com/llvm/llvm-project/commit/eddec9de44cd6b1ccbe825eec725f27ae5a2b982.diff

LOG: [Pseudo probe] Duplicate probes in vectorized loop body.

Previously, pseudo probes were dropped from a vectorized loop body during loop vectorization. This can result in the samples of the loop entry being used for the loop body, which in turn can cause undercounting of the loop iteration count. The undercounting can further prevent the loop from being vectorized in the next build. I'm fixing this by explicitly allowing pseudo probes to be kept in the vectorized loop body, and by claiming that a probe instruction is not "uniform", so that the vectorizer duplicates it once per vector lane.

For one internal service, I'm seeing the change increase the size of the .pseudoprobe section by 0.7%, which should account for around 0.2% of the whole binary size.

Reviewed By: wenlei

Differential Revision: https://reviews.llvm.org/D144066

Added: 
    llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b0a337aedd459..c6cb09139a4ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1285,6 +1285,12 @@ class LoopVectorizationCostModel {
 
   /// Returns true if \p I is known to be uniform after vectorization.
   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+    // Pseudo probe needs to be duplicated for each unrolled iteration and
+    // vector lane so that profiled loop trip count can be accurately
+    // accumulated instead of being under counted.
+    if (isa<PseudoProbeInst>(I))
+      return false;
+
     if (VF.isScalar())
       return true;
 
@@ -8939,7 +8945,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
 
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug intrinsics in VPlan.
-    for (Instruction &I : BB->instructionsWithoutDebug()) {
+    for (Instruction &I : BB->instructionsWithoutDebug(false)) {
       Instruction *Instr = &I;
 
       // First filter out irrelevant instructions, to ensure no recipes are

diff  --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll
new file mode 100644
index 0000000000000..a80ab117e4eb3
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-loop-vectorize.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(ptr nocapture %a, ptr nocapture readonly %b) #0 {
+entry:
+  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 1, i32 0, i64 -1)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4, !tbaa !1
+  %conv = fptosi float %0 to i32
+  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %conv, ptr %arrayidx2, align 4, !tbaa !5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 2, i32 0, i64 -1)
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 3, i32 0, i64 -1)
+  ret i32 0
+}
+
+
+; CHECK-LABEL:  @test1
+; CHECK:        vector.body:
+; CHECK:          load <4 x float>, ptr %{{.*}}
+; CHECK:          store <4 x i32> %{{.*}}, ptr %{{.*}}
+; CHECK-COUNT-4:  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 2, i32 0, i64 -1)
+; CHECK:          %index.next = add nuw i64 %index, 4
+
+
+
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+
+!llvm.pseudo_probe_desc = !{!0}
+
+!0 = !{i64 3666282617048535130, i64 52824598631, !"test1"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !3, i64 0}


        


More information about the llvm-commits mailing list