[llvm] [VPlan][PseudoProbe] Fix `pseudoprobe` duplication when `VF=1` (PR #185238)

Henry Jiang via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 7 17:20:04 PST 2026


https://github.com/mustartt created https://github.com/llvm/llvm-project/pull/185238

Fix an assertion failure in `loop-vectorize` on loops that contain `llvm.pseudoprobe` at VF=1, UF=2. Minimal reproducer: https://godbolt.org/z/nrcMWWqMx

Originally, in https://reviews.llvm.org/D144066, pseudoprobes were marked non-uniform in `isUniformAfterVectorization`, which allows the `REPLICATE call @llvm.pseudoprobe` to survive until the plan is executed when VF=1, UF=2.

Instead, mark pseudoprobes as uniform and explicitly duplicate them by VF in `replicateByVF`.

>From 6fe915f3cac1e73d650b7a1f0430ac49c6881a04 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry_jiang2 at apple.com>
Date: Sat, 7 Mar 2026 17:04:24 -0800
Subject: [PATCH] Fix IsSingular assertion in VPlan with pseudoprobe

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  5 +-
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |  6 +-
 .../Transforms/LoopVectorize/pseudoprobe.ll   | 74 +++++++++++++++++--
 3 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bb4eef5a41c09..a1d6d3a6abe1d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -995,11 +995,8 @@ class LoopVectorizationCostModel {
     assert(
         TheLoop->isInnermost() &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
-    // Pseudo probe needs to be duplicated for each unrolled iteration and
-    // vector lane so that profiled loop trip count can be accurately
-    // accumulated instead of being under counted.
     if (isa<PseudoProbeInst>(I))
-      return false;
+      return true;
 
     if (VF.isScalar())
       return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 2d961808d3bcd..d5cd66a69c9d2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -681,7 +681,11 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(&R) ||
           (isa<VPReplicateRecipe>(&R) &&
-           cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
+           cast<VPReplicateRecipe>(&R)->isSingleScalar() &&
+           // Pseudoprobes are single scalar but must still be replicated per
+           // vector lane to preserve the original trip count when profiling.
+           !match(cast<VPReplicateRecipe>(&R),
+                  m_Intrinsic(Intrinsic::pseudoprobe))) ||
           (isa<VPInstruction>(&R) &&
            !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
            cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
diff --git a/llvm/test/Transforms/LoopVectorize/pseudoprobe.ll b/llvm/test/Transforms/LoopVectorize/pseudoprobe.ll
index 42f1056beecf9..f392348a669e9 100644
--- a/llvm/test/Transforms/LoopVectorize/pseudoprobe.ll
+++ b/llvm/test/Transforms/LoopVectorize/pseudoprobe.ll
@@ -1,9 +1,40 @@
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s --check-prefix=VF4UF1
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=1 -S | FileCheck %s --check-prefix=VF1UF1
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S | FileCheck %s --check-prefix=VF1UF2
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s --check-prefix=VF2UF1
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
 define i32 @test1(ptr nocapture %a, ptr nocapture readonly %b) #0 {
+; VF4UF1-LABEL:  @test1
+; VF4UF1:        vector.body:
+; VF4UF1:          load <4 x float>, ptr %{{.*}}
+; VF4UF1:          store <4 x i32> %{{.*}}, ptr %{{.*}}
+; VF4UF1-COUNT-4:  call void @llvm.pseudoprobe
+; VF4UF1-NOT:      call void @llvm.pseudoprobe
+; VF4UF1:          %index.next = add nuw i64 %index, 4
+;
+; VF1UF1-LABEL:  @test1
+; VF1UF1:        for.body:
+; VF1UF1-COUNT-1:  call void @llvm.pseudoprobe
+; VF1UF1-NOT:      call void @llvm.pseudoprobe
+; VF1UF1:          br i1
+;
+; VF1UF2-LABEL:  @test1
+; VF1UF2:        vector.body:
+; VF1UF2-COUNT-2:  call void @llvm.pseudoprobe
+; VF1UF2-NOT:      call void @llvm.pseudoprobe
+; VF1UF2:          %index.next = add nuw i64 %index, 2
+;
+; VF2UF1-LABEL:  @test1
+; VF2UF1:        vector.body:
+; VF2UF1:          load <2 x float>, ptr %{{.*}}
+; VF2UF1:          store <2 x i32> %{{.*}}, ptr %{{.*}}
+; VF2UF1-COUNT-2:  call void @llvm.pseudoprobe
+; VF2UF1-NOT:      call void @llvm.pseudoprobe
+; VF2UF1:          %index.next = add nuw i64 %index, 2
+
 entry:
   call void @llvm.pseudoprobe(i64 3666282617048535130, i64 1, i32 0, i64 -1)
   br label %for.body
@@ -25,15 +56,44 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+define void @test2() {
+; VF4UF1-LABEL:  @test2
+; VF4UF1:        vector.body:
+; VF4UF1-COUNT-4:  call void @llvm.pseudoprobe
+; VF4UF1-NOT:      call void @llvm.pseudoprobe
+; VF4UF1:          %index.next = add nuw i64 %index, 4
+;
+; VF1UF1-LABEL:  @test2
+; VF1UF1:        loop:
+; VF1UF1-COUNT-1:  call void @llvm.pseudoprobe
+; VF1UF1-NOT:      call void @llvm.pseudoprobe
+; VF1UF1:          br i1
+;
+; VF1UF2-LABEL:  @test2
+; VF1UF2:        vector.body:
+; VF1UF2-COUNT-2:  call void @llvm.pseudoprobe
+; VF1UF2-NOT:      call void @llvm.pseudoprobe
+; VF1UF2:          %index.next = add nuw i64 %index, 2
+;
+; VF2UF1-LABEL:  @test2
+; VF2UF1:        vector.body:
+; VF2UF1-COUNT-2:  call void @llvm.pseudoprobe
+; VF2UF1-NOT:      call void @llvm.pseudoprobe
+; VF2UF1:          %index.next = add nuw i64 %index, 2
 
-; CHECK-LABEL:  @test1
-; CHECK:        vector.body:
-; CHECK:          load <4 x float>, ptr %{{.*}}
-; CHECK:          store <4 x i32> %{{.*}}, ptr %{{.*}}
-; CHECK-COUNT-4:  call void @llvm.pseudoprobe(i64 3666282617048535130, i64 2, i32 0, i64 -1)
-; CHECK:          %index.next = add nuw i64 %index, 4
+entry:
+    br label %loop
 
+loop:
+    %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+    %iv.next = add i64 %iv, 1
+    call void @llvm.pseudoprobe(i64 0, i64 0, i32 0, i64 0)
+    %done = icmp eq i64 %iv.next, 0
+    br i1 %done, label %exit, label %loop
 
+exit:
+    ret void
+}
 
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)



More information about the llvm-commits mailing list