[llvm] [profcheck][coro] Adding Branch weights PGO in CoroSplit and CoroFrame Passes (PR #184466)

Jin Huang via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 3 16:48:35 PST 2026


https://github.com/jinhuang1102 updated https://github.com/llvm/llvm-project/pull/184466

>From 1a219d0e7a0d29ddfbbbd6ae25de801835b6973a Mon Sep 17 00:00:00 2001
From: Jin Huang <jingold at google.com>
Date: Wed, 4 Mar 2026 00:19:01 +0000
Subject: [PATCH] [profcheck][coro] Adding Branch weights PGO in CoroSplit and
 CoroFrame Passes

---
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  | 19 +++++++++++
 llvm/lib/Transforms/Coroutines/CoroSplit.cpp  | 33 ++++++++++++++++++-
 .../coro-await-suspend-lower-invoke.ll        | 30 +++++++++++++----
 .../Transforms/Coroutines/coro-byval-param.ll | 32 ++++++++++++++++--
 .../Coroutines/coro-catchswitch-cleanuppad.ll |  7 ++--
 5 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 83b40ee927b51..09921a96f5f20 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -25,8 +25,10 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/OptimizedStructLayout.h"
@@ -43,6 +45,10 @@
 
 using namespace llvm;
 
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
 #define DEBUG_TYPE "coro-frame"
 
 namespace {
@@ -1368,6 +1374,19 @@ static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB,
     SwitchOnDispatch->addCase(SwitchConstant, CaseBB);
     SwitchIndex++;
   }
+
+  if (!ProfcheckDisableMetadataFixes) {
+    // Add branch weights to SwitchOnDispatch. The default destination gets
+    // weight 0 (it is expected to be unreachable); the two cases are mutually
+    // exclusive with no known bias, so they get equal weights.
+    if (SwitchIndex == 2) {
+      MDBuilder MDB(SwitchOnDispatch->getContext());
+      SwitchOnDispatch->setMetadata(
+          LLVMContext::MD_prof,
+          MDB.createBranchWeights({0, llvm::MDBuilder::kUnlikelyBranchWeight,
+                                   llvm::MDBuilder::kUnlikelyBranchWeight}));
+    }
+  }
 }
 
 static void cleanupSinglePredPHIs(Function &F) {
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 587f581ded8d5..a8d5f1aa56756 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -55,7 +55,9 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/Verifier.h"
@@ -77,6 +79,10 @@
 
 using namespace llvm;
 
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
 #define DEBUG_TYPE "coro-split"
 
 // FIXME:
@@ -427,7 +433,11 @@ void coro::BaseCloner::handleFinalSuspend() {
       auto *Load =
           Builder.CreateLoad(Shape.getSwitchResumePointerType(), NewFramePtr);
       auto *Cond = Builder.CreateIsNull(Load);
-      Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB);
+      auto *Br = Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB);
+      applyProfMetadataIfEnabled(Br, [&](Instruction *Inst) {
+        setExplicitlyUnknownBranchWeightsIfProfiled(
+            *Inst, DEBUG_TYPE, Inst->getFunction());
+      });
     }
     OldSwitchBB->getTerminator()->eraseFromParent();
   }
@@ -1594,6 +1604,23 @@ struct SwitchCoroutineSplitter {
       ++SuspendIndex;
     }
 
+    // Add the branch weights to the switch instruction.
+    if (!ProfcheckDisableMetadataFixes) {
+      if (!Shape.CoroSuspends.empty()) {
+        SmallVector<uint32_t> Weights;
+        // Add the unlikely weight for the default case.
+        Weights.push_back(llvm::MDBuilder::kUnlikelyBranchWeight);
+        // The first case (IndexVal == 0) represents coroutine initialization,
+        // which is expected to be most common, so it gets the highest weight.
+        Weights.push_back(llvm::MDBuilder::kLikelyBranchWeight);
+        // The remaining cases are unlikely, so they get the unlikely weight.
+        for (size_t i = 1, e = Shape.CoroSuspends.size(); i < e; ++i)
+          Weights.push_back(llvm::MDBuilder::kUnlikelyBranchWeight);
+        MDBuilder MDB(C);
+        Switch->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+      }
+    }
+
     Builder.SetInsertPoint(UnreachBB);
     Builder.CreateUnreachable();
     DBuilder.finalize();
@@ -1618,6 +1645,10 @@ struct SwitchCoroutineSplitter {
       // If there is a CoroAlloc and it returns false (meaning we elide the
       // allocation, use CleanupFn instead of DestroyFn).
       DestroyOrCleanupFn = Builder.CreateSelect(CA, DestroyFn, CleanupFn);
+      applyProfMetadataIfEnabled(DestroyOrCleanupFn, [&](Instruction *Inst) {
+        setExplicitlyUnknownBranchWeightsIfProfiled(*Inst, DEBUG_TYPE, 
+                                                     CoroId->getFunction());
+      });
     }
 
     // Destroy function pointer
diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll
index 72bb8fcf5b610..b57d6ecdd8148 100644
--- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll
+++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll
@@ -1,10 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 6
 ; Tests that invoke <type> @llvm.coro.await.suspend lowers to invoke @helper
 ; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split),simplifycfg' -S | FileCheck %s
 
 %Awaiter = type {}
 
-define void @f() presplitcoroutine personality i32 0 {
+define void @f() presplitcoroutine personality i32 0  !prof !0 {
 entry:
   %awaiter = alloca %Awaiter
   %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
@@ -103,7 +103,13 @@ declare void @__cxa_end_catch()
 
 declare noalias ptr @malloc(i32)
 declare void @free(ptr)
-; CHECK-LABEL: define void @f() personality i32 0 {
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: @f.resumers = private constant [3 x ptr] [ptr @f.resume, ptr @f.destroy, ptr @f.cleanup]
+;.
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: ) personality i32 0 !prof [[PROF0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[AWAITER:%.*]] = alloca [[AWAITER:%.*]], align 8
 ; CHECK-NEXT:    [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr @f.resumers)
@@ -119,7 +125,7 @@ declare void @free(ptr)
 ;
 ;
 ; CHECK-LABEL: define internal fastcc void @f.resume(
-; CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[HDL:%.*]]) personality i32 0 {
+; CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[HDL:%.*]]) personality i32 0 !prof [[PROF0]] {
 ; CHECK-NEXT:  [[ENTRY_RESUME:.*]]:
 ; CHECK-NEXT:    [[AWAITER_RELOAD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[HDL]], i64 0
 ; CHECK-NEXT:    [[INDEX_ADDR:%.*]] = getelementptr inbounds i8, ptr [[HDL]], i64 16
@@ -129,7 +135,7 @@ declare void @free(ptr)
 ; CHECK-NEXT:      i2 1, label %[[AFTERCOROSUSPEND3:.*]]
 ; CHECK-NEXT:      i2 -2, label %[[AFTERCOROSUSPEND7:.*]]
 ; CHECK-NEXT:      i2 -1, label %[[CLEANUP:.*]]
-; CHECK-NEXT:    ]
+; CHECK-NEXT:    ], !prof [[PROF1:![0-9]+]]
 ; CHECK:       [[COROSAVE1]]:
 ; CHECK-NEXT:    [[INDEX_ADDR13:%.*]] = getelementptr inbounds i8, ptr [[HDL]], i64 16
 ; CHECK-NEXT:    store i2 1, ptr [[INDEX_ADDR13]], align 1
@@ -176,7 +182,7 @@ declare void @free(ptr)
 ;
 ;
 ; CHECK-LABEL: define internal fastcc void @f.destroy(
-; CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[HDL:%.*]]) personality i32 0 {
+; CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[HDL:%.*]]) personality i32 0 !prof [[PROF0]] {
 ; CHECK-NEXT:  [[ENTRY_DESTROY:.*:]]
 ; CHECK-NEXT:    [[AWAITER_RELOAD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[HDL]], i64 0
 ; CHECK-NEXT:    call void @free(ptr [[HDL]])
@@ -184,9 +190,19 @@ declare void @free(ptr)
 ;
 ;
 ; CHECK-LABEL: define internal fastcc void @f.cleanup(
-; CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[HDL:%.*]]) personality i32 0 {
+; CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[HDL:%.*]]) personality i32 0 !prof [[PROF0]] {
 ; CHECK-NEXT:  [[ENTRY_CLEANUP:.*:]]
 ; CHECK-NEXT:    [[AWAITER_RELOAD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[HDL]], i64 0
 ; CHECK-NEXT:    call void @free(ptr null)
 ; CHECK-NEXT:    ret void
 ;
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind memory(argmem: read) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind memory(none) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nomerge nounwind }
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575, i32 1, i32 1, i32 1}
+;.
diff --git a/llvm/test/Transforms/Coroutines/coro-byval-param.ll b/llvm/test/Transforms/Coroutines/coro-byval-param.ll
index db1f151b59bc5..cabe6224fe20e 100644
--- a/llvm/test/Transforms/Coroutines/coro-byval-param.ll
+++ b/llvm/test/Transforms/Coroutines/coro-byval-param.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
 ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
 
 target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,9 +10,12 @@ target datalayout = "e-m:e-p:64:64-i64:64-f80:128-n8:16:32:64-S128"
 ; struct pointer, and that the alignment is taken into account.
 
 ; Function Attrs: noinline ssp uwtable mustprogress
-define ptr @foo(ptr nocapture readonly byval(%struct.A) align 8 %a1) #0 {
+;.
+; CHECK: @foo.resumers = private constant [3 x ptr] [ptr @foo.resume, ptr @foo.destroy, ptr @foo.cleanup]
+;.
+define ptr @foo(ptr nocapture readonly byval(%struct.A) align 8 %a1) #0 !prof !0 {
 ; CHECK-LABEL: define ptr @foo(
-; CHECK-SAME: ptr readonly byval([[STRUCT_A:%.*]]) align 8 [[A1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr readonly byval([[STRUCT_A:%.*]]) align 8 [[A1:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call token @llvm.coro.id(i32 16, ptr nonnull null, ptr @foo, ptr @foo.resumers)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.coro.alloc(token [[TMP0]])
@@ -24,8 +27,8 @@ define ptr @foo(ptr nocapture readonly byval(%struct.A) align 8 %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi ptr [ [[CALL]], %[[CORO_ALLOC]] ], [ null, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[TMP0]], ptr [[TMP2]])
 ; CHECK-NEXT:    store ptr @foo.resume, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP1]], ptr @foo.destroy, ptr @foo.cleanup
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP1]], ptr @foo.destroy, ptr @foo.cleanup, !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[DESTROY_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 8
 ; CHECK-NEXT:    store ptr [[TMP4]], ptr [[DESTROY_ADDR]], align 8
 ; CHECK-NEXT:    [[__PROMISE_RELOAD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16
 ; CHECK-NEXT:    [[A1_SPILL_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 24
@@ -152,3 +160,21 @@ attributes #8 = { nobuiltin nounwind "frame-pointer"="all" "no-trapping-math"="t
 attributes #9 = { allocsize(0) }
 attributes #10 = { noduplicate }
 
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0]] = { mustprogress noinline ssp uwtable "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+; CHECK: attributes #[[ATTR2]] = { nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nobuiltin nofree allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind memory(none) }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #[[ATTR7:[0-9]+]] = { mustprogress noinline nounwind ssp willreturn uwtable "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nomerge nounwind }
+; CHECK: attributes #[[ATTR9:[0-9]+]] = { nobuiltin nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+; CHECK: attributes #[[ATTR10:[0-9]+]] = { nounwind memory(argmem: read) }
+; CHECK: attributes #[[ATTR11]] = { allocsize(0) }
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"coro-split"}
+;.
diff --git a/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll b/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll
index e1e1611ee3362..516da5e4527ab 100644
--- a/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll
+++ b/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll
@@ -3,7 +3,7 @@
 ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg<switch-range-to-icmp>,early-cse' -S | FileCheck %s
 
 declare i32 @__CxxFrameHandler3(...)
-define ptr @f2(i1 %val) presplitcoroutine personality ptr @__CxxFrameHandler3 {
+define ptr @f2(i1 %val) presplitcoroutine personality ptr @__CxxFrameHandler3 !prof !0 {
 entry:
   %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
   %valueA = call i32 @f();
@@ -81,7 +81,7 @@ cleanup2:
 ; CHECK:   %1 = phi i8 [ 0, %handler2 ], [ 1, %catch.dispatch.2 ]
 ; CHECK:   %2 = cleanuppad within %h1 []
 ; CHECK:   %3 = icmp eq i8 %1, 0
-; CHECK:   br i1 %3, label %cleanup2.from.handler2, label %cleanup2.from.catch.dispatch.2
+; CHECK:   br i1 %3, label %cleanup2.from.handler2, label %cleanup2.from.catch.dispatch.2, !prof [[PROF1:![0-9]+]]
 
 ; CHECK: cleanup2.from.handler2:
 ; CHECK:   %valueB.reload = load i32, ptr %valueB.spill.addr, align 4
@@ -113,3 +113,6 @@ declare void @print(i32)
 declare void @free(ptr)
 
 declare i32 @f()
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1}



More information about the llvm-commits mailing list