[llvm] [profcheck] Fix missing profile metadata in ExpandMemCmp (PR #169979)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 29 01:13:52 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Jin Huang (jinhuang1102)
<details>
<summary>Changes</summary>
This patch fixes missing profile metadata in the `ExpandMemCmp` pass when it expands `memcmp` calls. Without this fix, the branches between the newly created blocks lose their profile data, potentially leading to suboptimal code generation.
The patch updates the `ExpandMemCmp` pass to set branch weights to a default `unknown` (50/50 weights) value when a profile is available for the enclosing function. This prevents the expansion from introducing unprofiled branches into a previously profiled function.
The patch also includes updates to the tests to reflect the new branch weights.
---
Full diff: https://github.com/llvm/llvm-project/pull/169979.diff
4 Files Affected:
- (modified) llvm/lib/CodeGen/ExpandMemCmp.cpp (+21-15)
- (modified) llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll (+14-6)
- (modified) llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll (+11-4)
- (modified) llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll (+19-7)
``````````diff
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 74f93e1979532..61ab0cb869904 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -62,7 +63,6 @@ static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
namespace {
-
// This class provides helper functions to expand a memcmp library call into an
// inline expansion.
class MemCmpExpansion {
@@ -92,8 +92,7 @@ class MemCmpExpansion {
// 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}.
struct LoadEntry {
LoadEntry(unsigned LoadSize, uint64_t Offset)
- : LoadSize(LoadSize), Offset(Offset) {
- }
+ : LoadSize(LoadSize), Offset(Offset) {}
// The size of the load for this block, in bytes.
unsigned LoadSize;
@@ -488,6 +487,8 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
// continue to next LoadCmpBlock or EndBlock.
BasicBlock *BB = Builder.GetInsertBlock();
BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CmpBr, DEBUG_TYPE,
+ CI->getFunction());
Builder.Insert(CmpBr);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},
@@ -552,6 +553,8 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
// to next LoadCmpBlock or EndBlock.
BasicBlock *BB = Builder.GetInsertBlock();
BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CmpBr, DEBUG_TYPE,
+ CI->getFunction());
Builder.Insert(CmpBr);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB},
@@ -592,6 +595,8 @@ void MemCmpExpansion::emitMemCmpResultBlock() {
Value *Res =
Builder.CreateSelect(Cmp, Constant::getAllOnesValue(Builder.getInt32Ty()),
ConstantInt::get(Builder.getInt32Ty(), 1));
+ setExplicitlyUnknownBranchWeightsIfProfiled(*cast<Instruction>(Res),
+ DEBUG_TYPE, CI->getFunction());
PhiRes->addIncoming(Res, ResBlock.BB);
BranchInst *NewBr = BranchInst::Create(EndBlock);
@@ -724,7 +729,8 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
// calculate which source was larger. The calculation requires the
// two loaded source values of each load compare block.
// These will be saved in the phi nodes created by setupResultBlockPHINodes.
- if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+ if (!IsUsedForZeroCmp)
+ setupResultBlockPHINodes();
// Create the number of required load compare basic blocks.
createLoadCmpBlocks();
@@ -853,15 +859,14 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
const bool IsUsedForZeroCmp =
IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);
bool OptForSize = llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
- auto Options = TTI->enableMemCmpExpansion(OptForSize,
- IsUsedForZeroCmp);
- if (!Options) return false;
+ auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp);
+ if (!Options)
+ return false;
if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
- if (OptForSize &&
- MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+ if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences())
Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
@@ -907,13 +912,14 @@ class ExpandMemCmpLegacyPass : public FunctionPass {
}
bool runOnFunction(Function &F) override {
- if (skipFunction(F)) return false;
+ if (skipFunction(F))
+ return false;
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC) {
return false;
}
- const TargetLowering* TL =
+ const TargetLowering *TL =
TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
const TargetLibraryInfo *TLI =
@@ -921,9 +927,9 @@ class ExpandMemCmpLegacyPass : public FunctionPass {
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
+ auto *BFI = (PSI && PSI->hasProfileSummary())
+ ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
+ : nullptr;
DominatorTree *DT = nullptr;
if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
DT = &DTWP->getDomTree();
@@ -969,7 +975,7 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
if (DT)
DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- const DataLayout& DL = F.getDataLayout();
+ const DataLayout &DL = F.getDataLayout();
bool MadeChanges = false;
for (auto BBIt = F.begin(); BBIt != F.end();) {
if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 7df48d878bd0b..0ed3af535ac9c 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3
; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
@@ -98,15 +98,15 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %call
}
-define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; CHECK-LABEL: define i32 @cmp7(
-; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]]) {
+; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]]) !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: br label [[LOADBB:%.*]]
; CHECK: res_block:
; CHECK-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
; CHECK-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: br label [[ENDBLOCK:%.*]]
; CHECK: loadbb:
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[X]], align 1
@@ -114,7 +114,7 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; CHECK-NEXT: [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
; CHECK-NEXT: [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]], !prof [[PROF1]]
; CHECK: loadbb1:
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
@@ -123,7 +123,7 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; CHECK-NEXT: [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
; CHECK-NEXT: [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]], !prof [[PROF1]]
; CHECK: endblock:
; CHECK-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
; CHECK-NEXT: ret i32 [[PHI_RES]]
@@ -860,3 +860,11 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) {
%conv = zext i1 %cmp to i32
ret i32 %conv
}
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index 0507ec9de542e..30da8b391f55d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -S -expand-memcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
@@ -34,12 +34,12 @@ define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonl
ret i32 %call
}
-define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; X32-LABEL: @cmp3(
; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
; X32-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
-; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]]
; X32-NEXT: br label [[ENDBLOCK:%.*]]
; X32: loadbb:
; X32-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
@@ -47,7 +47,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X32-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
; X32-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
; X32-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]], !prof [[PROF1]]
; X32: loadbb1:
; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
@@ -564,3 +564,10 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %conv
}
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; X32: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; X32: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; X32: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index 86dc3e5245f24..b5075a2fcff0b 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
@@ -36,12 +36,12 @@ define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonl
ret i32 %call
}
-define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; X64-LABEL: @cmp3(
; X64-NEXT: br label [[LOADBB:%.*]]
; X64: res_block:
; X64-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
-; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]]
; X64-NEXT: br label [[ENDBLOCK:%.*]]
; X64: loadbb:
; X64-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
@@ -49,7 +49,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X64-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
; X64-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
; X64-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]], !prof [[PROF1]]
; X64: loadbb1:
; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
@@ -474,7 +474,7 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %conv
}
-define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
+define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 {
; X64_1LD-LABEL: @cmp_eq3(
; X64_1LD-NEXT: br label [[LOADBB:%.*]]
; X64_1LD: res_block:
@@ -483,14 +483,14 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X64_1LD-NEXT: [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
; X64_1LD-NEXT: [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]], !prof [[PROF1]]
; X64_1LD: loadbb1:
; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
; X64_1LD-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
; X64_1LD-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
; X64_1LD-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]], !prof [[PROF1]]
; X64_1LD: endblock:
; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -1076,3 +1076,15 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) {
ret i32 %conv
}
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; X64_1LD: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; X64_2LD: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; X64_1LD: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; X64_1LD: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
+; X64_2LD: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; X64_2LD: [[PROF1]] = !{!"unknown", !"expand-memcmp"}
+;.
``````````
</details>
https://github.com/llvm/llvm-project/pull/169979
More information about the llvm-commits
mailing list