[llvm] [MemProf] Support cloning for indirect calls with ThinLTO (PR #110625)

Snehasish Kumar via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 4 10:47:58 PDT 2024


================
@@ -0,0 +1,320 @@
+;; Test that cloning of an indirect call works. We should perform ICP and update
+;; promoted call to the correct clone.
+
+;; This was created from the following source code, then the IR was reduced
+;; using llvm-reduce with the expected FileCheck input.
+
+;; -- virtfunc.h: --
+;; #include <unistd.h>
+;;
+;; void external(int *x);
+;;
+;; class B0 {
+;;  public:
+;;   virtual int bar(unsigned s);
+;; };
+;;
+;; class B : public B0 {
+;;  public:
+;;   int bar(unsigned s) override;
+;; };
+;;
+;; int foo(B0 &b, unsigned s);
+
+;; -- virtfunc.cc: --
+;; #include "virtfunc.h"
+;;
+;; int foo(B0 &b, unsigned s) {
+;;   return b.bar(s);
+;; }
+
+;; -- virtfunc_main.cc: --
+;; #include "virtfunc.h"
+;; #include <stdio.h>
+;;
+;; int main() {
+;;   B b;
+;;   int x = foo(b, 1);
+;;   printf("%d\n", x);
+;;   int y = foo(b, 10);
+;;   printf("%d\n", y);
+;;   B0 b0;
+;;   x = foo(b0, 1);
+;;   printf("%d\n", x);
+;;   y = foo(b0, 10);
+;;   printf("%d\n", y);
+;;   return 0;
+;; }
+;;
+;; int B0::bar(unsigned s) {
+;;   int *x = new int;
+;;   sleep(s);
+;;   external(x);
+;;   delete x;
+;;   return 1;
+;; }
+;;
+;; int B::bar(unsigned s) {
+;;   int *x = new int;
+;;   sleep(s);
+;;   external(x);
+;;   delete x;
+;;   return 2;
+;; }
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: split-file %s %t
+
+; RUN: opt -thinlto-bc %t/main.ll >%t/main.o
+; RUN: opt -thinlto-bc %t/foo.ll >%t/foo.o
+
+;; Check that we get the synthesized callsite records. There should be 2, one
+;; for each profiled target in the VP metadata. They will have the same stackIds
+;; since the debug information for the callsite is the same.
+; RUN: llvm-dis %t/foo.o -o - | FileCheck %s --check-prefix=CALLSITES
+; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)))
+
+;; Make sure that we don't get the synthesized callsite records if the
+;; -enable-memprof-indirect-call-support flag is false.
+; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false -o - \
+; RUN: 	| llvm-dis -o - | FileCheck %s --implicit-check-not callsites
+
+;; First perform in-process ThinLTO
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
+; RUN:  -r=%t/main.o,_Znwm, \
+; RUN:  -r=%t/main.o,_ZdlPvm, \
+; RUN:  -r=%t/main.o,_Z8externalPi, \
+; RUN:  -r=%t/main.o,main,plx \
+; RUN:  -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN:  -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN:  -r=%t/main.o,_ZTV1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI1B,plx \
+; RUN:  -r=%t/main.o,_ZTV2B0,plx \
+; RUN:	-thinlto-threads=1 \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -pass-remarks=. -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS-MAIN \
+; RUN:  --check-prefix=REMARKS-FOO
+
+; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
+; RUN:  -r=%t/main.o,_Znwm, \
+; RUN:  -r=%t/main.o,_ZdlPvm, \
+; RUN:  -r=%t/main.o,_Z8externalPi, \
+; RUN:  -r=%t/main.o,main,plx \
+; RUN:  -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN:  -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN:  -r=%t/main.o,_ZTV1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI1B,plx \
+; RUN:  -r=%t/main.o,_ZTV2B0,plx \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS
+
+;; Run ThinLTO backend
+; RUN: opt -import-all-index -passes=function-import,memprof-context-disambiguation,inline \
+; RUN:  -summary-file=%t/foo.o.thinlto.bc -memprof-import-summary=%t/foo.o.thinlto.bc \
+; RUN:  -enable-import-metadata -stats -pass-remarks=. \
+; RUN:  %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO
+
+; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1
+; REMARKS-MAIN: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS-MAIN: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-MAIN: created clone _ZN1B3barEj.memprof.1
+; REMARKS-MAIN: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS-MAIN: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-FOO: created clone _Z3fooR2B0j.memprof.1
+;; In each version of foo we should have promoted the indirect call to two conditional
+;; direct calls, one to B::bar and one to B0::bar. The cloned version of foo should call
+;; the cloned versions of bar for both promotions.
+; REMARKS-FOO: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN1B3barEj
+; REMARKS-FOO: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN1B3barEj.memprof.1
+; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj
+; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1
+; REMARKS-FOO: created clone _ZN2B03barEj.memprof.1
+; REMARKS-FOO: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS-FOO: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-FOO: created clone _ZN1B3barEj.memprof.1
+; REMARKS-FOO: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS-FOO: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
+; STATS-BE: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
+; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+
+; IR: define {{.*}} @_Z3fooR2B0j(
+; IR:   %1 = icmp eq ptr %0, @_ZN1B3barEj
+; IR:   br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR:   %2 = icmp eq ptr %0, @_ZN2B03barEj
+; IR:   br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
+; IR: if.false.orig_indirect2:
+; IR:   call {{.*}} %0
+
+; IR: define {{.*}} @_Z3fooR2B0j.memprof.1(
+;; We should still compare against the original versions of bar since that is
+;; what is in the vtable. However, we should have called the cloned versions
+;; that perform cold allocations, which were subsequently inlined.
+; IR:   %1 = icmp eq ptr %0, @_ZN1B3barEj
+; IR:   br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR:   %2 = icmp eq ptr %0, @_ZN2B03barEj
+; IR:   br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR:   call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
+; IR: if.false.orig_indirect2:
+; IR:   call {{.*}} %0
+
+; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
+; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"
+
+; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE-DISTRIB: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+
+;--- foo.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @_Z3fooR2B0j(ptr %b) {
+entry:
+  %0 = load ptr, ptr %b, align 8
+  %call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1
+  ret i32 0
+}
+
+!0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2}
+!1 = !{i64 -2101080423462424381}
+
+;--- main.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTV1B = external constant { [3 x ptr] }
+@_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
+@_ZTS1B = external constant [3 x i8]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+@_ZTS2B0 = external constant [4 x i8]
+@_ZTI2B0 = external constant { ptr, ptr }
+@_ZTI1B = external constant { ptr, ptr, ptr }
+@_ZTV2B0 = external constant { [3 x ptr] }
+
+define i32 @main() !prof !29 {
+entry:
+  %call2 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !30
+  %call4 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !31
+  %call6 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !32
+  ret i32 0
+}
+
+declare i32 @_Z3fooR2B0j(ptr, i32)
+
+define i32 @_ZN2B03barEj(ptr %this, i32 %s) {
+entry:
+  %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !33, !callsite !38
+  store volatile i32 0, ptr %call, align 4
+  ret i32 0
+}
+
+declare ptr @_Znwm(i64)
+
+declare void @_Z8externalPi()
+
+declare void @_ZdlPvm()
+
+define i32 @_ZN1B3barEj(ptr %this, i32 %s) {
+entry:
+  %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !39, !callsite !44
+  store volatile i32 0, ptr %call, align 4
+  ret i32 0
+}
+
+; uselistorder directives
+uselistorder ptr @_Z3fooR2B0j, { 2, 1, 0 }
+
+attributes #0 = { builtin allocsize(0) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+!2 = !{!"ProfileFormat", !"InstrProf"}
----------------
snehasish wrote:

Is it worth adding a test for SampleProf? 

Maybe add a TODO somewhere in the code to add a test once we have a text format, so that future changes don't break our usage.

https://github.com/llvm/llvm-project/pull/110625


More information about the llvm-commits mailing list