[clang] [compiler-rt] [llvm] [TypeProf][InstrFDO]Implement more efficient comparison sequence for indirect-call-promotion with vtable profiles. (PR #81442)

Mingming Liu via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 28 09:05:49 PDT 2024


https://github.com/minglotus-6 updated https://github.com/llvm/llvm-project/pull/81442

>From 48adcf1a142de6abeeb16596c5087fe83e8f422b Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Wed, 7 Feb 2024 15:12:36 -0800
Subject: [PATCH 01/16] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20?=
 =?UTF-8?q?changes=20to=20main=20this=20commit=20is=20based=20on?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.4

[skip ci]
---
 compiler-rt/include/profile/InstrProfData.inc |  58 +-
 compiler-rt/lib/profile/InstrProfiling.h      |  35 +-
 .../lib/profile/InstrProfilingBuffer.c        |  58 +-
 .../lib/profile/InstrProfilingInternal.h      |   4 +-
 compiler-rt/lib/profile/InstrProfilingMerge.c |  25 +-
 .../lib/profile/InstrProfilingPlatformLinux.c |  20 +
 .../lib/profile/InstrProfilingWriter.c        |  37 +-
 .../llvm/Analysis/IndirectCallVisitor.h       |  70 +-
 llvm/include/llvm/ProfileData/InstrProf.h     | 170 ++++-
 .../llvm/ProfileData/InstrProfData.inc        |  40 +-
 .../llvm/ProfileData/InstrProfReader.h        |  20 +
 .../llvm/ProfileData/InstrProfWriter.h        |   4 +
 .../IndirectCallPromotionAnalysis.cpp         |   4 +
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp   |  20 +
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     |  13 +-
 llvm/lib/ProfileData/InstrProf.cpp            | 172 ++++-
 llvm/lib/ProfileData/InstrProfReader.cpp      |  72 ++-
 llvm/lib/ProfileData/InstrProfWriter.cpp      |  59 +-
 .../Instrumentation/IndirectCallPromotion.cpp |  45 +-
 .../Instrumentation/InstrProfiling.cpp        | 173 +++++
 .../Instrumentation/PGOInstrumentation.cpp    |   7 +
 .../Instrumentation/ValueProfilePlugins.inc   |  36 +-
 .../thinlto-func-summary-vtableref-pgo.ll     |  74 +++
 .../InstrProfiling/coverage.ll                |   8 +-
 .../thinlto_indirect_call_promotion.profraw   | Bin 528 -> 544 bytes
 .../PGOProfile/Inputs/vtable_prof.profraw     | Bin 0 -> 656 bytes
 .../Transforms/PGOProfile/comdat_internal.ll  |   4 +-
 .../Transforms/PGOProfile/vtable_profile.ll   |  98 +++
 .../llvm-profdata/Inputs/c-general.profraw    | Bin 2016 -> 2032 bytes
 .../llvm-profdata/Inputs/compressed.profraw   | Bin 1968 -> 1984 bytes
 .../Inputs/update_vtable_value_prof_inputs.sh | 102 +++
 .../Inputs/vtable-value-prof-basic.profraw    | Bin 0 -> 960 bytes
 .../Inputs/vtable-value-prof.proftext         |  73 +++
 .../llvm-profdata/binary-ids-padding.test     |   6 +-
 .../llvm-profdata/large-binary-id-size.test   |   4 +-
 ...alformed-not-space-for-another-header.test |   6 +-
 .../malformed-num-counters-zero.test          |   6 +-
 .../malformed-ptr-to-counter-array.test       |   6 +-
 .../misaligned-binary-ids-size.test           |   4 +-
 .../mismatched-raw-profile-header.test        |   2 +
 .../tools/llvm-profdata/raw-32-bits-be.test   |  11 +-
 .../tools/llvm-profdata/raw-32-bits-le.test   |  10 +-
 .../tools/llvm-profdata/raw-64-bits-be.test   |  10 +-
 .../tools/llvm-profdata/raw-64-bits-le.test   |  10 +-
 .../tools/llvm-profdata/raw-two-profiles.test |   8 +-
 .../vtable-value-prof-basic.test              | 124 ++++
 .../llvm-profdata/vtable-value-prof.proftext  |  16 +
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  30 +-
 llvm/unittests/ProfileData/InstrProfTest.cpp  | 604 ++++++++++++++----
 49 files changed, 2065 insertions(+), 293 deletions(-)
 create mode 100644 llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/vtable_prof.profraw
 create mode 100644 llvm/test/Transforms/PGOProfile/vtable_profile.ll
 create mode 100755 llvm/test/tools/llvm-profdata/Inputs/update_vtable_value_prof_inputs.sh
 create mode 100644 llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof-basic.profraw
 create mode 100644 llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof.proftext
 create mode 100644 llvm/test/tools/llvm-profdata/vtable-value-prof-basic.test
 create mode 100644 llvm/test/tools/llvm-profdata/vtable-value-prof.proftext

diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 25df899b3f361..f0bc2d960ce68 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -94,6 +94,26 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
 
+/* For a virtual table object, record the name hash to associate profiled
+ * addresses with global variables, and record {starting address, size in bytes}
+ * to map the profiled virtual table (which usually has an offset from the
+ * starting address) back to a virtual table object. */
+#ifndef INSTR_PROF_VTABLE_DATA
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_VTABLE_DATA_DEFINED
+#endif
+INSTR_PROF_VTABLE_DATA(
+    const uint64_t, llvm::Type::getInt64Ty(Ctx), VTableNameHash,
+    ConstantInt::get(llvm::Type::getInt64Ty(Ctx),
+                     IndexedInstrProf::ComputeHash(PGOVTableName)))
+INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::Type::getInt8PtrTy(Ctx),
+                       VTablePointer, VTableAddr)
+INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize,
+                       ConstantInt::get(llvm::Type::getInt32Ty(Ctx),
+                                        VTableSizeVal))
+#undef INSTR_PROF_VTABLE_DATA
+/* INSTR_PROF_VTABLE_DATA end. */
 
 /* This is an internal data structure used by value profiler. It
  * is defined here to allow serialization code sharing by LLVM
@@ -145,6 +165,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
 INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta,
                       (uintptr_t)BitmapBegin - (uintptr_t)DataBegin)
 INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
 INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 #undef INSTR_PROF_RAW_HEADER
 /* INSTR_PROF_RAW_HEADER  end */
@@ -186,13 +208,28 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx))
 VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target")
 /* For memory intrinsic functions size profiling. */
 VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size")
+/* For virtual table address profiling, the addresses of the virtual table
+ * (i.e., the address contained in objects pointing to a virtual table) are
+ * profiled. Note this may not be the address of the per C++ class virtual table
+ *  object (i.e., there is an offset).
+ *
+ * The profiled addresses are stored in raw profile, together with the following
+ * two types of information.
+ * 1. The (beginning and ending) addresses of per C++ class virtual table objects.
+ * 2. The (compressed) virtual table object names.
+ * RawInstrProfReader converts profiled virtual table addresses to virtual table
+ *  objects' MD5 hash.
+ */
+VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The address of the compatible vtable (i.e., "
+                                      "there is an offset from this address to per C++ "
+                                      "class virtual table global variable.)")
 /* These two kinds must be the last to be
  * declared. This is to make sure the string
  * array created with the template can be
  * indexed with the kind value.
  */
 VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first")
-VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last")
+VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last")
 
 #undef VALUE_PROF_KIND
 /* VALUE_PROF_KIND end */
@@ -267,9 +304,9 @@ COVMAP_HEADER(uint32_t, Int32Ty, Version, \
 #undef COVMAP_HEADER
 /* COVMAP_HEADER end.  */
 
-
 #ifdef INSTR_PROF_SECT_ENTRY
 #define INSTR_PROF_DATA_DEFINED
+
 INSTR_PROF_SECT_ENTRY(IPSK_data, \
                       INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON), \
                       INSTR_PROF_DATA_COFF, "__DATA,")
@@ -282,12 +319,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
 INSTR_PROF_SECT_ENTRY(IPSK_name, \
                       INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \
                       INSTR_PROF_NAME_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vname, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \
+                      INSTR_PROF_VNAME_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_vals, \
                       INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \
                       INSTR_PROF_VALS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \
                       INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \
                       INSTR_PROF_VNODES_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vtab, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \
+                      INSTR_PROF_VTAB_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_covmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \
                       INSTR_PROF_COVMAP_COFF, "__LLVM_COV,")
@@ -307,7 +350,6 @@ INSTR_PROF_SECT_ENTRY(IPSK_covname, \
 #undef INSTR_PROF_SECT_ENTRY
 #endif
 
-
 #ifdef INSTR_PROF_VALUE_PROF_DATA
 #define INSTR_PROF_DATA_DEFINED
 
@@ -479,7 +521,6 @@ getValueProfRecordHeaderSize(uint32_t NumValueSites);
 #undef INSTR_PROF_VALUE_PROF_DATA
 #endif  /* INSTR_PROF_VALUE_PROF_DATA */
 
-
 #ifdef INSTR_PROF_COMMON_API_IMPL
 #define INSTR_PROF_DATA_DEFINED
 #ifdef __cplusplus
@@ -663,9 +704,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
 /* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 9
+#define INSTR_PROF_RAW_VERSION 10
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 11
+#define INSTR_PROF_INDEX_VERSION 12
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
 
@@ -703,10 +744,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
    than WIN32 */
 #define INSTR_PROF_DATA_COMMON __llvm_prf_data
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
+#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
+#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab
 #define INSTR_PROF_COVMAP_COMMON __llvm_covmap
 #define INSTR_PROF_COVFUN_COMMON __llvm_covfun
 #define INSTR_PROF_COVDATA_COMMON __llvm_covdata
@@ -717,10 +760,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
  */
 #define INSTR_PROF_DATA_COFF ".lprfd$M"
 #define INSTR_PROF_NAME_COFF ".lprfn$M"
+#define INSTR_PROF_VNAME_COFF ".lprfn$M"
 #define INSTR_PROF_CNTS_COFF ".lprfc$M"
 #define INSTR_PROF_BITS_COFF ".lprfb$M"
 #define INSTR_PROF_VALS_COFF ".lprfv$M"
 #define INSTR_PROF_VNODES_COFF ".lprfnd$M"
+#define INSTR_PROF_VTAB_COFF ".lprfvt$M"
 #define INSTR_PROF_COVMAP_COFF ".lcovmap$M"
 #define INSTR_PROF_COVFUN_COFF ".lcovfun$M"
 /* Since cov data and cov names sections are not allocated, we don't need to
@@ -938,3 +983,4 @@ InstrProfIsSingleValRange(uint64_t Value) {
 }
 
 #endif /* INSTR_PROF_VALUE_PROF_MEMOP_API */
+
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 0123908336918..9e6306ace61f2 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -49,6 +49,12 @@ typedef struct ValueProfNode {
 #include "profile/InstrProfData.inc"
 } ValueProfNode;
 
+typedef void *IntPtrT;
+typedef struct VTableProfData {
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name;
+#include "profile/InstrProfData.inc"
+} VTableProfData;
+
 /*!
  * \brief Return 1 if profile counters are continuously synced to the raw
  * profile via an mmap(). This is in contrast to the default mode, in which
@@ -103,12 +109,16 @@ const __llvm_profile_data *__llvm_profile_begin_data(void);
 const __llvm_profile_data *__llvm_profile_end_data(void);
 const char *__llvm_profile_begin_names(void);
 const char *__llvm_profile_end_names(void);
+const char *__llvm_profile_begin_vtabnames(void);
+const char *__llvm_profile_end_vtabnames(void);
 char *__llvm_profile_begin_counters(void);
 char *__llvm_profile_end_counters(void);
 char *__llvm_profile_begin_bitmap(void);
 char *__llvm_profile_end_bitmap(void);
 ValueProfNode *__llvm_profile_begin_vnodes();
 ValueProfNode *__llvm_profile_end_vnodes();
+VTableProfData *__llvm_profile_begin_vtables();
+VTableProfData *__llvm_profile_end_vtables();
 uint32_t *__llvm_profile_begin_orderfile();
 
 /*!
@@ -252,20 +262,31 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin,
 /*! \brief Get the size of the profile name section in bytes. */
 uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End);
 
-/* ! \brief Given the sizes of the data and counter information, return the
- * number of padding bytes before and after the counters, and after the names,
- * in the raw profile.
+/*! \brief Get the number of virtual table profile data entries */
+uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin,
+                                       const VTableProfData *End);
+
+/*! \brief Get the size of virtual table profile data in bytes. */
+uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin,
+                                                const VTableProfData *End);
+
+/* ! \brief Given the sizes of the data and counter information, computes the
+ * number of padding bytes before and after the counter section, as well as the
+ * number of padding bytes after other sections in the raw profile.
+ * Returns -1 upon errors and 0 upon success. Output parameters should be used
+ * iff return value is 0.
  *
  * Note: When mmap() mode is disabled, no padding bytes before/after counters
  * are needed. However, in mmap() mode, the counter section in the raw profile
  * must be page-aligned: this API computes the number of padding bytes
  * needed to achieve that.
  */
-void __llvm_profile_get_padding_sizes_for_counters(
+int __llvm_profile_get_padding_sizes_for_counters(
     uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes,
-    uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters,
-    uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap,
-    uint64_t *PaddingBytesAfterNames);
+    uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize,
+    uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters,
+    uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames,
+    uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames);
 
 /*!
  * \brief Set the flag that profile data has been dumped to the file.
diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c
index af52804b2b532..f31dc7d4e2111 100644
--- a/compiler-rt/lib/profile/InstrProfilingBuffer.c
+++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c
@@ -70,6 +70,18 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin,
                                       const __llvm_profile_data *End) {
   return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data);
 }
+COMPILER_RT_VISIBILITY
+uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin,
+                                       const VTableProfData *End) {
+  intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin;
+  return (EndI + sizeof(VTableProfData) - 1 - BeginI) / sizeof(VTableProfData);
+}
+
+COMPILER_RT_VISIBILITY
+uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin,
+                                                const VTableProfData *End) {
+  return __llvm_profile_get_num_vtable(Begin, End) * sizeof(VTableProfData);
+}
 
 COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) {
   if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE)
@@ -119,11 +131,13 @@ static int needsCounterPadding(void) {
 }
 
 COMPILER_RT_VISIBILITY
-void __llvm_profile_get_padding_sizes_for_counters(
+int __llvm_profile_get_padding_sizes_for_counters(
     uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes,
-    uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters,
-    uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes,
-    uint64_t *PaddingBytesAfterNames) {
+    uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize,
+    uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters,
+    uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames,
+    uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) {
+  // Counter padding is needed only if continuous mode is enabled.
   if (!needsCounterPadding()) {
     *PaddingBytesBeforeCounters = 0;
     *PaddingBytesAfterCounters =
@@ -131,9 +145,19 @@ void __llvm_profile_get_padding_sizes_for_counters(
     *PaddingBytesAfterBitmapBytes =
         __llvm_profile_get_num_padding_bytes(NumBitmapBytes);
     *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize);
-    return;
+    if (PaddingBytesAfterVTable != NULL)
+      *PaddingBytesAfterVTable =
+          __llvm_profile_get_num_padding_bytes(VTableSize);
+    if (PaddingBytesAfterVName != NULL)
+      *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize);
+    return 0;
   }
 
+  // Value profiling not supported in continuous mode at profile-write time.
+  // Return -1 to alert the incompatibility.
+  if (VTableSize != 0 || VNameSize != 0)
+    return -1;
+
   // In continuous mode, the file offsets for headers and for the start of
   // counter sections need to be page-aligned.
   *PaddingBytesBeforeCounters =
@@ -142,6 +166,13 @@ void __llvm_profile_get_padding_sizes_for_counters(
   *PaddingBytesAfterBitmapBytes =
       calculateBytesNeededToPageAlign(NumBitmapBytes);
   *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize);
+  // Set these two variables to zero to avoid uninitialized variables
+  // even if VTableSize and VNameSize are known to be zero.
+  if (PaddingBytesAfterVTable != NULL)
+    *PaddingBytesAfterVTable = 0;
+  if (PaddingBytesAfterVName != NULL)
+    *PaddingBytesAfterVName = 0;
+  return 0;
 }
 
 COMPILER_RT_VISIBILITY
@@ -162,9 +193,11 @@ uint64_t __llvm_profile_get_size_for_buffer_internal(
   uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
       PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes;
   __llvm_profile_get_padding_sizes_for_counters(
-      DataSize, CountersSize, NumBitmapBytes, NamesSize,
-      &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
-      &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames);
+      DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */,
+      0 /* VNameSize */, &PaddingBytesBeforeCounters,
+      &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+      &PaddingBytesAfterNames, NULL /* PaddingBytesAfterVTable */,
+      NULL /* PaddingbytesAfterVNames */);
 
   return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) +
          DataSize + PaddingBytesBeforeCounters + CountersSize +
@@ -191,7 +224,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal(
     const char *NamesBegin, const char *NamesEnd) {
   ProfDataWriter BufferWriter;
   initBufferWriter(&BufferWriter, Buffer);
-  return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin,
-                            CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin,
-                            NamesEnd, 0);
+  // Set virtual table arguments to NULL since they are not supported yet.
+  return lprofWriteDataImpl(
+      &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd,
+      BitmapBegin, BitmapEnd, 0 /* VPDataReader */, NamesBegin, NamesEnd,
+      NULL /* VTableBegin */, NULL /* VTableEnd */, NULL /* VNamesBegin */,
+      NULL /* VNamesEnd */, 0 /* SkipNameDataWrite */);
 }
diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h
index 03ed67fcfa766..38159b668a1df 100644
--- a/compiler-rt/lib/profile/InstrProfilingInternal.h
+++ b/compiler-rt/lib/profile/InstrProfilingInternal.h
@@ -156,7 +156,9 @@ int lprofWriteDataImpl(ProfDataWriter *Writer,
                        const char *CountersBegin, const char *CountersEnd,
                        const char *BitmapBegin, const char *BitmapEnd,
                        VPDataReaderType *VPDataReader, const char *NamesBegin,
-                       const char *NamesEnd, int SkipNameDataWrite);
+                       const char *NamesEnd, const VTableProfData *VTableBegin,
+                       const VTableProfData *VTableEnd, const char *VNamesBegin,
+                       const char *VNamesEnd, int SkipNameDataWrite);
 
 /* Merge value profile data pointed to by SrcValueProfData into
  * in-memory profile counters pointed by to DstData.  */
diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c
index b5850e99ee37d..ad7a50dc77f44 100644
--- a/compiler-rt/lib/profile/InstrProfilingMerge.c
+++ b/compiler-rt/lib/profile/InstrProfilingMerge.c
@@ -107,6 +107,27 @@ static uintptr_t signextIfWin64(void *V) {
 #endif
 }
 
+static uint64_t
+getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) {
+  // Skip names section, vtable profile data section and vtable names section
+  // for runtime profile merge. To merge runtime addresses from multiple
+  // profiles collected from the same instrumented binary, the binary should be
+  // loaded at fixed base address (e.g., build with -no-pie, or run with ASLR
+  // disabled).
+  // In this set-up these three sections remain unchanged.
+  const uint64_t VTableSectionSize =
+      Header->NumVTables * sizeof(VTableProfData);
+  const uint64_t PaddingBytesAfterVTableSection =
+      __llvm_profile_get_num_padding_bytes(VTableSectionSize);
+  const uint64_t VNamesSize = Header->VNamesSize;
+  const uint64_t PaddingBytesAfterVNamesSize =
+      __llvm_profile_get_num_padding_bytes(VNamesSize);
+  return Header->NamesSize +
+         __llvm_profile_get_num_padding_bytes(Header->NamesSize) +
+         VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize +
+         PaddingBytesAfterVNamesSize;
+}
+
 COMPILER_RT_VISIBILITY
 int __llvm_profile_merge_from_buffer(const char *ProfileData,
                                      uint64_t ProfileSize) {
@@ -136,9 +157,9 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData,
                    Header->NumCounters * __llvm_profile_counter_entry_size();
   SrcBitmapStart = SrcCountersEnd;
   SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes;
+
   SrcValueProfDataStart =
-      SrcNameStart + Header->NamesSize +
-      __llvm_profile_get_num_padding_bytes(Header->NamesSize);
+      SrcNameStart + getDistanceFromCounterToValueProf(Header);
   if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart)
     return 1;
 
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index 19266ab6c6fb8..d2554a2702aaf 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -24,8 +24,12 @@
 #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
 #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
 #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
+#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON)
+#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON)
 #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
 #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
+#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON)
+#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON)
 #define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON)
 #define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON)
 #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON)
@@ -41,6 +45,10 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
     COMPILER_RT_WEAK;
 extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
@@ -63,6 +71,18 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) {
 COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) {
   return &PROF_NAME_STOP;
 }
+COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_vtabnames(void) {
+  return &PROF_VNAME_START;
+}
+COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) {
+  return &PROF_VNAME_STOP;
+}
+COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_begin_vtables(void) {
+  return &PROF_VTABLE_START;
+}
+COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_end_vtables(void) {
+  return &PROF_VTABLE_STOP;
+}
 COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) {
   return &PROF_CNTS_START;
 }
diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c
index 4d767d1385148..8816a71155511 100644
--- a/compiler-rt/lib/profile/InstrProfilingWriter.c
+++ b/compiler-rt/lib/profile/InstrProfilingWriter.c
@@ -250,9 +250,14 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer,
   const char *BitmapEnd = __llvm_profile_end_bitmap();
   const char *NamesBegin = __llvm_profile_begin_names();
   const char *NamesEnd = __llvm_profile_end_names();
+  const VTableProfData *VTableBegin = __llvm_profile_begin_vtables();
+  const VTableProfData *VTableEnd = __llvm_profile_end_vtables();
+  const char *VNamesBegin = __llvm_profile_begin_vtabnames();
+  const char *VNamesEnd = __llvm_profile_end_vtabnames();
   return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin,
                             CountersEnd, BitmapBegin, BitmapEnd, VPDataReader,
-                            NamesBegin, NamesEnd, SkipNameDataWrite);
+                            NamesBegin, NamesEnd, VTableBegin, VTableEnd,
+                            VNamesBegin, VNamesEnd, SkipNameDataWrite);
 }
 
 COMPILER_RT_VISIBILITY int
@@ -261,7 +266,9 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
                    const char *CountersBegin, const char *CountersEnd,
                    const char *BitmapBegin, const char *BitmapEnd,
                    VPDataReaderType *VPDataReader, const char *NamesBegin,
-                   const char *NamesEnd, int SkipNameDataWrite) {
+                   const char *NamesEnd, const VTableProfData *VTableBegin,
+                   const VTableProfData *VTableEnd, const char *VNamesBegin,
+                   const char *VNamesEnd, int SkipNameDataWrite) {
   /* Calculate size of sections. */
   const uint64_t DataSectionSize =
       __llvm_profile_get_data_size(DataBegin, DataEnd);
@@ -273,6 +280,12 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
   const uint64_t NumBitmapBytes =
       __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd);
   const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd);
+  const uint64_t NumVTables =
+      __llvm_profile_get_num_vtable(VTableBegin, VTableEnd);
+  const uint64_t VTableSectionSize =
+      __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd);
+  const uint64_t VNamesSize =
+      __llvm_profile_get_name_size(VNamesBegin, VNamesEnd);
 
   /* Create the header. */
   __llvm_profile_header Header;
@@ -280,11 +293,15 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
   /* Determine how much padding is needed before/after the counters and after
    * the names. */
   uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
-      PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes;
-  __llvm_profile_get_padding_sizes_for_counters(
-      DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize,
-      &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
-      &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames);
+      PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames,
+      PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+  if (__llvm_profile_get_padding_sizes_for_counters(
+          DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize,
+          VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters,
+          &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+          &PaddingBytesAfterNames, &PaddingBytesAfterVTable,
+          &PaddingBytesAfterVNames) == -1)
+    return -1;
 
   {
 /* Initialize header structure.  */
@@ -323,7 +340,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
       {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0},
       {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1},
       {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0},
-      {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}};
+      {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1},
+      {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0},
+      {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1},
+      {SkipNameDataWrite ? NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0},
+      {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}};
   if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData)))
     return -1;
 
diff --git a/llvm/include/llvm/Analysis/IndirectCallVisitor.h b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
index 0825e19ecd2d2..5969241a179ea 100644
--- a/llvm/include/llvm/Analysis/IndirectCallVisitor.h
+++ b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
@@ -12,27 +12,87 @@
 #ifndef LLVM_ANALYSIS_INDIRECTCALLVISITOR_H
 #define LLVM_ANALYSIS_INDIRECTCALLVISITOR_H
 
+#include "llvm/ADT/SetVector.h"
 #include "llvm/IR/InstVisitor.h"
 #include <vector>
 
 namespace llvm {
-// Visitor class that finds all indirect call.
+// Visitor class that finds indirect calls or instructions that give vtable
+// values, depending on Type.
 struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
+  enum class InstructionType {
+    kIndirectCall = 0,
+    kVTableVal = 1,
+  };
   std::vector<CallBase *> IndirectCalls;
-  PGOIndirectCallVisitor() = default;
+  std::vector<Instruction *> ProfiledAddresses;
+  PGOIndirectCallVisitor(InstructionType Type) : Type(Type) {}
+
+  // Given an indirect call instruction, try to find the following pattern
+  //
+  // %vtable = load ptr, ptr %obj
+  // %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  // %2 = load ptr, ptr %vfn
+  // %call = tail call i32 %2
+  //
+  // A heuristic is used to find the address feeding instructions.
+  static Instruction *tryGetVTableInstruction(CallBase *CB) {
+    assert(CB != nullptr && "Caller guaranteed");
+    LoadInst *LI = dyn_cast<LoadInst>(CB->getCalledOperand());
+
+    if (LI != nullptr) {
+      Value *FuncPtr = LI->getPointerOperand(); // GEP (or bitcast)
+      Value *VTablePtr = FuncPtr->stripInBoundsConstantOffsets();
+      // FIXME: Add support in the frontend so LLVM type intrinsics are
+      // emitted without LTO. This way, added intrinsics could filter
+      // non-vtable instructions and reduce instrumentation overhead.
+      // Since a non-vtable profiled address is not within the address
+      // range of vtable objects, it's stored as zero in indexed profiles.
+      // A pass that looks up a symbol with a zero hash will (almost) always
+      // find nullptr and skip the actual transformation (e.g., comparison
+      // of symbols). So the performance overhead from non-vtable profiled
+      // address is negligible if it exists at all. Comparing loaded address
+      // with symbol address guarantees correctness.
+      if (VTablePtr != nullptr && isa<Instruction>(VTablePtr)) {
+        return cast<Instruction>(VTablePtr);
+      }
+    }
+    return nullptr;
+  }
 
   void visitCallBase(CallBase &Call) {
-    if (Call.isIndirectCall())
+    if (Call.isIndirectCall()) {
       IndirectCalls.push_back(&Call);
+
+      if (Type != InstructionType::kVTableVal)
+        return;
+
+      Instruction *VPtr =
+          PGOIndirectCallVisitor::tryGetVTableInstruction(&Call);
+      if (VPtr) {
+        ProfiledAddresses.push_back(VPtr);
+      }
+    }
   }
+
+private:
+  InstructionType Type;
 };
 
-// Helper function that finds all indirect call sites.
 inline std::vector<CallBase *> findIndirectCalls(Function &F) {
-  PGOIndirectCallVisitor ICV;
+  PGOIndirectCallVisitor ICV(
+      PGOIndirectCallVisitor::InstructionType::kIndirectCall);
   ICV.visit(F);
   return ICV.IndirectCalls;
 }
+
+inline std::vector<Instruction *> findVTableAddrs(Function &F) {
+  PGOIndirectCallVisitor ICV(
+      PGOIndirectCallVisitor::InstructionType::kVTableVal);
+  ICV.visit(F);
+  return ICV.ProfiledAddresses;
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 87e7bbbd727ee..6cdceae5eeb96 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -89,6 +89,9 @@ inline StringRef getInstrProfValueProfMemOpFuncName() {
 /// Return the name prefix of variables containing instrumented function names.
 inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; }
 
+/// Return the name prefix of variables containing virtual table profile data.
+inline StringRef getInstrProfVTableVarPrefix() { return "__profvt_"; }
+
 /// Return the name prefix of variables containing per-function control data.
 inline StringRef getInstrProfDataVarPrefix() { return "__profd_"; }
 
@@ -110,6 +113,8 @@ inline StringRef getInstrProfNamesVarName() {
   return "__llvm_prf_nm";
 }
 
+inline StringRef getInstrProfVTableNamesVarName() { return "__llvm_prf_vnm"; }
+
 /// Return the name of a covarage mapping variable (internal linkage)
 /// for each instrumented source module. Such variables are allocated
 /// in the __llvm_covmap section.
@@ -246,6 +251,9 @@ Error collectGlobalObjectNameStrings(ArrayRef<std::string> NameStrs,
 Error collectPGOFuncNameStrings(ArrayRef<GlobalVariable *> NameVars,
                                 std::string &Result, bool doCompression = true);
 
+Error collectVTableStrings(ArrayRef<GlobalVariable *> VTables,
+                           std::string &Result, bool doCompression);
+
 /// Check if INSTR_PROF_RAW_VERSION_VAR is defined. This global is only being
 /// set in IR PGO compilation.
 bool isIRPGOFlagSet(const Module *M);
@@ -269,13 +277,15 @@ void annotateValueSite(Module &M, Instruction &Inst,
                        uint32_t MaxMDCount = 3);
 
 /// Same as the above interface but using an ArrayRef, as well as \p Sum.
+/// This function will not annotate !prof metadata on the instruction if the
+/// referenced array is empty.
 void annotateValueSite(Module &M, Instruction &Inst,
                        ArrayRef<InstrProfValueData> VDs, uint64_t Sum,
                        InstrProfValueKind ValueKind, uint32_t MaxMDCount);
 
 /// Extract the value profile data from \p Inst which is annotated with
 /// value profile meta data. Return false if there is no value data annotated,
-/// otherwise  return true.
+/// otherwise return true.
 bool getValueProfDataFromInst(const Instruction &Inst,
                               InstrProfValueKind ValueKind,
                               uint32_t MaxNumValueData,
@@ -283,11 +293,23 @@ bool getValueProfDataFromInst(const Instruction &Inst,
                               uint32_t &ActualNumValueData, uint64_t &TotalC,
                               bool GetNoICPValue = false);
 
+/// Extract the value profile data from \p Inst and returns them if \p Inst is
+/// annotated with value profile data. Returns nullptr otherwise. It's similar
+/// to `getValueProfDataFromInst` above except that an array is allocated only
+/// after a preliminary checking that the value profiles of kind `ValueKind`
+/// exist.
+std::unique_ptr<InstrProfValueData[]>
+getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind,
+                         uint32_t MaxNumValueData, uint32_t &ActualNumValueData,
+                         uint64_t &TotalC, bool GetNoICPValue = false);
+
 inline StringRef getPGOFuncNameMetadataName() { return "PGOFuncName"; }
 
 /// Return the PGOFuncName meta data associated with a function.
 MDNode *getPGOFuncNameMetadata(const Function &F);
 
+std::string getPGOName(const GlobalVariable &V, bool InLTO = false);
+
 /// Create the PGOFuncName meta data if PGOFuncName is different from
 /// function's raw name. This should only apply to internal linkage functions
 /// declared by users only.
@@ -295,7 +317,7 @@ void createPGOFuncNameMetadata(Function &F, StringRef PGOFuncName);
 
 /// Check if we can use Comdat for profile variables. This will eliminate
 /// the duplicated profile variables for Comdat functions.
-bool needsComdatForCounter(const Function &F, const Module &M);
+bool needsComdatForCounter(const GlobalValue &GV, const Module &M);
 
 /// An enum describing the attributes of an instrumented profile.
 enum class InstrProfKind {
@@ -429,20 +451,36 @@ uint64_t ComputeHash(StringRef K);
 class InstrProfSymtab {
 public:
   using AddrHashMap = std::vector<std::pair<uint64_t, uint64_t>>;
+  using RangeHashMap =
+      std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>>;
 
 private:
   StringRef Data;
   uint64_t Address = 0;
-  // Unique name strings.
+  // Unique name strings. Used to ensure entries in MD5NameMap (a vector that's
+  // going to be sorted) has unique MD5 keys in the first place.
   StringSet<> NameTab;
+  // Records the unique virtual table names. This is used by InstrProfWriter to
+  // write out an on-disk chained hash table of virtual table names.
+  // InstrProfWriter stores per function profile data (keyed by function names)
+  // so it doesn't use a StringSet for function names.
+  StringSet<> VTableNames;
   // A map from MD5 keys to function name strings.
   std::vector<std::pair<uint64_t, StringRef>> MD5NameMap;
+  // A map from MD5 keys to virtual table definitions. Only populated when
+  // building the Symtab from a module.
+  std::vector<std::pair<uint64_t, GlobalVariable *>> MD5VTableMap;
   // A map from MD5 keys to function define. We only populate this map
   // when build the Symtab from a Module.
   std::vector<std::pair<uint64_t, Function *>> MD5FuncMap;
   // A map from function runtime address to function name MD5 hash.
   // This map is only populated and used by raw instr profile reader.
   AddrHashMap AddrToMD5Map;
+  // A map from virtual table runtime address range to vtable name MD5 hash.
+  // This map is only populated and used by raw instr profile reader.
+  // This is a different map from 'AddrToMD5Map' for readability and
+  // debuggability.
+  RangeHashMap VTableAddrRangeToMD5Map;
   bool Sorted = false;
 
   static StringRef getExternalSymbol() {
@@ -470,9 +508,19 @@ class InstrProfSymtab {
 
   /// \c NameStrings is a string composed of one of more sub-strings
   ///  encoded in the format described in \c collectPGOFuncNameStrings.
-  /// This method is a wrapper to \c readPGOFuncNameStrings method.
+  /// This method is a wrapper to \c readAndDecodeStrings method.
   Error create(StringRef NameStrings);
 
+  /// \c FuncNameStrings is a string composed of one or more encoded function
+  /// name strings, and \c VTableNameStrings is composed of one or more encoded
+  /// vtable names. This function is a wrapper to \c readAndDecodeStrings
+  /// method.
+  Error create(StringRef FuncNameStrings, StringRef VTableNameStrings);
+
+  /// Initialize 'this' with the set of vtable names encoded in
+  /// \c CompressedVTableNames.
+  Error initVTableNamesFromCompressedStrings(StringRef CompressedVTableNames);
+
   /// This interface is used by reader of CoverageMapping test
   /// format.
   inline Error create(StringRef D, uint64_t BaseAddr);
@@ -485,32 +533,70 @@ class InstrProfSymtab {
 
   /// Create InstrProfSymtab from a set of names iteratable from
   /// \p IterRange. This interface is used by IndexedProfReader.
-  template <typename NameIterRange> Error create(const NameIterRange &IterRange);
-
-  /// Update the symtab by adding \p FuncName to the table. This interface
-  /// is used by the raw and text profile readers.
-  Error addFuncName(StringRef FuncName) {
-    if (FuncName.empty())
+  template <typename NameIterRange>
+  Error create(const NameIterRange &IterRange);
+
+  /// Create InstrProfSymtab from the function names iterable from
+  /// \p FuncIterRange and the vtable names iterable from
+  /// \p VTableIterRange. This interface is used by IndexedProfReader.
+  template <typename FuncNameIterRange, typename VTableNameIterRange>
+  Error create(const FuncNameIterRange &FuncIterRange,
+               const VTableNameIterRange &VTableIterRange);
+
+  Error addSymbolName(StringRef SymbolName) {
+    if (SymbolName.empty())
       return make_error<InstrProfError>(instrprof_error::malformed,
-                                        "function name is empty");
-    auto Ins = NameTab.insert(FuncName);
+                                        "symbol name is empty");
+
+    // Insert into NameTab so that MD5NameMap (a vector that will be sorted)
+    // won't have duplicated entries in the first place.
+    auto Ins = NameTab.insert(SymbolName);
     if (Ins.second) {
       MD5NameMap.push_back(std::make_pair(
-          IndexedInstrProf::ComputeHash(FuncName), Ins.first->getKey()));
+          IndexedInstrProf::ComputeHash(SymbolName), Ins.first->getKey()));
       Sorted = false;
     }
     return Error::success();
   }
 
+  /// The method name is kept since there are many callers.
+  /// It just forwards to 'addSymbolName'.
+  Error addFuncName(StringRef FuncName) { return addSymbolName(FuncName); }
+
+  /// Adds VTableName as a known symbol, and inserts it to a map that
+  /// tracks all vtable names.
+  Error addVTableName(StringRef VTableName) {
+    if (Error E = addSymbolName(VTableName))
+      return E;
+
+    // Record VTableName. InstrProfWriter uses this map. The comment around
+    // class member explains why.
+    VTableNames.insert(VTableName);
+    return Error::success();
+  }
+
+  const StringSet<> &getVTableNames() const { return VTableNames; }
+
   /// Map a function address to its name's MD5 hash. This interface
   /// is only used by the raw profiler reader.
   void mapAddress(uint64_t Addr, uint64_t MD5Val) {
     AddrToMD5Map.push_back(std::make_pair(Addr, MD5Val));
   }
 
+  /// Map the address range (i.e., [start_address, end_address]) of a variable
+  /// to its name's MD5 hash. This interface is only used by the raw profile
+  /// reader.
+  void mapVTableAddress(uint64_t StartAddr, uint64_t EndAddr, uint64_t MD5Val) {
+    VTableAddrRangeToMD5Map.push_back(
+        std::make_pair(std::make_pair(StartAddr, EndAddr), MD5Val));
+  }
+
   /// Return a function's hash, or 0, if the function isn't in this SymTab.
   uint64_t getFunctionHashFromAddress(uint64_t Address);
 
+  /// Return a vtable's hash, or 0 if the vtable doesn't exist in this SymTab.
+  uint64_t getVTableHashFromAddress(uint64_t Address);
+
   /// Return function's PGO name from the function name's symbol
   /// address in the object file. If an error occurs, return
   /// an empty string.
@@ -532,6 +618,8 @@ class InstrProfSymtab {
 
   /// Return function from the name's md5 hash. Return nullptr if not found.
   inline Function *getFunction(uint64_t FuncMD5Hash);
+  /// Return vtable from the name's MD5 hash. Return nullptr if not found.
+  inline GlobalVariable *getGlobalVariable(uint64_t GlobalVariableMD5Hash);
 
   /// Return the name section data.
   inline StringRef getNameData() const { return Data; }
@@ -556,6 +644,23 @@ Error InstrProfSymtab::create(const NameIterRange &IterRange) {
   return Error::success();
 }
 
+template <typename FuncNameIterRange, typename VTableNameIterRange>
+Error InstrProfSymtab::create(const FuncNameIterRange &FuncIterRange,
+                              const VTableNameIterRange &VTableIterRange) {
+  for (auto Name : FuncIterRange)
+    if (Error E = addFuncName(Name))
+      return E;
+
+  for (auto VTableName : VTableIterRange) {
+    if (Error E = addVTableName(VTableName)) {
+      return E;
+    }
+  }
+
+  finalizeSymtab();
+  return Error::success();
+}
+
 void InstrProfSymtab::finalizeSymtab() {
   if (Sorted)
     return;
@@ -564,6 +669,13 @@ void InstrProfSymtab::finalizeSymtab() {
   llvm::sort(AddrToMD5Map, less_first());
   AddrToMD5Map.erase(std::unique(AddrToMD5Map.begin(), AddrToMD5Map.end()),
                      AddrToMD5Map.end());
+  // VTable object address ranges should not overlap, so sorting by either
+  // beginning address or end address is fine.
+  llvm::sort(VTableAddrRangeToMD5Map, less_first());
+  // std::unique uses == operator for std::pair.
+  VTableAddrRangeToMD5Map.erase(std::unique(VTableAddrRangeToMD5Map.begin(),
+                                            VTableAddrRangeToMD5Map.end()),
+                                VTableAddrRangeToMD5Map.end());
   Sorted = true;
 }
 
@@ -594,6 +706,19 @@ Function* InstrProfSymtab::getFunction(uint64_t FuncMD5Hash) {
   return nullptr;
 }
 
+GlobalVariable *
+InstrProfSymtab::getGlobalVariable(uint64_t GlobalVariableMD5Hash) {
+  finalizeSymtab();
+  auto Result =
+      llvm::lower_bound(MD5VTableMap, GlobalVariableMD5Hash,
+                        [](const std::pair<uint64_t, GlobalVariable *> &LHS,
+                           uint64_t RHS) { return LHS.first < RHS; });
+
+  if (Result != MD5VTableMap.end() && Result->first == GlobalVariableMD5Hash)
+    return Result->second;
+  return nullptr;
+}
+
 // To store the sums of profile count values, or the percentage of
 // the sums of the total count values.
 struct CountSumOrPercent {
@@ -820,6 +945,7 @@ struct InstrProfRecord {
   struct ValueProfData {
     std::vector<InstrProfValueSiteRecord> IndirectCallSites;
     std::vector<InstrProfValueSiteRecord> MemOPSizes;
+    std::vector<InstrProfValueSiteRecord> VTableTargets;
   };
   std::unique_ptr<ValueProfData> ValueData;
 
@@ -842,6 +968,8 @@ struct InstrProfRecord {
       return ValueData->IndirectCallSites;
     case IPVK_MemOPSize:
       return ValueData->MemOPSizes;
+    case IPVK_VTableTarget:
+      return ValueData->VTableTargets;
     default:
       llvm_unreachable("Unknown value kind!");
     }
@@ -856,6 +984,8 @@ struct InstrProfRecord {
       return ValueData->IndirectCallSites;
     case IPVK_MemOPSize:
       return ValueData->MemOPSizes;
+    case IPVK_VTableTarget:
+      return ValueData->VTableTargets;
     default:
       llvm_unreachable("Unknown value kind!");
     }
@@ -1025,7 +1155,9 @@ enum ProfVersion {
   Version10 = 10,
   // An additional field is used for bitmap bytes.
   Version11 = 11,
-  // The current version is 11.
+  // VTable profiling.
+  Version12 = 12,
+  // The current version is 12.
   CurrentVersion = INSTR_PROF_INDEX_VERSION
 };
 const uint64_t Version = ProfVersion::CurrentVersion;
@@ -1046,6 +1178,7 @@ struct Header {
   uint64_t MemProfOffset;
   uint64_t BinaryIdOffset;
   uint64_t TemporalProfTracesOffset;
+  uint64_t VTableNamesOffset; // Organize virtual table names.
   // New fields should only be added at the end to ensure that the size
   // computation is correct. The methods below need to be updated to ensure that
   // the new field is read correctly.
@@ -1182,8 +1315,13 @@ template <> inline uint64_t getMagic<uint32_t>() {
 // It should also match the synthesized type in
 // Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters.
 template <class IntPtrT> struct alignas(8) ProfileData {
-  #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name;
-  #include "llvm/ProfileData/InstrProfData.inc"
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+template <class IntPtrT> struct alignas(8) VTableProfileData {
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
 };
 
 // File header structure of the LLVM profile data in raw format.
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 25df899b3f361..77720aba3eb48 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -94,6 +94,22 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
 
+#ifndef INSTR_PROF_VTABLE_DATA
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_VTABLE_DATA_DEFINED
+#endif
+INSTR_PROF_VTABLE_DATA(
+    const uint64_t, llvm::Type::getInt64Ty(Ctx), VTableNameHash,
+    ConstantInt::get(llvm::Type::getInt64Ty(Ctx),
+                     IndexedInstrProf::ComputeHash(PGOVTableName)))
+INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx),
+                       VTablePointer, VTableAddr)
+INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize,
+                       ConstantInt::get(llvm::Type::getInt32Ty(Ctx),
+                                        VTableSizeVal))
+#undef INSTR_PROF_VTABLE_DATA
+/* INSTR_PROF_VTABLE_DATA end. */
 
 /* This is an internal data structure used by value profiler. It
  * is defined here to allow serialization code sharing by LLVM
@@ -145,6 +161,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
 INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta,
                       (uintptr_t)BitmapBegin - (uintptr_t)DataBegin)
 INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
 INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 #undef INSTR_PROF_RAW_HEADER
 /* INSTR_PROF_RAW_HEADER  end */
@@ -186,13 +204,14 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx))
 VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target")
 /* For memory intrinsic functions size profiling. */
 VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size")
+VALUE_PROF_KIND(IPVK_VTableTarget, 2, "vtable target")
 /* These two kinds must be the last to be
  * declared. This is to make sure the string
  * array created with the template can be
  * indexed with the kind value.
  */
 VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first")
-VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last")
+VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last")
 
 #undef VALUE_PROF_KIND
 /* VALUE_PROF_KIND end */
@@ -267,7 +286,6 @@ COVMAP_HEADER(uint32_t, Int32Ty, Version, \
 #undef COVMAP_HEADER
 /* COVMAP_HEADER end.  */
 
-
 #ifdef INSTR_PROF_SECT_ENTRY
 #define INSTR_PROF_DATA_DEFINED
 INSTR_PROF_SECT_ENTRY(IPSK_data, \
@@ -282,12 +300,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
 INSTR_PROF_SECT_ENTRY(IPSK_name, \
                       INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \
                       INSTR_PROF_NAME_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vname, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \
+                      INSTR_PROF_VNAME_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_vals, \
                       INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \
                       INSTR_PROF_VALS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \
                       INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \
                       INSTR_PROF_VNODES_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vtab, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \
+                      INSTR_PROF_VTAB_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_covmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \
                       INSTR_PROF_COVMAP_COFF, "__LLVM_COV,")
@@ -307,7 +331,6 @@ INSTR_PROF_SECT_ENTRY(IPSK_covname, \
 #undef INSTR_PROF_SECT_ENTRY
 #endif
 
-
 #ifdef INSTR_PROF_VALUE_PROF_DATA
 #define INSTR_PROF_DATA_DEFINED
 
@@ -347,7 +370,7 @@ typedef struct ValueProfRecord {
   /*!
    * Return the number of value sites.
    */
-  uint32_t getNumValueSites() const { return NumValueSites; }
+  uint32_t getNumValueSites() const {  return NumValueSites; }
   /*!
    * Read data from this record and save it to Record.
    */
@@ -479,7 +502,6 @@ getValueProfRecordHeaderSize(uint32_t NumValueSites);
 #undef INSTR_PROF_VALUE_PROF_DATA
 #endif  /* INSTR_PROF_VALUE_PROF_DATA */
 
-
 #ifdef INSTR_PROF_COMMON_API_IMPL
 #define INSTR_PROF_DATA_DEFINED
 #ifdef __cplusplus
@@ -663,9 +685,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
 /* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 9
+#define INSTR_PROF_RAW_VERSION 10
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 11
+#define INSTR_PROF_INDEX_VERSION 12
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 6
 
@@ -703,10 +725,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
    than WIN32 */
 #define INSTR_PROF_DATA_COMMON __llvm_prf_data
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
+#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
+#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab
 #define INSTR_PROF_COVMAP_COMMON __llvm_covmap
 #define INSTR_PROF_COVFUN_COMMON __llvm_covfun
 #define INSTR_PROF_COVDATA_COMMON __llvm_covdata
@@ -717,10 +741,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
  */
 #define INSTR_PROF_DATA_COFF ".lprfd$M"
 #define INSTR_PROF_NAME_COFF ".lprfn$M"
+#define INSTR_PROF_VNAME_COFF ".lprfvn$M"
 #define INSTR_PROF_CNTS_COFF ".lprfc$M"
 #define INSTR_PROF_BITS_COFF ".lprfb$M"
 #define INSTR_PROF_VALS_COFF ".lprfv$M"
 #define INSTR_PROF_VNODES_COFF ".lprfnd$M"
+#define INSTR_PROF_VTAB_COFF ".lprfvt$M"
 #define INSTR_PROF_COVMAP_COFF ".lcovmap$M"
 #define INSTR_PROF_COVFUN_COFF ".lcovfun$M"
 /* Since cov data and cov names sections are not allocated, we don't need to
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 87f15639a2c3c..c1edd7afb75bd 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -326,12 +326,16 @@ class RawInstrProfReader : public InstrProfReader {
   uint64_t NamesDelta;
   const RawInstrProf::ProfileData<IntPtrT> *Data;
   const RawInstrProf::ProfileData<IntPtrT> *DataEnd;
+  const RawInstrProf::VTableProfileData<IntPtrT> *VTableBegin = nullptr;
+  const RawInstrProf::VTableProfileData<IntPtrT> *VTableEnd = nullptr;
   const char *CountersStart;
   const char *CountersEnd;
   const char *BitmapStart;
   const char *BitmapEnd;
   const char *NamesStart;
   const char *NamesEnd;
+  const char *VNamesStart = nullptr;
+  const char *VNamesEnd = nullptr;
   // After value profile is all read, this pointer points to
   // the header of next profile data (if exists)
   const uint8_t *ValueDataStart;
@@ -622,6 +626,12 @@ class InstrProfReaderIndex : public InstrProfReaderIndexBase {
   InstrProfKind getProfileKind() const override;
 
   Error populateSymtab(InstrProfSymtab &Symtab) override {
+    // FIXME: the create method calls 'finalizeSymtab' and sorts a bunch of
+    // arrays/maps. Since there are other data sources other than 'HashTable' to
+    // populate a symtab, it might make sense to have something like this
+    // 1. Let each data source populate Symtab and init the arrays/maps without
+    // calling 'finalizeSymtab'
+    // 2. Call 'finalizeSymtab' once to get all arrays/maps sorted if needed.
     return Symtab.create(HashTable->keys());
   }
 };
@@ -656,6 +666,16 @@ class IndexedInstrProfReader : public InstrProfReader {
   std::unique_ptr<MemProfRecordHashTable> MemProfRecordTable;
   /// MemProf frame profile data on-disk indexed via frame id.
   std::unique_ptr<MemProfFrameHashTable> MemProfFrameTable;
+  /// The reader itself doesn't decompress vtable names. A compiler that reads
+  /// indexed profiles could construct symtab from module IR so it doesn't need
+  /// the decompressed names.
+  /// When a symtab is constructed from profiles by llvm-profdata, the list of
+  /// names could be decompressed based on `VTableNamePtr` and
+  /// `CompressedVTableNamesLen`.
+  /// VTableNamePtr points to the beginning of compressed vtable names.
+  const char *VTableNamePtr = nullptr;
+  /// The length of compressed vtable names.
+  uint64_t CompressedVTableNamesLen = 0;
   /// Total size of binary ids.
   uint64_t BinaryIdsSize{0};
   /// Start address of binary id length and data pairs.
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 047b14f223bd9..049fa36bb53f5 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -63,6 +63,9 @@ class InstrProfWriter {
   // List of binary ids.
   std::vector<llvm::object::BuildID> BinaryIds;
 
+  // Read the vtable names from raw instr profile reader.
+  StringSet<> VTableNames;
+
   // An enum describing the attributes of the profile.
   InstrProfKind ProfileKind = InstrProfKind::Unknown;
   // Use raw pointer here for the incomplete type object.
@@ -84,6 +87,7 @@ class InstrProfWriter {
   void addRecord(NamedInstrProfRecord &&I, function_ref<void(Error)> Warn) {
     addRecord(std::move(I), 1, Warn);
   }
+  void addVTableName(StringRef VTableName) { VTableNames.insert(VTableName); }
 
   /// Add \p SrcTraces using reservoir sampling where \p SrcStreamSize is the
   /// total number of temporal profiling traces the source has seen.
diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index ebfa1c8fc08e1..ab53717eb889a 100644
--- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -45,6 +45,10 @@ static cl::opt<unsigned>
                      cl::desc("Max number of promotions for a single indirect "
                               "call callsite"));
 
+cl::opt<unsigned> MaxNumVTableAnnotations(
+    "icp-max-num-vtables", cl::init(6), cl::Hidden,
+    cl::desc("Max number of vtables annotated for a vtable load instruction."));
+
 ICallPromotionAnalysis::ICallPromotionAnalysis() {
   ValueDataArray = std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
 }
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 1f15e94783240..3ad0bab827a51 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -82,6 +82,8 @@ static cl::opt<std::string> ModuleSummaryDotFile(
 
 extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
 
+extern cl::opt<unsigned> MaxNumVTableAnnotations;
+
 // Walk through the operands of a given User via worklist iteration and populate
 // the set of GlobalValue references encountered. Invoked either on an
 // Instruction or a GlobalVariable (which walks its initializer).
@@ -124,6 +126,24 @@ static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
         Worklist.push_back(Operand);
     }
   }
+
+  const Instruction *I = dyn_cast<Instruction>(CurUser);
+  if (I) {
+    uint32_t ActualNumValueData = 0;
+    uint64_t TotalCount = 0;
+    // MaxNumVTableAnnotations is the maximum number of vtables annotated on
+    // the instruction.
+    auto ValueDataArray =
+        getValueProfDataFromInst(*I, IPVK_VTableTarget, MaxNumVTableAnnotations,
+                                 ActualNumValueData, TotalCount);
+
+    if (ValueDataArray.get()) {
+      for (uint32_t j = 0; j < ActualNumValueData; j++) {
+        RefEdges.insert(Index.getOrInsertValueInfo(/* VTableGUID = */
+                                                   ValueDataArray[j].Value));
+      }
+    }
+  }
   return HasBlockAddress;
 }
 
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 13be0b0c3307f..7686e32b69305 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -199,7 +199,7 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
     for (const auto &GUIDSummaryLists : *Index)
       // Examine all summaries for this GUID.
       for (auto &Summary : GUIDSummaryLists.second.SummaryList)
-        if (auto FS = dyn_cast<FunctionSummary>(Summary.get()))
+        if (auto FS = dyn_cast<FunctionSummary>(Summary.get())) {
           // For each call in the function summary, see if the call
           // is to a GUID (which means it is for an indirect call,
           // otherwise we would have a Value for it). If so, synthesize
@@ -207,6 +207,15 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
           for (auto &CallEdge : FS->calls())
             if (!CallEdge.first.haveGVs() || !CallEdge.first.getValue())
               assignValueId(CallEdge.first.getGUID());
+
+          // For each variable referenced in the function summary, see if the
+          // variable is represented by a GUID (as opposed to a Value for a
+          // declaration or definition in the module). If so, synthesize a
+          // value id.
+          for (auto &RefEdge : FS->refs())
+            if ((!RefEdge.haveGVs() || !RefEdge.getValue()))
+              assignValueId(RefEdge.getGUID());
+        }
   }
 
 protected:
@@ -4071,7 +4080,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
   NameVals.push_back(SpecialRefCnts.second); // worefcnt
 
   for (auto &RI : FS->refs())
-    NameVals.push_back(VE.getValueID(RI.getValue()));
+    NameVals.push_back(getValueId(RI));
 
   const bool UseRelBFRecord =
       WriteRelBFToSummary && !F.hasProfileData() &&
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 2640027455e0d..91e79e8b2e9ad 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -219,6 +219,12 @@ cl::opt<bool> DoInstrProfNameCompression(
     "enable-name-compression",
     cl::desc("Enable name/filename string compression"), cl::init(true));
 
+cl::opt<bool> EnableVTableValueProfiling(
+    "enable-vtable-value-profiling", cl::init(false),
+    cl::desc("If true, the virtual table address will be instrumented to know "
+             "the types of a C++ pointer. The information is used in indirect "
+             "call promotion to do selective vtable-based comparison."));
+
 std::string getInstrProfSectionName(InstrProfSectKind IPSK,
                                     Triple::ObjectFormatType OF,
                                     bool AddSegmentInfo) {
@@ -378,6 +384,13 @@ std::string getPGOFuncName(const Function &F, bool InLTO, uint64_t Version) {
   return getPGOFuncName(F.getName(), GlobalValue::ExternalLinkage, "");
 }
 
+std::string getPGOName(const GlobalVariable &V, bool InLTO) {
+  // PGONameMetadata should be set by the compiler at profile use time
+  // and read by symtab creation to look up symbols corresponding to
+  // an MD5 hash.
+  return getIRPGOObjectName(V, InLTO, nullptr /* PGONameMetadata */);
+}
+
 // See getIRPGOFuncName() for a discription of the format.
 std::pair<StringRef, StringRef>
 getParsedIRPGOFuncName(StringRef IRPGOFuncName) {
@@ -460,6 +473,17 @@ Error InstrProfSymtab::create(Module &M, bool InLTO) {
     if (Error E = addFuncWithName(F, getPGOFuncName(F, InLTO)))
       return E;
   }
+
+  SmallVector<MDNode *, 2> Types;
+  for (GlobalVariable &G : M.globals()) {
+    if (!G.hasName())
+      continue;
+    Types.clear();
+    G.getMetadata(LLVMContext::MD_type, Types);
+    if (!Types.empty()) {
+      MD5VTableMap.emplace_back(G.getGUID(), &G);
+    }
+  }
   Sorted = false;
   finalizeSymtab();
   return Error::success();
@@ -518,6 +542,25 @@ Error InstrProfSymtab::create(StringRef NameStrings) {
       std::bind(&InstrProfSymtab::addFuncName, this, std::placeholders::_1));
 }
 
+Error InstrProfSymtab::create(StringRef FuncNameStrings,
+                              StringRef VTableNameStrings) {
+  if (Error E = readAndDecodeStrings(FuncNameStrings,
+                                     std::bind(&InstrProfSymtab::addFuncName,
+                                               this, std::placeholders::_1)))
+    return E;
+
+  return readAndDecodeStrings(
+      VTableNameStrings,
+      std::bind(&InstrProfSymtab::addVTableName, this, std::placeholders::_1));
+}
+
+Error InstrProfSymtab::initVTableNamesFromCompressedStrings(
+    StringRef CompressedVTableStrings) {
+  return readAndDecodeStrings(
+      CompressedVTableStrings,
+      std::bind(&InstrProfSymtab::addVTableName, this, std::placeholders::_1));
+}
+
 Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName) {
   if (Error E = addFuncName(PGOFuncName))
     return E;
@@ -550,6 +593,28 @@ Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName) {
   return Error::success();
 }
 
+uint64_t InstrProfSymtab::getVTableHashFromAddress(uint64_t Address) {
+  finalizeSymtab();
+  auto It = lower_bound(
+      VTableAddrRangeToMD5Map, Address,
+      [](std::pair<std::pair<uint64_t, uint64_t>, uint64_t> VTableRangeAddr,
+         uint64_t Addr) {
+        // Find the first address range whose end address is larger than
+        // `Addr`. Smaller-than-or-equal-to is used because a profiled address
+        // within a vtable lies in [start-address, end-address).
+        return VTableRangeAddr.first.second <= Addr;
+      });
+
+  // Returns the MD5 hash if Address is within the address range of an entry.
+  if (It != VTableAddrRangeToMD5Map.end() && It->first.first <= Address) {
+    return It->second;
+  }
+  // The virtual table address collected from value profiler could be defined
+  // in another module that is not instrumented. Force the value to be 0 in
+  // this case.
+  return 0;
+}
+
 uint64_t InstrProfSymtab::getFunctionHashFromAddress(uint64_t Address) {
   finalizeSymtab();
   auto It = partition_point(AddrToMD5Map, [=](std::pair<uint64_t, uint64_t> A) {
@@ -626,6 +691,17 @@ Error collectPGOFuncNameStrings(ArrayRef<GlobalVariable *> NameVars,
       NameStrs, compression::zlib::isAvailable() && doCompression, Result);
 }
 
+Error collectVTableStrings(ArrayRef<GlobalVariable *> VTables,
+                           std::string &Result, bool doCompression) {
+  std::vector<std::string> VTableNameStrs;
+  for (auto *VTable : VTables) {
+    VTableNameStrs.push_back(getPGOName(*VTable));
+  }
+  return collectGlobalObjectNameStrings(
+      VTableNameStrs, compression::zlib::isAvailable() && doCompression,
+      Result);
+}
+
 void InstrProfRecord::accumulateCounts(CountSumOrPercent &Sum) const {
   uint64_t FuncSum = 0;
   Sum.NumEntries += Counts.size();
@@ -888,6 +964,9 @@ uint64_t InstrProfRecord::remapValue(uint64_t Value, uint32_t ValueKind,
   if (ValueKind == IPVK_IndirectCallTarget)
     return SymTab->getFunctionHashFromAddress(Value);
 
+  if (ValueKind == IPVK_VTableTarget)
+    return SymTab->getVTableHashFromAddress(Value);
+
   return Value;
 }
 
@@ -1181,6 +1260,8 @@ void annotateValueSite(Module &M, Instruction &Inst,
                        ArrayRef<InstrProfValueData> VDs,
                        uint64_t Sum, InstrProfValueKind ValueKind,
                        uint32_t MaxMDCount) {
+  if (VDs.empty())
+    return;
   LLVMContext &Ctx = M.getContext();
   MDBuilder MDHelper(Ctx);
   SmallVector<Metadata *, 3> Vals;
@@ -1206,46 +1287,44 @@ void annotateValueSite(Module &M, Instruction &Inst,
   Inst.setMetadata(LLVMContext::MD_prof, MDNode::get(Ctx, Vals));
 }
 
-bool getValueProfDataFromInst(const Instruction &Inst,
-                              InstrProfValueKind ValueKind,
-                              uint32_t MaxNumValueData,
-                              InstrProfValueData ValueData[],
-                              uint32_t &ActualNumValueData, uint64_t &TotalC,
-                              bool GetNoICPValue) {
+MDNode *mayHaveValueProfileOfKind(const Instruction &Inst,
+                                  InstrProfValueKind ValueKind) {
   MDNode *MD = Inst.getMetadata(LLVMContext::MD_prof);
   if (!MD)
-    return false;
+    return nullptr;
 
-  unsigned NOps = MD->getNumOperands();
+  if (MD->getNumOperands() < 5)
+    return nullptr;
 
-  if (NOps < 5)
-    return false;
-
-  // Operand 0 is a string tag "VP":
   MDString *Tag = cast<MDString>(MD->getOperand(0));
-  if (!Tag)
-    return false;
-
-  if (!Tag->getString().equals("VP"))
-    return false;
+  if (!Tag || !Tag->getString().equals("VP"))
+    return nullptr;
 
   // Now check kind:
   ConstantInt *KindInt = mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
   if (!KindInt)
-    return false;
+    return nullptr;
   if (KindInt->getZExtValue() != ValueKind)
-    return false;
+    return nullptr;
 
+  return MD;
+}
+
+static bool getValueProfDataFromInstImpl(const MDNode *const MD,
+                                         const uint32_t MaxNumDataWant,
+                                         InstrProfValueData ValueData[],
+                                         uint32_t &ActualNumValueData,
+                                         uint64_t &TotalC, bool GetNoICPValue) {
+  const unsigned NOps = MD->getNumOperands();
   // Get total count
   ConstantInt *TotalCInt = mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
   if (!TotalCInt)
     return false;
   TotalC = TotalCInt->getZExtValue();
-
   ActualNumValueData = 0;
 
   for (unsigned I = 3; I < NOps; I += 2) {
-    if (ActualNumValueData >= MaxNumValueData)
+    if (ActualNumValueData >= MaxNumDataWant)
       break;
     ConstantInt *Value = mdconst::dyn_extract<ConstantInt>(MD->getOperand(I));
     ConstantInt *Count =
@@ -1262,6 +1341,36 @@ bool getValueProfDataFromInst(const Instruction &Inst,
   return true;
 }
 
+std::unique_ptr<InstrProfValueData[]>
+getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind,
+                         uint32_t MaxNumValueData, uint32_t &ActualNumValueData,
+                         uint64_t &TotalC, bool GetNoICPValue) {
+  MDNode *MD = mayHaveValueProfileOfKind(Inst, ValueKind);
+  if (!MD)
+    return nullptr;
+  auto ValueDataArray = std::make_unique<InstrProfValueData[]>(MaxNumValueData);
+  if (!getValueProfDataFromInstImpl(MD, MaxNumValueData, ValueDataArray.get(),
+                                    ActualNumValueData, TotalC, GetNoICPValue))
+    return nullptr;
+  return ValueDataArray;
+}
+
+// FIXME: Migrate existing callers to the function above that returns an
+// array.
+bool getValueProfDataFromInst(const Instruction &Inst,
+                              InstrProfValueKind ValueKind,
+                              uint32_t MaxNumValueData,
+                              InstrProfValueData ValueData[],
+                              uint32_t &ActualNumValueData, uint64_t &TotalC,
+                              bool GetNoICPValue) {
+  MDNode *MD = mayHaveValueProfileOfKind(Inst, ValueKind);
+  if (!MD)
+    return false;
+  return getValueProfDataFromInstImpl(MD, MaxNumValueData, ValueData,
+                                      ActualNumValueData, TotalC,
+                                      GetNoICPValue);
+}
+
 MDNode *getPGOFuncNameMetadata(const Function &F) {
   return F.getMetadata(getPGOFuncNameMetadataName());
 }
@@ -1278,8 +1387,8 @@ void createPGOFuncNameMetadata(Function &F, StringRef PGOFuncName) {
   F.setMetadata(getPGOFuncNameMetadataName(), N);
 }
 
-bool needsComdatForCounter(const Function &F, const Module &M) {
-  if (F.hasComdat())
+bool needsComdatForCounter(const GlobalValue &GV, const Module &M) {
+  if (GV.hasComdat())
     return true;
 
   if (!Triple(M.getTargetTriple()).supportsCOMDAT())
@@ -1295,7 +1404,7 @@ bool needsComdatForCounter(const Function &F, const Module &M) {
   // available_externally functions will end up being duplicated in raw profile
   // data. This can result in distorted profile as the counts of those dups
   // will be accumulated by the profile merger.
-  GlobalValue::LinkageTypes Linkage = F.getLinkage();
+  GlobalValue::LinkageTypes Linkage = GV.getLinkage();
   if (Linkage != GlobalValue::ExternalWeakLinkage &&
       Linkage != GlobalValue::AvailableExternallyLinkage)
     return false;
@@ -1451,7 +1560,7 @@ void OverlapStats::dump(raw_fd_ostream &OS) const {
   for (unsigned I = 0; I < IPVK_Last - IPVK_First + 1; I++) {
     if (Base.ValueCounts[I] < 1.0f && Test.ValueCounts[I] < 1.0f)
       continue;
-    char ProfileKindName[20];
+    char ProfileKindName[20] = {0};
     switch (I) {
     case IPVK_IndirectCallTarget:
       strncpy(ProfileKindName, "IndirectCall", 19);
@@ -1459,6 +1568,9 @@ void OverlapStats::dump(raw_fd_ostream &OS) const {
     case IPVK_MemOPSize:
       strncpy(ProfileKindName, "MemOP", 19);
       break;
+    case IPVK_VTableTarget:
+      strncpy(ProfileKindName, "VTable", 19);
+      break;
     default:
       snprintf(ProfileKindName, 19, "VP[%d]", I);
       break;
@@ -1523,9 +1635,12 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
     // When a new field is added in the header add a case statement here to
     // populate it.
     static_assert(
-        IndexedInstrProf::ProfVersion::CurrentVersion == Version11,
+        IndexedInstrProf::ProfVersion::CurrentVersion == Version12,
         "Please update the reading code below if a new field has been added, "
         "if not add a case statement to fall through to the latest version.");
+  case 12ull:
+    H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset));
+    [[fallthrough]];
   case 11ull:
     [[fallthrough]];
   case 10ull:
@@ -1551,10 +1666,13 @@ size_t Header::size() const {
     // When a new field is added to the header add a case statement here to
     // compute the size as offset of the new field + size of the new field. This
     // relies on the field being added to the end of the list.
-    static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11,
+    static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12,
                   "Please update the size computation below if a new field has "
                   "been added to the header, if not add a case statement to "
                   "fall through to the latest version.");
+  case 12ull:
+    return offsetOf(&Header::VTableNamesOffset) +
+           sizeof(Header::VTableNamesOffset);
   case 11ull:
     [[fallthrough]];
   case 10ull:
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 0d8d43daae960..4ef6823381749 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -366,6 +366,14 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
               return E;
             Value = IndexedInstrProf::ComputeHash(VD.first);
           }
+        } else if (ValueKind == IPVK_VTableTarget) {
+          if (InstrProfSymtab::isExternalSymbol(VD.first)) {
+            Value = 0;
+          } else {
+            if (Error E = Symtab->addVTableName(VD.first))
+              return E;
+            Value = IndexedInstrProf::ComputeHash(VD.first);
+          }
         } else {
           READ_NUM(VD.first, Value);
         }
@@ -533,7 +541,8 @@ Error RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) {
 
 template <class IntPtrT>
 Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) {
-  if (Error E = Symtab.create(StringRef(NamesStart, NamesEnd - NamesStart)))
+  if (Error E = Symtab.create(StringRef(NamesStart, NamesEnd - NamesStart),
+                              StringRef(VNamesStart, VNamesEnd - VNamesStart)))
     return error(std::move(E));
   for (const RawInstrProf::ProfileData<IntPtrT> *I = Data; I != DataEnd; ++I) {
     const IntPtrT FPtr = swap(I->FunctionPointer);
@@ -541,6 +550,21 @@ Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) {
       continue;
     Symtab.mapAddress(FPtr, swap(I->NameRef));
   }
+
+  if (VTableBegin != nullptr && VTableEnd != nullptr) {
+    for (const RawInstrProf::VTableProfileData<IntPtrT> *I = VTableBegin;
+         I != VTableEnd; ++I) {
+      const IntPtrT VPtr = swap(I->VTablePointer);
+      if (!VPtr)
+        continue;
+      // Map both the begin and end addresses to the name hash, since the
+      // instrumented address could be somewhere in the middle.
+      // VPtr is of type uint32_t or uint64_t, so 'VPtr + I->VTableSize' marks
+      // the end of the vtable's address range.
+      Symtab.mapVTableAddress(VPtr, VPtr + swap(I->VTableSize),
+                              swap(I->VTableNameHash));
+    }
+  }
   return success();
 }
 
@@ -582,10 +606,17 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
   auto NumBitmapBytes = swap(Header.NumBitmapBytes);
   auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes);
   auto NamesSize = swap(Header.NamesSize);
+  auto VTableNameSize = swap(Header.VNamesSize);
+  auto NumVTables = swap(Header.NumVTables);
   ValueKindLast = swap(Header.ValueKindLast);
 
   auto DataSize = NumData * sizeof(RawInstrProf::ProfileData<IntPtrT>);
-  auto PaddingSize = getNumPaddingBytes(NamesSize);
+  auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize);
+  auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize);
+
+  auto VTableSectionSize =
+      NumVTables * sizeof(RawInstrProf::VTableProfileData<IntPtrT>);
+  auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize);
 
   // Profile data starts after profile header and binary ids if exist.
   ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize;
@@ -594,7 +625,12 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
       CountersOffset + CountersSize + PaddingBytesAfterCounters;
   ptrdiff_t NamesOffset =
       BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes;
-  ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize;
+  ptrdiff_t VTableProfDataOffset =
+      NamesOffset + NamesSize + PaddingBytesAfterNames;
+  ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize +
+                               PaddingBytesAfterVTableProfData;
+  ptrdiff_t ValueDataOffset =
+      VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames;
 
   auto *Start = reinterpret_cast<const char *>(&Header);
   if (Start + ValueDataOffset > DataBuffer->getBufferEnd())
@@ -614,8 +650,14 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
     Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>(
         Start + DataOffset);
     DataEnd = Data + NumData;
+    VTableBegin =
+        reinterpret_cast<const RawInstrProf::VTableProfileData<IntPtrT> *>(
+            Start + VTableProfDataOffset);
+    VTableEnd = VTableBegin + NumVTables;
     NamesStart = Start + NamesOffset;
     NamesEnd = NamesStart + NamesSize;
+    VNamesStart = Start + VTableNameOffset;
+    VNamesEnd = VNamesStart + VTableNameSize;
   }
 
   CountersStart = Start + CountersOffset;
@@ -1260,6 +1302,19 @@ Error IndexedInstrProfReader::readHeader() {
                                         "corrupted binary ids");
   }
 
+  if (GET_VERSION(Header->formatVersion()) >= 12) {
+    uint64_t VTableNamesOffset =
+        endian::byte_swap<uint64_t, llvm::endianness::little>(
+            Header->VTableNamesOffset);
+    const unsigned char *Ptr = Start + VTableNamesOffset;
+
+    CompressedVTableNamesLen =
+        support::endian::readNext<uint64_t, llvm::endianness::little,
+                                  unaligned>(Ptr);
+
+    VTableNamePtr = (const char *)Ptr;
+  }
+
   if (GET_VERSION(Header->formatVersion()) >= 10 &&
       Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) {
     uint64_t TemporalProfTracesOffset =
@@ -1319,7 +1374,16 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() {
   if (Symtab)
     return *Symtab;
 
-  std::unique_ptr<InstrProfSymtab> NewSymtab = std::make_unique<InstrProfSymtab>();
+  std::unique_ptr<InstrProfSymtab> NewSymtab =
+      std::make_unique<InstrProfSymtab>();
+
+  if (Error E = NewSymtab->initVTableNamesFromCompressedStrings(
+          StringRef(VTableNamePtr, CompressedVTableNamesLen))) {
+    auto [ErrCode, Msg] = InstrProfError::take(std::move(E));
+    consumeError(error(ErrCode, Msg));
+  }
+
+  // finalizeSymtab is called inside populateSymtab.
   if (Error E = Index->populateSymtab(*NewSymtab)) {
     auto [ErrCode, Msg] = InstrProfError::take(std::move(E));
     consumeError(error(ErrCode, Msg));
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index d65f8fe50313d..7592c0ffd3272 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Compression.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Error.h"
@@ -455,12 +456,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   Header.MemProfOffset = 0;
   Header.BinaryIdOffset = 0;
   Header.TemporalProfTracesOffset = 0;
+  Header.VTableNamesOffset = 0;
   int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t);
 
   // Only write out all the fields except 'HashOffset', 'MemProfOffset',
-  // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the
-  // offset of these fields to allow back patching later.
-  for (int I = 0; I < N - 4; I++)
+  // 'BinaryIdOffset', `TemporalProfTracesOffset` and `VTableNamesOffset`. We
+  // need to remember the offset of these fields to allow back patching later.
+  for (int I = 0; I < N - 5; I++)
     OS.write(reinterpret_cast<uint64_t *>(&Header)[I]);
 
   // Save the location of Header.HashOffset field in \c OS.
@@ -484,6 +486,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
   uint64_t TemporalProfTracesOffset = OS.tell();
   OS.write(0);
 
+  uint64_t VTableNamesOffset = OS.tell();
+  OS.write(0);
+
   // Reserve space to write profile summary data.
   uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size();
   uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries);
@@ -604,6 +609,43 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
       OS.writeByte(0);
   }
 
+  // Emit vtable names if the format version has vtable profile metadata.
+  uint64_t VTableNamesSectionStart = 0;
+  if (IndexedInstrProf::ProfVersion::CurrentVersion >= 12) {
+    VTableNamesSectionStart = OS.tell();
+
+    std::string CompressedVTableNames;
+
+    std::vector<std::string> VTableNameStrs;
+    for (const auto &VTableName : VTableNames.keys()) {
+      VTableNameStrs.push_back(VTableName.str());
+    }
+
+    if (!VTableNameStrs.empty()) {
+      if (Error E = collectGlobalObjectNameStrings(
+              VTableNameStrs, compression::zlib::isAvailable(),
+              CompressedVTableNames))
+        return E;
+    }
+
+    uint64_t CompressedStringLen = CompressedVTableNames.length();
+
+    // Record the length of compressed string.
+    OS.write(CompressedStringLen);
+
+    // Write the chars in compressed strings.
+    for (auto &c : CompressedVTableNames)
+      OS.writeByte(static_cast<uint8_t>(c));
+
+    // Pad up to a multiple of 8.
+    // InstrProfReader could read bytes according to 'CompressedStringLen'.
+    uint64_t PaddedLength = alignTo(CompressedStringLen, 8);
+
+    for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) {
+      OS.writeByte(0);
+    }
+  }
+
   uint64_t TemporalProfTracesSectionStart = 0;
   if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile)) {
     TemporalProfTracesSectionStart = OS.tell();
@@ -647,6 +689,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
       // Patch the Header.TemporalProfTracesOffset (=0 for profiles without
       // traces).
       {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1},
+      {VTableNamesOffset, &VTableNamesSectionStart, 1},
       // Patch the summary data.
       {SummaryOffset, reinterpret_cast<uint64_t *>(TheSummary.get()),
        (int)(SummarySize / sizeof(uint64_t))},
@@ -699,7 +742,8 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) {
       std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S);
       DenseSet<uint64_t> SeenValues;
       for (uint32_t I = 0; I < ND; I++)
-        if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second)
+        if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) &&
+            !SeenValues.insert(VD[I].Value).second)
           return make_error<InstrProfError>(instrprof_error::invalid_prof);
     }
   }
@@ -747,7 +791,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash,
       OS << ND << "\n";
       std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S);
       for (uint32_t I = 0; I < ND; I++) {
-        if (VK == IPVK_IndirectCallTarget)
+        if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget)
           OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":"
              << VD[I].Count << "\n";
         else
@@ -786,6 +830,11 @@ Error InstrProfWriter::writeText(raw_fd_ostream &OS) {
     }
   }
 
+  for (const auto &VTableName : VTableNames) {
+    if (Error E = Symtab.addVTableName(VTableName.getKey()))
+      return E;
+  }
+
   if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile))
     writeTextTemporalProfTraceData(OS, Symtab);
 
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 7344fea175171..6a44a32bb34dc 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -136,11 +136,13 @@ class IndirectCallPromoter {
       const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
       uint64_t TotalCount, uint32_t NumCandidates);
 
-  // Promote a list of targets for one indirect-call callsite. Return
-  // the number of promotions.
-  uint32_t tryToPromote(CallBase &CB,
-                        const std::vector<PromotionCandidate> &Candidates,
-                        uint64_t &TotalCount);
+  // Promote a list of targets for one indirect-call callsite by comparing
+  // indirect callee with functions. Returns true if there are IR
+  // transformations and false otherwise.
+  bool tryToPromoteWithFuncCmp(
+      CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+      uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
+      uint32_t NumCandidates);
 
 public:
   IndirectCallPromoter(Function &Func, InstrProfSymtab *Symtab, bool SamplePGO,
@@ -273,9 +275,10 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
 }
 
 // Promote indirect-call to conditional direct-call for one callsite.
-uint32_t IndirectCallPromoter::tryToPromote(
+bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
     CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
-    uint64_t &TotalCount) {
+    uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
+    uint32_t NumCandidates) {
   uint32_t NumPromoted = 0;
 
   for (const auto &C : Candidates) {
@@ -287,7 +290,18 @@ uint32_t IndirectCallPromoter::tryToPromote(
     NumOfPGOICallPromotion++;
     NumPromoted++;
   }
-  return NumPromoted;
+
+  const bool Changed = (NumPromoted != 0);
+
+  if (Changed) {
+    CB.setMetadata(LLVMContext::MD_prof, nullptr);
+
+    if (TotalCount != 0)
+      annotateValueSite(*F.getParent(), CB, ICallProfDataRef.slice(NumPromoted),
+                        TotalCount, IPVK_IndirectCallTarget, NumCandidates);
+  }
+
+  return Changed;
 }
 
 // Traverse all the indirect-call callsite and get the value profile
@@ -305,19 +319,8 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
       continue;
     auto PromotionCandidates = getPromotionCandidatesForCallSite(
         *CB, ICallProfDataRef, TotalCount, NumCandidates);
-    uint32_t NumPromoted = tryToPromote(*CB, PromotionCandidates, TotalCount);
-    if (NumPromoted == 0)
-      continue;
-
-    Changed = true;
-    // Adjust the MD.prof metadata. First delete the old one.
-    CB->setMetadata(LLVMContext::MD_prof, nullptr);
-    // If all promoted, we don't need the MD.prof metadata.
-    if (TotalCount == 0 || NumPromoted == NumVals)
-      continue;
-    // Otherwise we need update with the un-promoted records back.
-    annotateValueSite(*F.getParent(), *CB, ICallProfDataRef.slice(NumPromoted),
-                      TotalCount, IPVK_IndirectCallTarget, NumCandidates);
+    Changed |= tryToPromoteWithFuncCmp(*CB, PromotionCandidates, TotalCount,
+                                       ICallProfDataRef, NumCandidates);
   }
   return Changed;
 }
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index a19b140872544..49978dac034e8 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -64,6 +64,9 @@ using namespace llvm;
 #define DEBUG_TYPE "instrprof"
 
 namespace llvm {
+// Command line option to enable vtable value profiling. Defined in
+// ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=
+extern cl::opt<bool> EnableVTableValueProfiling;
 // TODO: Remove -debug-info-correlate in next LLVM release, in favor of
 // -profile-correlate=debug-info.
 cl::opt<bool> DebugInfoCorrelate(
@@ -196,12 +199,18 @@ class InstrLowerer final {
     PerFunctionProfileData() = default;
   };
   DenseMap<GlobalVariable *, PerFunctionProfileData> ProfileDataMap;
+  // Key is virtual table variable, value is 'VTableProfData' in the form of
+  // GlobalVariable.
+  DenseMap<GlobalVariable *, GlobalVariable *> VTableDataMap;
   /// If runtime relocation is enabled, this maps functions to the load
   /// instruction that produces the profile relocation bias.
   DenseMap<const Function *, LoadInst *> FunctionToProfileBiasMap;
   std::vector<GlobalValue *> CompilerUsedVars;
   std::vector<GlobalValue *> UsedVars;
   std::vector<GlobalVariable *> ReferencedNames;
+  // The list of virtual table variables for which VTableProfData is
+  // collected.
+  std::vector<GlobalVariable *> ReferencedVTables;
   GlobalVariable *NamesVar = nullptr;
   size_t NamesSize = 0;
 
@@ -294,9 +303,15 @@ class InstrLowerer final {
   /// Create INSTR_PROF_DATA variable for counters and bitmaps.
   void createDataVariable(InstrProfCntrInstBase *Inc);
 
+  /// Create the VTableProfData for a virtual table if not already created.
+  void getOrCreateVTableProfData(GlobalVariable *GV);
+
   /// Emit the section with compressed function names.
   void emitNameData();
 
+  /// Emit the section with compressed vtable names.
+  void emitVTableNames();
+
   /// Emit value nodes section for value profiling.
   void emitVNodes();
 
@@ -740,6 +755,15 @@ bool InstrLowerer::lower() {
     }
   }
 
+  if (EnableVTableValueProfiling) {
+    for (GlobalVariable &GV : M.globals()) {
+      // Global variables with type metadata are virtual table variables.
+      if (GV.hasMetadata(LLVMContext::MD_type)) {
+        getOrCreateVTableProfData(&GV);
+      }
+    }
+  }
+
   for (Function &F : M)
     MadeChange |= lowerIntrinsics(&F);
 
@@ -753,6 +777,7 @@ bool InstrLowerer::lower() {
 
   emitVNodes();
   emitNameData();
+  emitVTableNames();
 
   // Emit runtime hook for the cases where the target does not unconditionally
   // require pulling in profile runtime, and coverage is enabled on code that is
@@ -1220,6 +1245,129 @@ void InstrLowerer::maybeSetComdat(GlobalVariable *GV, Function *Fn,
     GV->setLinkage(GlobalValue::InternalLinkage);
 }
 
+/// Decide whether the address of the vtable \p GV may be stored in its
+/// profile data record.
+///
+/// The address is never recorded when profile data is not referenced by code.
+/// Otherwise it is recorded for every vtable except one that has local
+/// linkage and lives in a COMDAT.
+static inline bool shouldRecordVTableAddr(GlobalVariable *GV) {
+  if (!profDataReferencedByCode(*GV->getParent()))
+    return false;
+
+  // Profile data must not reference internal symbols in a COMDAT. Any other
+  // combination of linkage and COMDAT membership keeps the real address.
+  const bool IsLocalInComdat = GV->hasLocalLinkage() && GV->hasComdat();
+  return !IsLocalInComdat;
+}
+
+// FIXME: Does symbolic relocation from 'getFuncAddrForProfData' matter here?
+/// Build the constant that is stored as the vtable address in `__profvt_`:
+/// the vtable \p GV itself (cast to an unqualified pointer) when its address
+/// should be recorded, and a null pointer otherwise.
+static inline Constant *getVTableAddrForProfData(GlobalVariable *GV) {
+  auto *PtrTy = PointerType::getUnqual(GV->getContext());
+
+  if (shouldRecordVTableAddr(GV))
+    return ConstantExpr::getBitCast(GV, PtrTy);
+
+  // A real address shouldn't be used, so store a nullptr in __profvt_.
+  return ConstantPointerNull::get(PtrTy);
+}
+
+/// Create the `VTableProfData` variable for the vtable \p GV, unless \p GV is
+/// skipped or already has one.
+///
+/// Declarations, available_externally definitions, and variables whose names
+/// start with "llvm.", "__llvm" or "__prof" are skipped. For every other
+/// vtable the new variable is recorded in VTableDataMap, \p GV is appended to
+/// ReferencedVTables, and the variable is appended to UsedVars.
+void InstrLowerer::getOrCreateVTableProfData(GlobalVariable *GV) {
+  assert(!DebugInfoCorrelate &&
+         "Value profiling is not supported with lightweight instrumentation");
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return;
+
+  if (GV->getName().starts_with("llvm.") ||
+      GV->getName().starts_with("__llvm") ||
+      GV->getName().starts_with("__prof"))
+    return;
+
+  // VTableProfData already created
+  auto It = VTableDataMap.find(GV);
+  if (It != VTableDataMap.end() && It->second)
+    return;
+
+  GlobalValue::LinkageTypes Linkage = GV->getLinkage();
+  GlobalValue::VisibilityTypes Visibility = GV->getVisibility();
+
+  // This is to keep consistent with per-function profile data
+  // for correctness.
+  if (TT.isOSBinFormatXCOFF()) {
+    Linkage = GlobalValue::InternalLinkage;
+    Visibility = GlobalValue::DefaultVisibility;
+  }
+
+  LLVMContext &Ctx = M.getContext();
+  Type *DataTypes[] = {
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+  };
+
+  auto *DataTy = StructType::get(Ctx, ArrayRef(DataTypes));
+
+  // The locals below are used by the INSTR_PROF_VTABLE_DATA macro (presumably
+  // by name in its Init expressions), so they are not renamed.
+  Constant *VTableAddr = getVTableAddrForProfData(GV);
+  const std::string PGOVTableName = getPGOName(*GV);
+  // Record the length of the vtable. This is needed since vtable pointers
+  // loaded from C++ objects might be from the middle of a vtable definition.
+  uint32_t VTableSizeVal =
+      M.getDataLayout().getTypeAllocSize(GV->getValueType());
+
+  Constant *DataVals[] = {
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Init,
+#include "llvm/ProfileData/InstrProfData.inc"
+  };
+
+  std::string VarName = getInstrProfVTableVarPrefix().str() + PGOVTableName;
+  auto *Data =
+      new GlobalVariable(M, DataTy, false /* constant */, Linkage,
+                         ConstantStruct::get(DataTy, DataVals), VarName);
+
+  Data->setVisibility(Visibility);
+  Data->setSection(getInstrProfSectionName(IPSK_vtab, TT.getObjectFormat()));
+  Data->setAlignment(Align(8));
+
+  const bool NeedComdat = needsComdatForCounter(*GV, M);
+
+  // ProfVar is the data structure that records vtable information.
+  // Place the global variable for per-vtable profile data in a comdat group
+  // if the associated vtable definition is a COMDAT. This makes sure only one
+  // copy of the variable for the vtable will be emitted after linking.
+  // The parameter is deliberately not called GV so that it does not shadow
+  // the vtable variable passed to this function.
+  auto MaybeSetComdat = [&](GlobalVariable *ProfVar, StringRef GroupName) {
+    bool UseComdat = (NeedComdat || TT.isOSBinFormatELF());
+    if (!UseComdat)
+      return;
+
+    // Create a new comdat group using the name of the global variable as
+    // opposed to using the comdat group of the vtable.
+    Comdat *C = M.getOrInsertComdat(GroupName);
+    // For ELF, when not using COMDAT, put the vtable profile data into a
+    // nodeduplicate COMDAT which is lowered to a zero-flag section group.
+    // This allows -z start-stop-gc to discard the entire group when the
+    // vtable def is discarded.
+    if (!NeedComdat)
+      C->setSelectionKind(Comdat::NoDeduplicate);
+    ProfVar->setComdat(C);
+    // COFF doesn't allow the comdat group leader to have private linkage, so
+    // upgrade private linkage to internal linkage to produce a symbol table
+    // entry.
+    if (TT.isOSBinFormatCOFF() && ProfVar->hasPrivateLinkage())
+      ProfVar->setLinkage(GlobalValue::InternalLinkage);
+  };
+
+  MaybeSetComdat(Data, Data->getName());
+
+  VTableDataMap[GV] = Data;
+
+  ReferencedVTables.push_back(GV);
+
+  // VTable <Hash, Addr> is used by runtime but not referenced by other
+  // sections. Conservatively mark it linker retained.
+  UsedVars.push_back(Data);
+}
+
 GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
                                                   InstrProfSectKind IPSK) {
   GlobalVariable *NamePtr = Inc->getName();
@@ -1633,6 +1781,31 @@ void InstrLowerer::emitNameData() {
     NamePtr->eraseFromParent();
 }
 
+/// Emit a private constant holding the collected PGO names of every vtable in
+/// ReferencedVTables, placed in the IPSK_vname section and kept alive through
+/// UsedVars. Does nothing when vtable value profiling is disabled or no
+/// vtable was recorded.
+void InstrLowerer::emitVTableNames() {
+  if (!EnableVTableValueProfiling || ReferencedVTables.empty())
+    return;
+
+  // Gather the PGO names of the referenced vtables into one string; the
+  // string is compressed when DoInstrProfNameCompression is set.
+  std::string NameBlob;
+  if (Error Err = collectVTableStrings(ReferencedVTables, NameBlob,
+                                       DoInstrProfNameCompression))
+    report_fatal_error(Twine(toString(std::move(Err))), false);
+
+  // The blob is stored as raw bytes without a trailing null.
+  auto *NameBlobInit = ConstantDataArray::getString(
+      M.getContext(), StringRef(NameBlob), false /* AddNull */);
+
+  auto *VNamesVar = new GlobalVariable(
+      M, NameBlobInit->getType(), true /* constant */,
+      GlobalValue::PrivateLinkage, NameBlobInit,
+      getInstrProfVTableNamesVarName());
+  VNamesVar->setSection(
+      getInstrProfSectionName(IPSK_vname, TT.getObjectFormat()));
+  VNamesVar->setAlignment(Align(1));
+
+  // Make VTableNames linker retained.
+  UsedVars.push_back(VNamesVar);
+}
+
 void InstrLowerer::emitRegistration() {
   if (!needsRuntimeRegistrationOfSectionRange(TT))
     return;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c20fc942eaf0d..f1aa17de42933 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -327,6 +327,11 @@ extern cl::opt<PGOViewCountsType> PGOViewCounts;
 // Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
 extern cl::opt<std::string> ViewBlockFreqFuncName;
 
+extern cl::opt<bool> DebugInfoCorrelate;
+
+// Command line option to enable vtable value profiling. Defined in
+// ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=
+extern cl::opt<bool> EnableVTableValueProfiling;
 extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
 } // namespace llvm
 
@@ -581,6 +586,8 @@ template <class Edge, class BBInfo> class FuncPGOInstrumentation {
       NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
       NumOfPGOBB += MST.bbInfoSize();
       ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget);
+      if (EnableVTableValueProfiling)
+        ValueSites[IPVK_VTableTarget] = VPC.get(IPVK_VTableTarget);
     } else {
       NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
       NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index 3a129de1acd02..96b21301ce676 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -90,9 +90,39 @@ public:
   }
 };
 
+///-------------------- VirtualTableValueProfilingPlugin ----------------------
+/// Value profiling plugin for IPVK_VTableTarget. For each instruction returned
+/// by findVTableAddrs(F) it records a candidate whose value and annotated
+/// instruction are that instruction, together with the point at which the
+/// value profiling intrinsic is to be inserted.
+class VTableProfilingPlugin {
+  Function &F;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_VTableTarget;
+
+  /// \p TLI is accepted but not used by this plugin.
+  VTableProfilingPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
+
+  /// Append one CandidateInfo to \p Candidates for every vtable address
+  /// instruction in F that has a usable insertion point after it.
+  void run(std::vector<CandidateInfo> &Candidates) {
+    std::vector<Instruction *> Result = findVTableAddrs(F);
+    for (Instruction *I : Result) {
+      Instruction *InsertPt = I->getNextNonDebugInstruction();
+      // When finding an insertion point, keep PHI and EH pad instructions
+      // before vp intrinsics. This is similar to
+      // `BasicBlock::getFirstInsertionPt`.
+      // Only the type is tested here, so isa<> is used rather than dyn_cast<>.
+      while (InsertPt && (isa<PHINode>(InsertPt) || InsertPt->isEHPad()))
+        InsertPt = InsertPt->getNextNonDebugInstruction();
+      // Skip instrumenting the value if no insertion point was found, i.e.
+      // there is no suitable instruction left after I.
+      // FIXME: Set InsertPt to the end of basic block to instrument the value
+      // if InsertPt is the last instruction.
+      if (InsertPt == nullptr)
+        continue;
+
+      Instruction *AnnotatedInst = I;
+      Candidates.emplace_back(CandidateInfo{I, InsertPt, AnnotatedInst});
+    }
+  }
+};
+
 ///----------------------- Registration of the plugins -------------------------
 /// For now, registering a plugin with the ValueProfileCollector is done by
 /// adding the plugin type to the VP_PLUGIN_LIST macro.
-#define VP_PLUGIN_LIST           \
-    MemIntrinsicPlugin,          \
-    IndirectCallPromotionPlugin
+#define VP_PLUGIN_LIST                                                         \
+  MemIntrinsicPlugin, IndirectCallPromotionPlugin, VTableProfilingPlugin
diff --git a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
new file mode 100644
index 0000000000000..ba3ce9a75ee83
--- /dev/null
+++ b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
@@ -0,0 +1,74 @@
+; Promote at most one function and annotate at most one vtable.
+; As a result, only one value (of each relevant kind) shows up in the function
+; summary.
+
+; RUN: opt -module-summary -icp-max-num-vtables=1 -icp-max-prom=1 %s -o %t.o
+
+; RUN: llvm-bcanalyzer -dump %t.o | FileCheck %s
+
+; RUN: llvm-dis -o - %t.o | FileCheck %s --check-prefix=DIS
+; Round trip it through llvm-as
+; RUN: llvm-dis -o - %t.o | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=DIS
+
+; CHECK: <GLOBALVAL_SUMMARY_BLOCK
+; CHECK-NEXT:   <VERSION op0=9/>
+; CHECK-NEXT:   <FLAGS op0=0/>
+; The `VALUE_GUID` below represents the "_ZTV4Base" referenced by the instruction
+; that loads vtable pointers.
+; CHECK-NEXT: <VALUE_GUID op0=21 op1=1960855528937986108/>
+; The `VALUE_GUID` below represents the "_ZN4Base4funcEv" referenced by the
+; indirect call instruction.
+; CHECK-NEXT:      <VALUE_GUID op0=20 op1=5459407273543877811/>
+; NOTE: vtables and functions from the Derived class are dropped because
+; `-icp-max-num-vtables` and `-icp-max-prom` are both set to one.
+; <PERMODULE_PROFILE> has the format [valueid, flags, instcount, funcflags,
+;                                     numrefs, rorefcnt, worefcnt,
+;                                     m x valueid,
+;                                     n x (valueid, hotness+tailcall)]
+; CHECK-NEXT:   <PERMODULE_PROFILE abbrevid=4 op0=0 op1=0 op2=4 op3=256 op4=1 op5=1 op6=0 op7=21 op8=20 op9=3/>
+; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The function has one BB and an entry count of 150, so the BB is hot according
+; to ProfileSummary, which is reflected in the bitcode (see llvm-dis output).
+define i32 @_Z4testP4Base(ptr %0) !prof !15 {
+  %2 = load ptr, ptr %0, !prof !16
+  %3 = load ptr, ptr %2
+  %4 = tail call i32 %3(ptr %0), !prof !17
+  ret i32 %4
+}
+
+!llvm.module.flags = !{!1}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 200}
+!6 = !{!"MaxInternalCount", i64 200}
+!7 = !{!"MaxFunctionCount", i64 200}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 990000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+
+!15 = !{!"function_entry_count", i32 150}
+; 1960855528937986108 is the MD5 hash of _ZTV4Base, and
+; 13870436605473471591 is the MD5 hash of _ZTV7Derived
+!16 = !{!"VP", i32 2, i64 150, i64 1960855528937986108, i64 100, i64 13870436605473471591, i64 50}
+; 5459407273543877811 is the MD5 hash of _ZN4Base4funcEv, and
+; 6174874150489409711 is the MD5 hash of  _ZN7Derived4funcEv
+!17 = !{!"VP", i32 0, i64 150, i64 5459407273543877811, i64 100, i64 6174874150489409711, i64 50}
+
+; ModuleSummaryIndex stores the <guid, global-value summary> map in a std::map,
+; so global value summaries are printed in increasing order of the gv's guid.
+; DIS: ^0 = module: (path: "{{.*}}", hash: (0, 0, 0, 0, 0))
+; DIS: ^1 = gv: (guid: 1960855528937986108)
+; DIS: ^2 = gv: (guid: 5459407273543877811)
+; DIS: ^3 = gv: (name: "_Z4testP4Base", summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 0, canAutoHide: 0), insts: 4, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 0, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 1, mustBeUnreachable: 0), calls: ((callee: ^2, hotness: hot)), refs: (readonly ^1)))) ; guid = 15857150948103218965
+; DIS: ^4 = blockcount: 0
diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
index bbf895ea4b34e..08cbcaa962b76 100644
--- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
@@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu"
 
 @__profn_foo = private constant [3 x i8] c"foo"
 ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
-; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
+; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
+; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
 @__profn_bar = private constant [3 x i8] c"bar"
 ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
-; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
+; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
+; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
 
 ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names"
 ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames"
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw
index 5efda10bb98a941c04b6846db05d3691bc36aac0..5d96ba8ac220508002ae9a7cdb0beb13e0a25144 100644
GIT binary patch
delta 133
zcmbQhvVeuNu_!ISs37M**F;W##g0c6JDpbj|Gzn}&24We0|sE4n5oVhFbgO-ajG?I
s0+?~tnzsPN04lGLYj at i_S(ee5^#Dj at awy|$1+XHZ93#{)ux=zi0I(w{I{*Lx

delta 117
zcmZ3$GJ%D&u_!ISs37M*=R{6_L67IVA1SZ;|9^9yv+SKv1_s87mFlblGl86mORZTI
rz>KHXyapf!P<n@?i|n1rx{SuG4Iq)psf at D~z=}Xx86W_x8;K796T>9f

diff --git a/llvm/test/Transforms/PGOProfile/Inputs/vtable_prof.profraw b/llvm/test/Transforms/PGOProfile/Inputs/vtable_prof.profraw
new file mode 100644
index 0000000000000000000000000000000000000000..5adeb774cddd6462bd2d3779a96d9ad5a06d5e23
GIT binary patch
literal 656
zcmZoHO3N=Q$obF700xW at ih+R*#(>fsXncDp|G<9;NPf(`(<%&25s=FS6^fqKW9??N
zGj7VQAPc)yoJ=r%1$-<h`e5o|CjGkF``j#L>TQo!%iMorv-oE~?b`s=moX8dAEXai
zZ{I{cNtM}66M)L!U*_`VDuC*1;77F&?qR5f3mFb--sHb`6Q<7rs&4`TeGkj4d7Hnn
z_QUiEK=mykpl|Z0nI4u#dwXE|9{hu7+(1CzJE&)WLEZg=8Nz`12PO`qVd`OYj%~%Q
z?tUW^4?Qnm9Z%ksIv!_J&iI}=libt)X>)<6?kOE_UqcgL?X%uyC1=kZ5ixZVHa)cJ
q8pk>@hHDd5b}&Q$t%KPG4tuD3VBrh17v_JMy|8eE$;12!lLr8f`l*Eg

literal 0
HcmV?d00001

diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
index 8c6942c0f527b..1bad0db1b4762 100644
--- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll
+++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
@@ -13,9 +13,9 @@ $foo = comdat any
 ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
 ; CHECK-NOT: __profn__stdin__foo
 ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
+; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
 ; CHECK-NOT: @foo
-; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
+; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
 ; CHECK: @__llvm_prf_nm
 ; CHECK: @llvm.compiler.used
 
diff --git a/llvm/test/Transforms/PGOProfile/vtable_profile.ll b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
new file mode 100644
index 0000000000000..edc866e4e4efb
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
@@ -0,0 +1,98 @@
+; RUN: opt < %s -passes=pgo-instr-gen -enable-vtable-value-profiling -S | FileCheck %s --check-prefix=GEN
+; RUN: opt < %s -passes=pgo-instr-gen,instrprof -enable-vtable-value-profiling -S | FileCheck %s --check-prefix=LOWER
+
+; __llvm_prf_vnm stores zlib-compressed vtable names.
+; REQUIRES: zlib
+
+source_filename = "vtable_local.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The test IR is generated based on the following C++ program.
+; Base1 has external linkage and Base2 has local linkage.
+; class Derived uses multiple inheritance so its virtual table
+; global variable contains two vtables. func1 is loaded from
+; the vtable compatible with class Base1, and func2 is loaded
+; from the vtable compatible with class Base2.
+
+; class Base1 {
+; public:
+;   virtual int func1(int a) ;
+; };
+;
+; namespace {
+; class Base2 {
+; public:
+;   __attribute__((noinline)) virtual int func2(int a) {
+;     return a;
+;   }
+; };
+; }
+
+; class Derived : public Base1, public Base2 {
+; public:
+;   Derived(int c) : v(c) {}
+; private:
+;   int v;
+; };
+;
+; Derived* createType();
+
+; int func(int a) {
+;   Derived* d = createType();
+;   return d->func2(a) + d->func1(a);
+; }
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV7Derived = constant { [3 x ptr], [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base15func1Ei], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN12_GLOBAL__N_15Base25func2Ei] }, !type !0, !type !3, !type !6, !type !8, !type !10
+ at _ZTV5Base1 = available_externally constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base15func1Ei] }, !type !0
+ at _ZTVN12_GLOBAL__N_15Base2E = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN12_GLOBAL__N_15Base25func2Ei] }, !type !11, !type !8; !vcall_visibility !12
+ at llvm.compiler.used = appending global [1 x ptr] [ptr @_ZTV5Base1], section "llvm.metadata"
+
+; GEN: __llvm_profile_raw_version = comdat any
+; GEN: __llvm_profile_raw_version = hidden constant i64 72057594037927946, comdat
+; GEN: __profn__Z4funci = private constant [8 x i8] c"_Z4funci"
+
+; LOWER: $__profvt__ZTV7Derived = comdat nodeduplicate
+; LOWER: $"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = comdat nodeduplicate
+; LOWER: @__profvt__ZTV7Derived = global { i64, ptr, i32 } { i64 -4576307468236080025, ptr @_ZTV7Derived, i32 48 }, section "__llvm_prf_vtab", comdat, align 8
+; LOWER: @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = internal global { i64, ptr, i32 } { i64 1419990121885302679, ptr @_ZTVN12_GLOBAL__N_15Base2E, i32 24 }, section "__llvm_prf_vtab", comdat, align 8
+; LOWER: @__llvm_prf_vnm = private constant [64 x i8] c"7>x\DA\8B\8F\0A\093wI-\CA,KMa,+IL\CAI\8D\CF\C9ON\CC\D1\CB\C9\B1\8E\07J\FA\19\1A\C5\BB\FB\F8;9\FA\C4\C7\FB\C5\1B\9A:%\16\A7\1A\B9\02\00\19:\12o", section "__llvm_prf_vtabnames", align 1
+; LOWER: @llvm.used = appending global [5 x ptr] [ptr @__profvt__ZTV7Derived, ptr @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E", ptr @__llvm_prf_vnodes, ptr @__llvm_prf_nm, ptr @__llvm_prf_vnm], section "llvm.metadata"
+
+define i32 @_Z4funci(i32 %a) {
+entry:
+  %call = call ptr @_Z10createTypev()
+  %add.ptr = getelementptr inbounds i8, ptr %call, i64 8
+  %vtable = load ptr, ptr %add.ptr
+; GEN: [[P1:%[0-9]+]] = ptrtoint ptr %vtable to i64
+; GEN: call void @llvm.instrprof.value.profile(ptr @__profn__Z4funci, i64 [[CFGHash:[0-9]+]], i64 [[P1]], i32 2, i32 0)
+; LOWER: [[P1:%[0-9]+]] = ptrtoint ptr %vtable to i64
+; LOWER: call void @__llvm_profile_instrument_target(i64 [[P1]], ptr @__profd__Z4funci, i32 2)
+  %vfunc1 = load ptr, ptr %vtable
+  %call1 = call i32 %vfunc1(ptr %add.ptr, i32 %a)
+  %vtable2 = load ptr, ptr %call
+; GEN: [[P2:%[0-9]+]] = ptrtoint ptr %vtable2 to i64
+; GEN: call void @llvm.instrprof.value.profile(ptr @__profn__Z4funci, i64 [[CFGHash]], i64 [[P2]], i32 2, i32 1)
+; LOWER: [[P2:%[0-9]+]] = ptrtoint ptr %vtable2 to i64
+; LOWER: call void @__llvm_profile_instrument_target(i64 [[P2]], ptr @__profd__Z4funci, i32 3)
+  %vfunc2 = load ptr, ptr %vtable2
+  %call4 = call i32 %vfunc2(ptr %call, i32 %a)
+  %add = add nsw i32 %call1, %call4
+  ret i32 %add
+}
+
+declare ptr @_Z10createTypev()
+declare i32 @_ZN12_GLOBAL__N_15Base25func2Ei(ptr %this, i32 %a)
+declare i32 @_ZN5Base15func1Ei(ptr, i32)
+
+!0 = !{i64 16, !"_ZTS5Base1"}
+!3 = !{i64 16, !"_ZTS7Derived"}
+!6 = !{i64 40, !7}
+!7 = distinct !{}
+!8 = !{i64 16, !9}
+!9 = distinct !{}
+!10 = !{i64 40, !9}
+!11 = !{i64 16, !7}
diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw
index 9cd225587c92511e99f3497ce1d5f47c6fc5f0af..a5dcc9fb22e2e125eccd0ad52a509a84e218781a 100644
GIT binary patch
delta 40
ycmV+ at 0N4NE5AY8OfpTVVa&T<_3Xus<4&W)m$E2$N|IVI0I9pYdP6HaTaBv5DToMxi

delta 39
vcmeys|A3#fu_!ISs37M*=R{6_K?|$bHJ=*(|L<GyrHQwmfq`*jWjQ+lUJ(&8

diff --git a/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw b/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw
index 9966729d92ddc33bf89eeb3fee87215bbabbbef1..4d36ffcf5e05b084cf0d1e04fe3933f80b0b1749 100644
GIT binary patch
delta 40
ycmV+ at 0N4Mp55NxzfpTVVa&T<_3Xus<4&eFQuj8rz|DDYvP#jj1P6HaTa6kus4H8fQ

delta 39
vcmX at Wzk#2#u_!ISs37M*=R{6_L5r?)Gq*SV|KArNnC4N>z`(e%(w!XuMI#TR

diff --git a/llvm/test/tools/llvm-profdata/Inputs/update_vtable_value_prof_inputs.sh b/llvm/test/tools/llvm-profdata/Inputs/update_vtable_value_prof_inputs.sh
new file mode 100755
index 0000000000000..89c3e642ac7ef
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/Inputs/update_vtable_value_prof_inputs.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# The path to the clang++ binary is required as the first argument.
+if [ $# -lt 1 ]; then
+  echo "Path to clang++ required!"
+  echo "Usage: update_vtable_value_prof_inputs.sh /path/to/updated/clang++"
+  exit 1
+else
+  CLANG=$1
+fi
+
+
+# Remember current directory.
+CURDIR=$PWD
+
+# Allows the script to be invoked from other directories.
+# Every expansion is quoted so that a script path containing spaces resolves.
+OUTDIR=$(dirname "$(realpath -s "$0")")
+echo "$OUTDIR"
+
+# Stop if the directory cannot be entered, so that no files are generated in
+# the wrong place.
+cd "$OUTDIR" || exit 1
+
+# vtable_prof.cc has the following class hierarchy:
+# class Base
+# ├── class Derived1
+# └── class Derived2
+# Derived1 is a class in the global namespace and Derived2 is in anonymous
+# namespace for test coverage. Overridden virtual methods are annotated as
+# `noinline` so the callsite remains indirect calls for testing purposes.
+cat > vtable_prof.cc << EOF
+#include <cstdlib>
+#include <cstdio>
+
+class Base {
+ public:
+  virtual int func1(int a, int b) = 0;
+  virtual int func2(int a, int b) = 0;
+};
+
+class Derived1 : public Base {
+    public:
+    __attribute__((noinline))
+    int func1(int a, int b) override
+    {
+        return a + b;
+    }
+
+    __attribute__((noinline))
+    int func2(int a, int b) override {
+        return a * b;
+    }
+};
+
+namespace {
+class Derived2 : public Base {
+    public:
+    __attribute__((noinline))
+    int func1(int a, int b) override {
+        return a - b;
+    }
+
+    __attribute__((noinline))
+    int func2(int a, int b) override {
+        return a * (a - b);
+    }
+};
+}  // namespace
+
+__attribute__((noinline)) Base* createType(int a) {
+    Base* base = nullptr;
+    if (a % 4 == 0)
+      base = new Derived1();
+    else
+      base = new Derived2();
+    return base;
+}
+
+
+int main(int argc, char** argv) {
+    int sum = 0;
+    for (int i = 0; i < 1000; i++) {
+        int a = rand();
+        int b = rand();
+        Base* ptr = createType(i);
+        sum += ptr->func1(a, b) + ptr->func2(b, a);
+    }
+    printf("sum is %d\n", sum);
+    return 0;
+}
+EOF
+
+
+# Clean up temporary files on exit and return to original directory.
+# CURDIR is quoted so that a starting directory containing spaces is restored.
+cleanup() {
+  rm -f vtable_prof
+  rm -f vtable_prof.cc
+  cd "$CURDIR"
+}
+trap cleanup EXIT
+
+# FLAGS is expanded unquoted on purpose so that it splits into separate
+# compiler arguments; the compiler path itself is quoted.
+FLAGS="-fuse-ld=lld -O2 -g -fprofile-generate=. -mllvm -enable-vtable-value-profiling"
+
+# Do not try to run the binary if the build failed.
+"${CLANG}" ${FLAGS} vtable_prof.cc -o vtable_prof || exit 1
+env LLVM_PROFILE_FILE=vtable-value-prof-basic.profraw ./vtable_prof
diff --git a/llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof-basic.profraw b/llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof-basic.profraw
new file mode 100644
index 0000000000000000000000000000000000000000..322c8dcd73f935564ca6775962dfa7cbfbdbeda6
GIT binary patch
literal 960
zcmZoHO3N=Q$obF300xW at ih+R*#(>fsXnb^T5>(*8e+cNvxiv$7)&KuHuBfD62xVZf
zg~~Ib(b2Pdtlf-u#!a~uWMQ|8lL at 9DX8r>{Rj_^-AEqD1|8=$Zxmn88+a9f!x&Opw
z at lSx-50h8mhv;8$5<-LY!!Srq-$Xr0mDx=oZSOC0`En(o>30x7wI3cpPzx3^9MrtY
zfA1ztzXqEA00R1p<k**G8rrkL^ndsVF#u{jg8~8l5A7N^IQjV|!u0Pz({DgP|2t at a
z{(@Tgf*Hbq`4c7%qhab{^fJcyid)_NMkXG5UcNe>yeoA)&ZeC4J#!|xr~lLD0#Dsj
zI^Mp9CcfHdz0FF_o;f07%G5b+s*2jvCC9`?o-Ti>svgZODk*hz_wOIpmcNc1|9w>Y
z&=v_r9wlGy6JiY2N)|B?_Z(ljx&GM=lcpW at koZu5y4Nb%QFreJNezYuG;wQ?v$#0n
zPDs(w_VhXB*{iQ{%Kx&r-?lz>uU}~PUXj|##Nd5hxD9Cb3Lu81HE?=?a$xBJ79Ozp
jfrSSwzG3pP at Q2BR%>tPXHSYw}KQQ$IP=CPG6Oso21zGjH

literal 0
HcmV?d00001

diff --git a/llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof.proftext b/llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof.proftext
new file mode 100644
index 0000000000000..ec85dc4c3b12f
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/Inputs/vtable-value-prof.proftext
@@ -0,0 +1,73 @@
+# IR level Instrumentation Flag
+:ir
+/path/to/vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func1Eii
+# Func Hash:
+742261418966908927
+# Num Counters:
+1
+# Counter Values:
+750
+
+/path/to/vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func2Eii
+# Func Hash:
+742261418966908927
+# Num Counters:
+1
+# Counter Values:
+750
+
+_Z10createTypei
+# Func Hash:
+146835647075900052
+# Num Counters:
+2
+# Counter Values:
+750
+250
+
+_ZN8Derived15func1Eii
+# Func Hash:
+742261418966908927
+# Num Counters:
+1
+# Counter Values:
+250
+
+_ZN8Derived15func2Eii
+# Func Hash:
+742261418966908927
+# Num Counters:
+1
+# Counter Values:
+250
+
+main
+# Func Hash:
+1124236338992350536
+# Num Counters:
+2
+# Counter Values:
+1000
+1
+# Num Value Kinds:
+2
+# ValueKind = IPVK_IndirectCallTarget:
+0
+# NumValueSites:
+2
+2
+/path/to/vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func1Eii:750
+_ZN8Derived15func1Eii:250
+2
+/path/to/vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func2Eii:750
+_ZN8Derived15func2Eii:250
+# ValueKind = IPVK_VTableTarget:
+2
+# NumValueSites:
+2
+2
+/path/to/vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E:750
+_ZTV8Derived1:250
+2
+/path/to/vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E:750
+_ZTV8Derived1:250
diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
index eda63203a304a..61881b69cfd5c 100644
--- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test
+++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
@@ -10,10 +10,12 @@
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
 // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
 // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes.
 //   2 * 8  binary ID sizes
 // + 2 * 20 binary IDs (of size 20)
@@ -32,6 +34,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Binary IDs - There are only two in this case that are 20 bytes.
 RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test
index 38b838e0d100a..316a9a4c9df4c 100644
--- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test
+++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test
@@ -1,5 +1,5 @@
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -12,6 +12,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Check for a corrupted size being too large past the end of the file.
 RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
index c967e850dbe35..8b686d5c50cb7 100644
--- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
+++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
@@ -10,10 +10,12 @@
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
 // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Data Section
 //
diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
index 2e747f81a6bfa..089afad420622 100644
--- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
+++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
@@ -10,10 +10,12 @@
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
 // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Data Section
 //
diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
index 3c23bc7dd0f7f..e404ba4210cc1 100644
--- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
+++ b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
@@ -10,10 +10,12 @@
 // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
 // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
 // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +28,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Data Section
 //
diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
index 4a5c42843ff4d..ee54bfb978567 100644
--- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
+++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
@@ -1,5 +1,5 @@
 RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
 // We should fail on this because the binary IDs is not a multiple of 8 bytes.
 RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -10,6 +10,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
 
 // Binary IDs - There are only two in this case that are 20 bytes.
 RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
index 2a92575ee3407..dfa163f1f3439 100644
--- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
+++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
@@ -15,6 +15,8 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t
 RUN: printf '\0\0\0\1\0\4\0\0' >> %t
 RUN: printf '\0\0\0\2\0\4\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 
 RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s
 
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
index 8220361df6cfa..63782c8b94d4a 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
@@ -1,5 +1,6 @@
+// Header
 RUN: printf '\377lprofR\201' > %t
-RUN: printf '\0\0\0\0\0\0\0\11' >> %t
+RUN: printf '\0\0\0\0\0\0\0\12' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +13,8 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t
 RUN: printf '\0\0\0\0\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0\2\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 
 RUN: printf '\134\370\302\114\333\030\275\254' >> %t
 RUN: printf '\0\0\0\0\0\0\0\1' >> %t
@@ -20,9 +23,8 @@ RUN: printf '\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\3' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\344\023\165\112\031\035\265\067' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
@@ -31,9 +33,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\2' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\0\0\0\0\0\0\0\023' >> %t
 RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
index 9352ae132380d..e9569bec1178b 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
@@ -1,5 +1,5 @@
 RUN: printf '\201Rforpl\377' > %t
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\2\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t
 RUN: printf '\0\0\0\3\0\0\0\0' >> %t
 RUN: printf '\0\0\0\2\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 
 RUN: printf '\254\275\030\333\114\302\370\134' >> %t
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t
@@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\3\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\067\265\035\031\112\165\023\344' >> %t
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t
@@ -31,9 +32,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0' >> %t
 RUN: printf '\2\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\023\0\0\0\0\0\0\0' >> %t
 RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
index c3e995add6ff2..0bc579eec58ab 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
@@ -1,5 +1,5 @@
 RUN: printf '\377lprofr\201' > %t
-RUN: printf '\0\0\0\0\0\0\0\11' >> %t
+RUN: printf '\0\0\0\0\0\0\0\12' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\2' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t
 RUN: printf '\0\0\0\3\0\4\0\0' >> %t
 RUN: printf '\0\0\0\2\0\4\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 
 RUN: printf '\134\370\302\114\333\030\275\254' >> %t
 RUN: printf '\0\0\0\0\0\0\0\1' >> %t
@@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\3' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\344\023\165\112\031\035\265\067' >> %t
 RUN: printf '\0\0\0\0\0\0\0\02' >> %t
@@ -31,9 +32,8 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\02' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\0\0\0\0\0\0\0\023' >> %t
 RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
index 0b3ef2a89abe5..ca9ea54c3f014 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
@@ -1,5 +1,5 @@
 RUN: printf '\201rforpl\377' > %t
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\2\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t
 RUN: printf '\0\0\4\0\3\0\0\0' >> %t
 RUN: printf '\0\0\4\0\2\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 
 RUN: printf '\254\275\030\333\114\302\370\134' >> %t
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t
@@ -20,9 +22,8 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\3\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\067\265\035\031\112\165\023\344' >> %t
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t
@@ -31,9 +32,8 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\02\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
 RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
 
 RUN: printf '\023\0\0\0\0\0\0\0' >> %t
 RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test
index f4a9aa8e1bbc3..70a4210dea9f8 100644
--- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test
+++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test
@@ -1,5 +1,5 @@
 RUN: printf '\201rforpl\377' > %t-foo.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
@@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
 
 RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
@@ -26,7 +28,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw
 RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw
 
 RUN: printf '\201rforpl\377' > %t-bar.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
@@ -39,6 +41,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw
 RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
 
 RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw
 RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw
diff --git a/llvm/test/tools/llvm-profdata/vtable-value-prof-basic.test b/llvm/test/tools/llvm-profdata/vtable-value-prof-basic.test
new file mode 100644
index 0000000000000..fb070dc97a4d8
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/vtable-value-prof-basic.test
@@ -0,0 +1,124 @@
+To update the inputs used below, run
+Inputs/update_vtable_value_prof_inputs.sh /path/to/updated/clang++
+
+; Raw profiles store zlib-compressed vtable names. The raw profile reader needs
+; to decompress them.
+; REQUIRES: zlib
+
+; RUN: rm -rf %t && mkdir %t && cd %t
+
+Show profile data from raw profiles.
+RUN: llvm-profdata show --function=main --ic-targets --show-vtables %p/Inputs/vtable-value-prof-basic.profraw | FileCheck %s --check-prefix=RAW
+
+Generate indexed profile from raw profile and show the data.
+RUN: llvm-profdata merge %p/Inputs/vtable-value-prof-basic.profraw -o indexed.profdata
+RUN: llvm-profdata show --function=main --ic-targets --show-vtables indexed.profdata | FileCheck %s --check-prefix=INDEXED
+
+Generate text profile from raw profile and show the data.
+RUN: llvm-profdata merge --text %p/Inputs/vtable-value-prof-basic.profraw -o vtable-value-prof-basic.proftext
+RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text vtable-value-prof-basic.proftext | FileCheck %s --check-prefix=ICTEXT 
+
+RAW: Counters:
+RAW-NEXT:  main:
+RAW-NEXT:  Hash: 0x0f9a16fe6d398548
+RAW-NEXT:  Counters: 2
+RAW-NEXT:  Indirect Call Site Count: 2
+RAW-NEXT:  Number of instrumented vtables: 2
+RAW-NEXT:  Indirect Target Results:
+RAW-NEXT:       [  0, _ZN8Derived15func1Eii,        250 ] (25.00%)
+RAW-NEXT:       [  0, {{.*}}vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func1Eii,        750 ] (75.00%)
+RAW-NEXT:       [  1, _ZN8Derived15func2Eii,        250 ] (25.00%)
+RAW-NEXT:       [  1, {{.*}}vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func2Eii,        750 ] (75.00%)
+RAW-NEXT:  VTable Results:
+RAW-NEXT:       [  0, _ZTV8Derived1,        250 ] (25.00%)
+RAW-NEXT:       [  0, {{.*}}vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+RAW-NEXT:       [  1, _ZTV8Derived1,        250 ] (25.00%)
+RAW-NEXT:       [  1, {{.*}}vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+RAW-NEXT: Instrumentation level: IR  entry_first = 0
+RAW-NEXT: Functions shown: 1
+RAW-NEXT: Total functions: 6
+RAW-NEXT: Maximum function count: 1000
+RAW-NEXT: Maximum internal block count: 250
+RAW-NEXT: Statistics for indirect call sites profile:
+RAW-NEXT:   Total number of sites: 2
+RAW-NEXT:   Total number of sites with values: 2
+RAW-NEXT:   Total number of profiled values: 4
+RAW-NEXT:   Value sites histogram:
+RAW-NEXT:         NumTargets, SiteCount
+RAW-NEXT:         2, 2
+RAW-NEXT: Statistics for vtable profile:
+RAW-NEXT:   Total number of sites: 2
+RAW-NEXT:   Total number of sites with values: 2
+RAW-NEXT:   Total number of profiled values: 4
+RAW-NEXT:   Value sites histogram:
+RAW-NEXT:         NumTargets, SiteCount
+RAW-NEXT:         2, 2
+
+
+INDEXED:      Counters:
+INDEXED-NEXT:   main:
+INDEXED-NEXT:     Hash: 0x0f9a16fe6d398548
+INDEXED-NEXT:     Counters: 2
+INDEXED-NEXT:     Indirect Call Site Count: 2
+INDEXED-NEXT:     Number of instrumented vtables: 2
+INDEXED-NEXT:     Indirect Target Results:
+INDEXED-NEXT:         [  0, {{.*}}vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func1Eii,        750 ] (75.00%)
+INDEXED-NEXT:         [  0, _ZN8Derived15func1Eii,        250 ] (25.00%)
+INDEXED-NEXT:         [  1, {{.*}}vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func2Eii,        750 ] (75.00%)
+INDEXED-NEXT:         [  1, _ZN8Derived15func2Eii,        250 ] (25.00%)
+INDEXED-NEXT:     VTable Results:
+INDEXED-NEXT:         [  0, {{.*}}vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+INDEXED-NEXT:         [  0, _ZTV8Derived1,        250 ] (25.00%)
+INDEXED-NEXT:         [  1, {{.*}}vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+INDEXED-NEXT:         [  1, _ZTV8Derived1,        250 ] (25.00%)
+INDEXED-NEXT: Instrumentation level: IR  entry_first = 0
+INDEXED-NEXT: Functions shown: 1
+INDEXED-NEXT: Total functions: 6
+INDEXED-NEXT: Maximum function count: 1000
+INDEXED-NEXT: Maximum internal block count: 250
+INDEXED-NEXT: Statistics for indirect call sites profile:
+INDEXED-NEXT:   Total number of sites: 2
+INDEXED-NEXT:   Total number of sites with values: 2
+INDEXED-NEXT:   Total number of profiled values: 4
+INDEXED-NEXT:   Value sites histogram:
+INDEXED-NEXT:       NumTargets, SiteCount
+INDEXED-NEXT:       2, 2
+INDEXED-NEXT: Statistics for vtable profile:
+INDEXED-NEXT:   Total number of sites: 2
+INDEXED-NEXT:   Total number of sites with values: 2
+INDEXED-NEXT:   Total number of profiled values: 4
+INDEXED-NEXT:   Value sites histogram:
+INDEXED-NEXT:       NumTargets, SiteCount
+INDEXED-NEXT:       2, 2
+
+ICTEXT: :ir
+ICTEXT: main
+ICTEXT: # Func Hash:
+ICTEXT: 1124236338992350536
+ICTEXT: # Num Counters:
+ICTEXT: 2
+ICTEXT: # Counter Values:
+ICTEXT: 1000
+ICTEXT: 1
+ICTEXT: # Num Value Kinds:
+ICTEXT: 2
+ICTEXT: # ValueKind = IPVK_IndirectCallTarget:
+ICTEXT: 0
+ICTEXT: # NumValueSites:
+ICTEXT: 2
+ICTEXT: 2
+ICTEXT: {{.*}}vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func1Eii:750
+ICTEXT: _ZN8Derived15func1Eii:250
+ICTEXT: 2
+ICTEXT: {{.*}}vtable_prof.cc;_ZN12_GLOBAL__N_18Derived25func2Eii:750
+ICTEXT: _ZN8Derived15func2Eii:250
+ICTEXT: # ValueKind = IPVK_VTableTarget:
+ICTEXT: 2
+ICTEXT: # NumValueSites:
+ICTEXT: 2
+ICTEXT: 2
+ICTEXT: {{.*}}vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E:750
+ICTEXT: _ZTV8Derived1:250
+ICTEXT: 2
+ICTEXT: {{.*}}vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E:750
+ICTEXT: _ZTV8Derived1:250
diff --git a/llvm/test/tools/llvm-profdata/vtable-value-prof.proftext b/llvm/test/tools/llvm-profdata/vtable-value-prof.proftext
new file mode 100644
index 0000000000000..38073916ec445
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/vtable-value-prof.proftext
@@ -0,0 +1,16 @@
+# RUN: llvm-profdata show --function=main --show-vtables %p/Inputs/vtable-value-prof.proftext | FileCheck %s
+
+# CHECK: Counters:
+# CHECK:  main:
+# CHECK:    Hash: 0x0f9a16fe6d398548
+# CHECK:    Counters: 2
+# CHECK:    VTable Results:
+# CHECK:	       [  0, /path/to/vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+# CHECK:	       [  0, _ZTV8Derived1,        250 ] (25.00%)
+# CHECK:	       [  1, /path/to/vtable_prof.cc;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+# CHECK:	       [  1, _ZTV8Derived1,        250 ] (25.00%)
+# CHECK: Instrumentation level: IR  entry_first = 0
+# CHECK: Functions shown: 1
+# CHECK: Total functions: 6
+# CHECK: Maximum function count: 1000
+# CHECK: Maximum internal block count: 250
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 7754ca3612572..9fb56b8e2647e 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -345,6 +345,9 @@ cl::opt<bool> ShowIndirectCallTargets(
     "ic-targets", cl::init(false),
     cl::desc("Show indirect call site target values for shown functions"),
     cl::sub(ShowSubcommand));
+cl::opt<bool> ShowVTables("show-vtables", cl::init(false),
+                          cl::desc("Show vtable names for shown functions"),
+                          cl::sub(ShowSubcommand));
 cl::opt<bool> ShowMemOPSizes(
     "memop-sizes", cl::init(false),
     cl::desc("Show the profiled sizes of the memory intrinsic calls "
@@ -722,6 +725,13 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
     });
   }
 
+  const InstrProfSymtab &symtab = Reader->getSymtab();
+  const auto &VTableNames = symtab.getVTableNames();
+
+  for (const auto &kv : VTableNames) {
+    WC->Writer.addVTableName(kv.getKey());
+  }
+
   if (Reader->hasTemporalProfile()) {
     auto &Traces = Reader->getTemporalProfTraces(Input.Weight);
     if (!Traces.empty())
@@ -1353,8 +1363,8 @@ remapSamples(const sampleprof::FunctionSamples &Samples,
                           BodySample.second.getSamples());
     for (const auto &Target : BodySample.second.getCallTargets()) {
       Result.addCalledTargetSamples(BodySample.first.LineOffset,
-                                    MaskedDiscriminator,
-                                    Remapper(Target.first), Target.second);
+                                    MaskedDiscriminator, Remapper(Target.first),
+                                    Target.second);
     }
   }
   for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
@@ -2817,6 +2827,10 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) {
         OS << "    Indirect Call Site Count: "
            << Func.getNumValueSites(IPVK_IndirectCallTarget) << "\n";
 
+      if (ShowVTables)
+        OS << "    Number of instrumented vtables: "
+           << Func.getNumValueSites(IPVK_VTableTarget) << "\n";
+
       uint32_t NumMemOPCalls = Func.getNumValueSites(IPVK_MemOPSize);
       if (ShowMemOPSizes && NumMemOPCalls > 0)
         OS << "    Number of Memory Intrinsics Calls: " << NumMemOPCalls
@@ -2838,6 +2852,13 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) {
                               &(Reader->getSymtab()));
       }
 
+      if (ShowVTables) {
+        OS << "    VTable Results:\n";
+        traverseAllValueSites(Func, IPVK_VTableTarget,
+                              VPStats[IPVK_VTableTarget], OS,
+                              &(Reader->getSymtab()));
+      }
+
       if (ShowMemOPSizes && NumMemOPCalls > 0) {
         OS << "    Memory Intrinsic Size Results:\n";
         traverseAllValueSites(Func, IPVK_MemOPSize, VPStats[IPVK_MemOPSize], OS,
@@ -2886,6 +2907,11 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) {
                         VPStats[IPVK_IndirectCallTarget]);
   }
 
+  if (ShownFunctions && ShowVTables) {
+    OS << "Statistics for vtable profile:\n";
+    showValueSitesStats(OS, IPVK_VTableTarget, VPStats[IPVK_VTableTarget]);
+  }
+
   if (ShownFunctions && ShowMemOPSizes) {
     OS << "Statistics for memory intrinsic calls sizes profile:\n";
     showValueSitesStats(OS, IPVK_MemOPSize, VPStats[IPVK_MemOPSize]);
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index 8ffb68de7a2d2..b007a374c2cf2 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -638,32 +638,78 @@ TEST_F(InstrProfTest, test_irpgo_read_deprecated_names) {
       Succeeded());
 }
 
+// callee1 to callee6 are from vtable1 to vtable6 respectively.
 static const char callee1[] = "callee1";
 static const char callee2[] = "callee2";
 static const char callee3[] = "callee3";
 static const char callee4[] = "callee4";
 static const char callee5[] = "callee5";
 static const char callee6[] = "callee6";
+// callee7 and callee8 are not from any vtables.
+static const char callee7[] = "callee7";
+static const char callee8[] = "callee8";
+// 'callee' is primarily used to create multiple-element vtables.
+static const char callee[] = "callee";
+static const uint64_t vtable1[] = {uint64_t(callee), uint64_t(callee1)};
+static const uint64_t vtable2[] = {uint64_t(callee2), uint64_t(callee)};
+static const uint64_t vtable3[] = {
+    uint64_t(callee),
+    uint64_t(callee3),
+};
+static const uint64_t vtable4[] = {uint64_t(callee4), uint64_t(callee)};
+static const uint64_t vtable5[] = {uint64_t(callee5), uint64_t(callee)};
+static const uint64_t vtable6[] = {uint64_t(callee6), uint64_t(callee)};
+
+// Returns the address of the vtable slot that holds the numbered-suffix callee.
+static uint64_t getCalleeAddress(const uint64_t *vtableAddr) {
+  uint64_t CalleeAddr;
+  // The callee with a numbered suffix is the 2nd element in vtable1 and
+  // vtable3, and the 1st element in the rest of the vtables.
+  if (vtableAddr == vtable1 || vtableAddr == vtable3)
+    CalleeAddr = uint64_t(vtableAddr) + 8;
+  else
+    CalleeAddr = uint64_t(vtableAddr);
+  return CalleeAddr;
+}
 
-TEST_P(InstrProfReaderWriterTest, icall_data_read_write) {
+TEST_P(InstrProfReaderWriterTest, icall_and_vtable_data_read_write) {
   NamedInstrProfRecord Record1("caller", 0x1234, {1, 2});
 
-  // 4 value sites.
-  Record1.reserveSites(IPVK_IndirectCallTarget, 4);
-  InstrProfValueData VD0[] = {
-      {(uint64_t)callee1, 1}, {(uint64_t)callee2, 2}, {(uint64_t)callee3, 3}};
-  Record1.addValueData(IPVK_IndirectCallTarget, 0, VD0, 3, nullptr);
-  // No value profile data at the second site.
-  Record1.addValueData(IPVK_IndirectCallTarget, 1, nullptr, 0, nullptr);
-  InstrProfValueData VD2[] = {{(uint64_t)callee1, 1}, {(uint64_t)callee2, 2}};
-  Record1.addValueData(IPVK_IndirectCallTarget, 2, VD2, 2, nullptr);
-  InstrProfValueData VD3[] = {{(uint64_t)callee1, 1}};
-  Record1.addValueData(IPVK_IndirectCallTarget, 3, VD3, 1, nullptr);
+  // 4 indirect call value sites.
+  {
+    Record1.reserveSites(IPVK_IndirectCallTarget, 4);
+    InstrProfValueData VD0[] = {
+        {(uint64_t)callee1, 1}, {(uint64_t)callee2, 2}, {(uint64_t)callee3, 3}};
+    Record1.addValueData(IPVK_IndirectCallTarget, 0, VD0, 3, nullptr);
+    // No value profile data at the second site.
+    Record1.addValueData(IPVK_IndirectCallTarget, 1, nullptr, 0, nullptr);
+    InstrProfValueData VD2[] = {{(uint64_t)callee1, 1}, {(uint64_t)callee2, 2}};
+    Record1.addValueData(IPVK_IndirectCallTarget, 2, VD2, 2, nullptr);
+    InstrProfValueData VD3[] = {{(uint64_t)callee7, 1}, {(uint64_t)callee8, 2}};
+    Record1.addValueData(IPVK_IndirectCallTarget, 3, VD3, 2, nullptr);
+  }
+
+  // 2 vtable value sites.
+  {
+    InstrProfValueData VD0[] = {
+        {getCalleeAddress(vtable1), 1},
+        {getCalleeAddress(vtable2), 2},
+        {getCalleeAddress(vtable3), 3},
+    };
+    InstrProfValueData VD2[] = {
+        {getCalleeAddress(vtable1), 1},
+        {getCalleeAddress(vtable2), 2},
+    };
+    Record1.addValueData(IPVK_VTableTarget, 0, VD0, 3, nullptr);
+    Record1.addValueData(IPVK_VTableTarget, 2, VD2, 2, nullptr);
+  }
 
   Writer.addRecord(std::move(Record1), getProfWeight(), Err);
   Writer.addRecord({"callee1", 0x1235, {3, 4}}, Err);
   Writer.addRecord({"callee2", 0x1235, {3, 4}}, Err);
   Writer.addRecord({"callee3", 0x1235, {3, 4}}, Err);
+  Writer.addRecord({"callee7", 0x1235, {3, 4}}, Err);
+  Writer.addRecord({"callee8", 0x1235, {3, 4}}, Err);
 
   // Set writer value prof data endianness.
   Writer.setValueProfDataEndianness(getEndianness());
@@ -676,24 +722,66 @@ TEST_P(InstrProfReaderWriterTest, icall_data_read_write) {
 
   Expected<InstrProfRecord> R = Reader->getInstrProfRecord("caller", 0x1234);
   ASSERT_THAT_ERROR(R.takeError(), Succeeded());
+
+  // Test the number of instrumented indirect call sites and the number of
+  // profiled values at each site.
   ASSERT_EQ(4U, R->getNumValueSites(IPVK_IndirectCallTarget));
   EXPECT_EQ(3U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 0));
   EXPECT_EQ(0U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 1));
   EXPECT_EQ(2U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 2));
-  EXPECT_EQ(1U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 3));
+  EXPECT_EQ(2U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 3));
+
+  // Test the number of instrumented vtable sites and the number of profiled
+  // values at each site.
+  ASSERT_EQ(2U, R->getNumValueSites(IPVK_VTableTarget));
+  EXPECT_EQ(3U, R->getNumValueDataForSite(IPVK_VTableTarget, 0));
+  EXPECT_EQ(2U, R->getNumValueDataForSite(IPVK_VTableTarget, 1));
+
+  // First indirect site.
+  {
+    uint64_t TotalC;
+    std::unique_ptr<InstrProfValueData[]> VD =
+        R->getValueForSite(IPVK_IndirectCallTarget, 0, &TotalC);
+
+    EXPECT_EQ(3U * getProfWeight(), VD[0].Count);
+    EXPECT_EQ(2U * getProfWeight(), VD[1].Count);
+    EXPECT_EQ(1U * getProfWeight(), VD[2].Count);
+    EXPECT_EQ(6U * getProfWeight(), TotalC);
+
+    EXPECT_EQ(StringRef((const char *)VD[0].Value, 7), StringRef("callee3"));
+    EXPECT_EQ(StringRef((const char *)VD[1].Value, 7), StringRef("callee2"));
+    EXPECT_EQ(StringRef((const char *)VD[2].Value, 7), StringRef("callee1"));
+  }
 
-  uint64_t TotalC;
-  std::unique_ptr<InstrProfValueData[]> VD =
-      R->getValueForSite(IPVK_IndirectCallTarget, 0, &TotalC);
+  // First vtable site.
+  {
+    uint64_t TotalC;
+    std::unique_ptr<InstrProfValueData[]> VD =
+        R->getValueForSite(IPVK_VTableTarget, 0, &TotalC);
+
+    EXPECT_EQ(3U * getProfWeight(), VD[0].Count);
+    EXPECT_EQ(2U * getProfWeight(), VD[1].Count);
+    EXPECT_EQ(1U * getProfWeight(), VD[2].Count);
+    EXPECT_EQ(6U * getProfWeight(), TotalC);
 
-  EXPECT_EQ(3U * getProfWeight(), VD[0].Count);
-  EXPECT_EQ(2U * getProfWeight(), VD[1].Count);
-  EXPECT_EQ(1U * getProfWeight(), VD[2].Count);
-  EXPECT_EQ(6U * getProfWeight(), TotalC);
+    EXPECT_EQ(VD[0].Value, getCalleeAddress(vtable3));
+    EXPECT_EQ(VD[1].Value, getCalleeAddress(vtable2));
+    EXPECT_EQ(VD[2].Value, getCalleeAddress(vtable1));
+  }
 
-  EXPECT_EQ(StringRef((const char *)VD[0].Value, 7), StringRef("callee3"));
-  EXPECT_EQ(StringRef((const char *)VD[1].Value, 7), StringRef("callee2"));
-  EXPECT_EQ(StringRef((const char *)VD[2].Value, 7), StringRef("callee1"));
+  // Second vtable site.
+  {
+    uint64_t TotalC;
+    std::unique_ptr<InstrProfValueData[]> VD =
+        R->getValueForSite(IPVK_VTableTarget, 1, &TotalC);
+
+    EXPECT_EQ(2U * getProfWeight(), VD[0].Count);
+    EXPECT_EQ(1U * getProfWeight(), VD[1].Count);
+    EXPECT_EQ(3U * getProfWeight(), TotalC);
+
+    EXPECT_EQ(VD[0].Value, getCalleeAddress(vtable2));
+    EXPECT_EQ(VD[1].Value, getCalleeAddress(vtable1));
+  }
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -801,33 +889,53 @@ TEST_P(MaybeSparseInstrProfTest, annotate_vp_data) {
   ASSERT_EQ(1U, ValueData[3].Count);
 }
 
-TEST_P(MaybeSparseInstrProfTest, icall_data_merge) {
+TEST_P(MaybeSparseInstrProfTest, icall_and_vtable_data_merge) {
   static const char caller[] = "caller";
   NamedInstrProfRecord Record11(caller, 0x1234, {1, 2});
   NamedInstrProfRecord Record12(caller, 0x1234, {1, 2});
 
-  // 5 value sites.
-  Record11.reserveSites(IPVK_IndirectCallTarget, 5);
-  InstrProfValueData VD0[] = {{uint64_t(callee1), 1},
-                              {uint64_t(callee2), 2},
-                              {uint64_t(callee3), 3},
-                              {uint64_t(callee4), 4}};
-  Record11.addValueData(IPVK_IndirectCallTarget, 0, VD0, 4, nullptr);
+  // 5 value sites for indirect calls.
+  {
+    Record11.reserveSites(IPVK_IndirectCallTarget, 5);
+    InstrProfValueData VD0[] = {{uint64_t(callee1), 1},
+                                {uint64_t(callee2), 2},
+                                {uint64_t(callee3), 3},
+                                {uint64_t(callee4), 4}};
+    Record11.addValueData(IPVK_IndirectCallTarget, 0, VD0, 4, nullptr);
 
-  // No value profile data at the second site.
-  Record11.addValueData(IPVK_IndirectCallTarget, 1, nullptr, 0, nullptr);
+    // No value profile data at the second site.
+    Record11.addValueData(IPVK_IndirectCallTarget, 1, nullptr, 0, nullptr);
 
-  InstrProfValueData VD2[] = {
-      {uint64_t(callee1), 1}, {uint64_t(callee2), 2}, {uint64_t(callee3), 3}};
-  Record11.addValueData(IPVK_IndirectCallTarget, 2, VD2, 3, nullptr);
+    InstrProfValueData VD2[] = {
+        {uint64_t(callee1), 1}, {uint64_t(callee2), 2}, {uint64_t(callee3), 3}};
+    Record11.addValueData(IPVK_IndirectCallTarget, 2, VD2, 3, nullptr);
 
-  InstrProfValueData VD3[] = {{uint64_t(callee1), 1}};
-  Record11.addValueData(IPVK_IndirectCallTarget, 3, VD3, 1, nullptr);
+    InstrProfValueData VD3[] = {{uint64_t(callee7), 1}, {uint64_t(callee8), 2}};
+    Record11.addValueData(IPVK_IndirectCallTarget, 3, VD3, 2, nullptr);
 
-  InstrProfValueData VD4[] = {{uint64_t(callee1), 1},
-                              {uint64_t(callee2), 2},
-                              {uint64_t(callee3), 3}};
-  Record11.addValueData(IPVK_IndirectCallTarget, 4, VD4, 3, nullptr);
+    InstrProfValueData VD4[] = {
+        {uint64_t(callee1), 1}, {uint64_t(callee2), 2}, {uint64_t(callee3), 3}};
+    Record11.addValueData(IPVK_IndirectCallTarget, 4, VD4, 3, nullptr);
+  }
+  // 3 value sites for vtables.
+  {
+    Record11.reserveSites(IPVK_VTableTarget, 3);
+    InstrProfValueData VD0[] = {{getCalleeAddress(vtable1), 1},
+                                {getCalleeAddress(vtable2), 2},
+                                {getCalleeAddress(vtable3), 3},
+                                {getCalleeAddress(vtable4), 4}};
+    Record11.addValueData(IPVK_VTableTarget, 0, VD0, 4, nullptr);
+
+    InstrProfValueData VD2[] = {{getCalleeAddress(vtable1), 1},
+                                {getCalleeAddress(vtable2), 2},
+                                {getCalleeAddress(vtable3), 3}};
+    Record11.addValueData(IPVK_VTableTarget, 1, VD2, 3, nullptr);
+
+    InstrProfValueData VD4[] = {{getCalleeAddress(vtable1), 1},
+                                {getCalleeAddress(vtable2), 2},
+                                {getCalleeAddress(vtable3), 3}};
+    Record11.addValueData(IPVK_VTableTarget, 3, VD4, 3, nullptr);
+  }
 
   // A different record for the same caller.
   Record12.reserveSites(IPVK_IndirectCallTarget, 5);
@@ -843,11 +951,28 @@ TEST_P(MaybeSparseInstrProfTest, icall_data_merge) {
 
   Record12.addValueData(IPVK_IndirectCallTarget, 3, nullptr, 0, nullptr);
 
-  InstrProfValueData VD42[] = {{uint64_t(callee1), 1},
-                               {uint64_t(callee2), 2},
-                               {uint64_t(callee3), 3}};
+  InstrProfValueData VD42[] = {
+      {uint64_t(callee1), 1}, {uint64_t(callee2), 2}, {uint64_t(callee3), 3}};
   Record12.addValueData(IPVK_IndirectCallTarget, 4, VD42, 3, nullptr);
 
+  // 3 value sites for vtables.
+  {
+    Record12.reserveSites(IPVK_VTableTarget, 3);
+    InstrProfValueData VD0[] = {{getCalleeAddress(vtable2), 5},
+                                {getCalleeAddress(vtable3), 3}};
+    Record12.addValueData(IPVK_VTableTarget, 0, VD0, 2, nullptr);
+
+    InstrProfValueData VD2[] = {{getCalleeAddress(vtable2), 1},
+                                {getCalleeAddress(vtable3), 3},
+                                {getCalleeAddress(vtable4), 4}};
+    Record12.addValueData(IPVK_VTableTarget, 1, VD2, 3, nullptr);
+
+    InstrProfValueData VD4[] = {{getCalleeAddress(vtable1), 1},
+                                {getCalleeAddress(vtable2), 2},
+                                {getCalleeAddress(vtable3), 3}};
+    Record12.addValueData(IPVK_VTableTarget, 3, VD4, 3, nullptr);
+  }
+
   Writer.addRecord(std::move(Record11), Err);
   // Merge profile data.
   Writer.addRecord(std::move(Record12), Err);
@@ -857,53 +982,99 @@ TEST_P(MaybeSparseInstrProfTest, icall_data_merge) {
   Writer.addRecord({callee3, 0x1235, {3, 4}}, Err);
   Writer.addRecord({callee3, 0x1235, {3, 4}}, Err);
   Writer.addRecord({callee4, 0x1235, {3, 5}}, Err);
+  Writer.addRecord({callee7, 0x1235, {3, 5}}, Err);
+  Writer.addRecord({callee8, 0x1235, {3, 5}}, Err);
   auto Profile = Writer.writeBuffer();
   readProfile(std::move(Profile));
 
+  // Test the number of instrumented value sites and the number of profiled
+  // values for each site.
   Expected<InstrProfRecord> R = Reader->getInstrProfRecord("caller", 0x1234);
   EXPECT_THAT_ERROR(R.takeError(), Succeeded());
+  // For indirect calls.
   ASSERT_EQ(5U, R->getNumValueSites(IPVK_IndirectCallTarget));
   ASSERT_EQ(4U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 0));
   ASSERT_EQ(0U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 1));
   ASSERT_EQ(4U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 2));
-  ASSERT_EQ(1U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 3));
+  ASSERT_EQ(2U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 3));
   ASSERT_EQ(3U, R->getNumValueDataForSite(IPVK_IndirectCallTarget, 4));
+  // For vtables.
+  ASSERT_EQ(3U, R->getNumValueSites(IPVK_VTableTarget));
+  ASSERT_EQ(4U, R->getNumValueDataForSite(IPVK_VTableTarget, 0));
+  ASSERT_EQ(4U, R->getNumValueDataForSite(IPVK_VTableTarget, 1));
+  ASSERT_EQ(3U, R->getNumValueDataForSite(IPVK_VTableTarget, 2));
+
+  // Test the merged values for indirect calls.
+  {
+    std::unique_ptr<InstrProfValueData[]> VD =
+        R->getValueForSite(IPVK_IndirectCallTarget, 0);
+    EXPECT_EQ(StringRef((const char *)VD[0].Value, 7), StringRef("callee2"));
+    EXPECT_EQ(7U, VD[0].Count);
+    EXPECT_EQ(StringRef((const char *)VD[1].Value, 7), StringRef("callee3"));
+    EXPECT_EQ(6U, VD[1].Count);
+    EXPECT_EQ(StringRef((const char *)VD[2].Value, 7), StringRef("callee4"));
+    EXPECT_EQ(4U, VD[2].Count);
+    EXPECT_EQ(StringRef((const char *)VD[3].Value, 7), StringRef("callee1"));
+    EXPECT_EQ(1U, VD[3].Count);
+
+    std::unique_ptr<InstrProfValueData[]> VD_2(
+        R->getValueForSite(IPVK_IndirectCallTarget, 2));
+    EXPECT_EQ(StringRef((const char *)VD_2[0].Value, 7), StringRef("callee3"));
+    EXPECT_EQ(6U, VD_2[0].Count);
+    EXPECT_EQ(StringRef((const char *)VD_2[1].Value, 7), StringRef("callee4"));
+    EXPECT_EQ(4U, VD_2[1].Count);
+    EXPECT_EQ(StringRef((const char *)VD_2[2].Value, 7), StringRef("callee2"));
+    EXPECT_EQ(3U, VD_2[2].Count);
+    EXPECT_EQ(StringRef((const char *)VD_2[3].Value, 7), StringRef("callee1"));
+    EXPECT_EQ(1U, VD_2[3].Count);
+
+    std::unique_ptr<InstrProfValueData[]> VD_3(
+        R->getValueForSite(IPVK_IndirectCallTarget, 3));
+    EXPECT_EQ(StringRef((const char *)VD_3[0].Value, 7), StringRef("callee8"));
+    EXPECT_EQ(2U, VD_3[0].Count);
+    EXPECT_EQ(StringRef((const char *)VD_3[1].Value, 7), StringRef("callee7"));
+    EXPECT_EQ(1U, VD_3[1].Count);
+
+    std::unique_ptr<InstrProfValueData[]> VD_4(
+        R->getValueForSite(IPVK_IndirectCallTarget, 4));
+    EXPECT_EQ(StringRef((const char *)VD_4[0].Value, 7), StringRef("callee3"));
+    EXPECT_EQ(6U, VD_4[0].Count);
+    EXPECT_EQ(StringRef((const char *)VD_4[1].Value, 7), StringRef("callee2"));
+    EXPECT_EQ(4U, VD_4[1].Count);
+    EXPECT_EQ(StringRef((const char *)VD_4[2].Value, 7), StringRef("callee1"));
+    EXPECT_EQ(2U, VD_4[2].Count);
+  }
 
-  std::unique_ptr<InstrProfValueData[]> VD =
-      R->getValueForSite(IPVK_IndirectCallTarget, 0);
-  ASSERT_EQ(StringRef((const char *)VD[0].Value, 7), StringRef("callee2"));
-  ASSERT_EQ(7U, VD[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD[1].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(6U, VD[1].Count);
-  ASSERT_EQ(StringRef((const char *)VD[2].Value, 7), StringRef("callee4"));
-  ASSERT_EQ(4U, VD[2].Count);
-  ASSERT_EQ(StringRef((const char *)VD[3].Value, 7), StringRef("callee1"));
-  ASSERT_EQ(1U, VD[3].Count);
-
-  std::unique_ptr<InstrProfValueData[]> VD_2(
-      R->getValueForSite(IPVK_IndirectCallTarget, 2));
-  ASSERT_EQ(StringRef((const char *)VD_2[0].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(6U, VD_2[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD_2[1].Value, 7), StringRef("callee4"));
-  ASSERT_EQ(4U, VD_2[1].Count);
-  ASSERT_EQ(StringRef((const char *)VD_2[2].Value, 7), StringRef("callee2"));
-  ASSERT_EQ(3U, VD_2[2].Count);
-  ASSERT_EQ(StringRef((const char *)VD_2[3].Value, 7), StringRef("callee1"));
-  ASSERT_EQ(1U, VD_2[3].Count);
-
-  std::unique_ptr<InstrProfValueData[]> VD_3(
-      R->getValueForSite(IPVK_IndirectCallTarget, 3));
-  ASSERT_EQ(StringRef((const char *)VD_3[0].Value, 7), StringRef("callee1"));
-  ASSERT_EQ(1U, VD_3[0].Count);
-
-  std::unique_ptr<InstrProfValueData[]> VD_4(
-      R->getValueForSite(IPVK_IndirectCallTarget, 4));
-  ASSERT_EQ(StringRef((const char *)VD_4[0].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(6U, VD_4[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD_4[1].Value, 7), StringRef("callee2"));
-  ASSERT_EQ(4U, VD_4[1].Count);
-  ASSERT_EQ(StringRef((const char *)VD_4[2].Value, 7), StringRef("callee1"));
-  ASSERT_EQ(2U, VD_4[2].Count);
+  // Test the merged values for vtables
+  {
+    auto VD0 = R->getValueForSite(IPVK_VTableTarget, 0);
+    EXPECT_EQ(VD0[0].Value, getCalleeAddress(vtable2));
+    EXPECT_EQ(VD0[0].Count, 7U);
+    EXPECT_EQ(VD0[1].Value, getCalleeAddress(vtable3));
+    EXPECT_EQ(VD0[1].Count, 6U);
+    EXPECT_EQ(VD0[2].Value, getCalleeAddress(vtable4));
+    EXPECT_EQ(VD0[2].Count, 4U);
+    EXPECT_EQ(VD0[3].Value, getCalleeAddress(vtable1));
+    EXPECT_EQ(VD0[3].Count, 1U);
+
+    auto VD1 = R->getValueForSite(IPVK_VTableTarget, 1);
+    EXPECT_EQ(VD1[0].Value, getCalleeAddress(vtable3));
+    EXPECT_EQ(VD1[0].Count, 6U);
+    EXPECT_EQ(VD1[1].Value, getCalleeAddress(vtable4));
+    EXPECT_EQ(VD1[1].Count, 4U);
+    EXPECT_EQ(VD1[2].Value, getCalleeAddress(vtable2));
+    EXPECT_EQ(VD1[2].Count, 3U);
+    EXPECT_EQ(VD1[3].Value, getCalleeAddress(vtable1));
+    EXPECT_EQ(VD1[3].Count, 1U);
+
+    auto VD2 = R->getValueForSite(IPVK_VTableTarget, 2);
+    EXPECT_EQ(VD2[0].Value, getCalleeAddress(vtable3));
+    EXPECT_EQ(VD2[0].Count, 6U);
+    EXPECT_EQ(VD2[1].Value, getCalleeAddress(vtable2));
+    EXPECT_EQ(VD2[1].Count, 4U);
+    EXPECT_EQ(VD2[2].Value, getCalleeAddress(vtable1));
+    EXPECT_EQ(VD2[2].Count, 2U);
+  }
 }
 
 struct ValueProfileMergeEdgeCaseTest
@@ -1027,30 +1198,62 @@ INSTANTIATE_TEST_SUITE_P(
     EdgeCaseTest, ValueProfileMergeEdgeCaseTest,
     ::testing::Combine(::testing::Bool(), /* Sparse */
                        ::testing::Values(IPVK_IndirectCallTarget,
-                                         IPVK_MemOPSize) /* ValueKind */
+                                         IPVK_MemOPSize,
+                                         IPVK_VTableTarget) /* ValueKind */
                        ));
 
 static void addValueProfData(InstrProfRecord &Record) {
-  Record.reserveSites(IPVK_IndirectCallTarget, 5);
-  InstrProfValueData VD0[] = {{uint64_t(callee1), 400},
-                              {uint64_t(callee2), 1000},
-                              {uint64_t(callee3), 500},
-                              {uint64_t(callee4), 300},
-                              {uint64_t(callee5), 100}};
-  Record.addValueData(IPVK_IndirectCallTarget, 0, VD0, 5, nullptr);
-  InstrProfValueData VD1[] = {{uint64_t(callee5), 800},
-                              {uint64_t(callee3), 1000},
-                              {uint64_t(callee2), 2500},
-                              {uint64_t(callee1), 1300}};
-  Record.addValueData(IPVK_IndirectCallTarget, 1, VD1, 4, nullptr);
-  InstrProfValueData VD2[] = {{uint64_t(callee6), 800},
-                              {uint64_t(callee3), 1000},
-                              {uint64_t(callee4), 5500}};
-  Record.addValueData(IPVK_IndirectCallTarget, 2, VD2, 3, nullptr);
-  InstrProfValueData VD3[] = {{uint64_t(callee2), 1800},
-                              {uint64_t(callee3), 2000}};
-  Record.addValueData(IPVK_IndirectCallTarget, 3, VD3, 2, nullptr);
-  Record.addValueData(IPVK_IndirectCallTarget, 4, nullptr, 0, nullptr);
+  // Add test data for indirect calls.
+  {
+    Record.reserveSites(IPVK_IndirectCallTarget, 6);
+    InstrProfValueData VD0[] = {{uint64_t(callee1), 400},
+                                {uint64_t(callee2), 1000},
+                                {uint64_t(callee3), 500},
+                                {uint64_t(callee4), 300},
+                                {uint64_t(callee5), 100}};
+    Record.addValueData(IPVK_IndirectCallTarget, 0, VD0, 5, nullptr);
+    InstrProfValueData VD1[] = {{uint64_t(callee5), 800},
+                                {uint64_t(callee3), 1000},
+                                {uint64_t(callee2), 2500},
+                                {uint64_t(callee1), 1300}};
+    Record.addValueData(IPVK_IndirectCallTarget, 1, VD1, 4, nullptr);
+    InstrProfValueData VD2[] = {{uint64_t(callee6), 800},
+                                {uint64_t(callee3), 1000},
+                                {uint64_t(callee4), 5500}};
+    Record.addValueData(IPVK_IndirectCallTarget, 2, VD2, 3, nullptr);
+    InstrProfValueData VD3[] = {{uint64_t(callee2), 1800},
+                                {uint64_t(callee3), 2000}};
+    Record.addValueData(IPVK_IndirectCallTarget, 3, VD3, 2, nullptr);
+    Record.addValueData(IPVK_IndirectCallTarget, 4, nullptr, 0, nullptr);
+    InstrProfValueData VD5[] = {{uint64_t(callee7), 1234},
+                                {uint64_t(callee8), 5678}};
+    Record.addValueData(IPVK_IndirectCallTarget, 5, VD5, 2, nullptr);
+  }
+
+  // Add test data for vtables
+  {
+    Record.reserveSites(IPVK_VTableTarget, 4);
+    InstrProfValueData VD0[] = {
+        {getCalleeAddress(vtable1), 400}, {getCalleeAddress(vtable2), 1000},
+        {getCalleeAddress(vtable3), 500}, {getCalleeAddress(vtable4), 300},
+        {getCalleeAddress(vtable5), 100},
+    };
+    InstrProfValueData VD1[] = {{getCalleeAddress(vtable5), 800},
+                                {getCalleeAddress(vtable3), 1000},
+                                {getCalleeAddress(vtable2), 2500},
+                                {getCalleeAddress(vtable1), 1300}};
+    InstrProfValueData VD2[] = {
+        {getCalleeAddress(vtable6), 800},
+        {getCalleeAddress(vtable3), 1000},
+        {getCalleeAddress(vtable4), 5500},
+    };
+    InstrProfValueData VD3[] = {{getCalleeAddress(vtable2), 1800},
+                                {getCalleeAddress(vtable3), 2000}};
+    Record.addValueData(IPVK_VTableTarget, 0, VD0, 5, nullptr);
+    Record.addValueData(IPVK_VTableTarget, 1, VD1, 4, nullptr);
+    Record.addValueData(IPVK_VTableTarget, 2, VD2, 3, nullptr);
+    Record.addValueData(IPVK_VTableTarget, 3, VD3, 2, nullptr);
+  }
 }
 
 TEST(ValueProfileReadWriteTest, value_prof_data_read_write) {
@@ -1063,59 +1266,111 @@ TEST(ValueProfileReadWriteTest, value_prof_data_read_write) {
   VPData->deserializeTo(Record, nullptr);
 
   // Now read data from Record and sanity check the data
-  ASSERT_EQ(5U, Record.getNumValueSites(IPVK_IndirectCallTarget));
+  ASSERT_EQ(6U, Record.getNumValueSites(IPVK_IndirectCallTarget));
   ASSERT_EQ(5U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 0));
   ASSERT_EQ(4U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 1));
   ASSERT_EQ(3U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 2));
   ASSERT_EQ(2U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 3));
   ASSERT_EQ(0U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 4));
+  ASSERT_EQ(2U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 5));
 
   auto Cmp = [](const InstrProfValueData &VD1, const InstrProfValueData &VD2) {
     return VD1.Count > VD2.Count;
   };
+
   std::unique_ptr<InstrProfValueData[]> VD_0(
       Record.getValueForSite(IPVK_IndirectCallTarget, 0));
   llvm::sort(&VD_0[0], &VD_0[5], Cmp);
-  ASSERT_EQ(StringRef((const char *)VD_0[0].Value, 7), StringRef("callee2"));
-  ASSERT_EQ(1000U, VD_0[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD_0[1].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(500U, VD_0[1].Count);
-  ASSERT_EQ(StringRef((const char *)VD_0[2].Value, 7), StringRef("callee1"));
-  ASSERT_EQ(400U, VD_0[2].Count);
-  ASSERT_EQ(StringRef((const char *)VD_0[3].Value, 7), StringRef("callee4"));
-  ASSERT_EQ(300U, VD_0[3].Count);
-  ASSERT_EQ(StringRef((const char *)VD_0[4].Value, 7), StringRef("callee5"));
-  ASSERT_EQ(100U, VD_0[4].Count);
+  EXPECT_EQ(StringRef((const char *)VD_0[0].Value, 7), StringRef("callee2"));
+  EXPECT_EQ(1000U, VD_0[0].Count);
+  EXPECT_EQ(StringRef((const char *)VD_0[1].Value, 7), StringRef("callee3"));
+  EXPECT_EQ(500U, VD_0[1].Count);
+  EXPECT_EQ(StringRef((const char *)VD_0[2].Value, 7), StringRef("callee1"));
+  EXPECT_EQ(400U, VD_0[2].Count);
+  EXPECT_EQ(StringRef((const char *)VD_0[3].Value, 7), StringRef("callee4"));
+  EXPECT_EQ(300U, VD_0[3].Count);
+  EXPECT_EQ(StringRef((const char *)VD_0[4].Value, 7), StringRef("callee5"));
+  EXPECT_EQ(100U, VD_0[4].Count);
 
   std::unique_ptr<InstrProfValueData[]> VD_1(
       Record.getValueForSite(IPVK_IndirectCallTarget, 1));
   llvm::sort(&VD_1[0], &VD_1[4], Cmp);
-  ASSERT_EQ(StringRef((const char *)VD_1[0].Value, 7), StringRef("callee2"));
-  ASSERT_EQ(2500U, VD_1[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD_1[1].Value, 7), StringRef("callee1"));
-  ASSERT_EQ(1300U, VD_1[1].Count);
-  ASSERT_EQ(StringRef((const char *)VD_1[2].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(1000U, VD_1[2].Count);
-  ASSERT_EQ(StringRef((const char *)VD_1[3].Value, 7), StringRef("callee5"));
-  ASSERT_EQ(800U, VD_1[3].Count);
+  EXPECT_EQ(StringRef((const char *)VD_1[0].Value, 7), StringRef("callee2"));
+  EXPECT_EQ(2500U, VD_1[0].Count);
+  EXPECT_EQ(StringRef((const char *)VD_1[1].Value, 7), StringRef("callee1"));
+  EXPECT_EQ(1300U, VD_1[1].Count);
+  EXPECT_EQ(StringRef((const char *)VD_1[2].Value, 7), StringRef("callee3"));
+  EXPECT_EQ(1000U, VD_1[2].Count);
+  EXPECT_EQ(StringRef((const char *)VD_1[3].Value, 7), StringRef("callee5"));
+  EXPECT_EQ(800U, VD_1[3].Count);
 
   std::unique_ptr<InstrProfValueData[]> VD_2(
       Record.getValueForSite(IPVK_IndirectCallTarget, 2));
   llvm::sort(&VD_2[0], &VD_2[3], Cmp);
-  ASSERT_EQ(StringRef((const char *)VD_2[0].Value, 7), StringRef("callee4"));
-  ASSERT_EQ(5500U, VD_2[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD_2[1].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(1000U, VD_2[1].Count);
-  ASSERT_EQ(StringRef((const char *)VD_2[2].Value, 7), StringRef("callee6"));
-  ASSERT_EQ(800U, VD_2[2].Count);
+  EXPECT_EQ(StringRef((const char *)VD_2[0].Value, 7), StringRef("callee4"));
+  EXPECT_EQ(5500U, VD_2[0].Count);
+  EXPECT_EQ(StringRef((const char *)VD_2[1].Value, 7), StringRef("callee3"));
+  EXPECT_EQ(1000U, VD_2[1].Count);
+  EXPECT_EQ(StringRef((const char *)VD_2[2].Value, 7), StringRef("callee6"));
+  EXPECT_EQ(800U, VD_2[2].Count);
 
   std::unique_ptr<InstrProfValueData[]> VD_3(
       Record.getValueForSite(IPVK_IndirectCallTarget, 3));
   llvm::sort(&VD_3[0], &VD_3[2], Cmp);
-  ASSERT_EQ(StringRef((const char *)VD_3[0].Value, 7), StringRef("callee3"));
-  ASSERT_EQ(2000U, VD_3[0].Count);
-  ASSERT_EQ(StringRef((const char *)VD_3[1].Value, 7), StringRef("callee2"));
-  ASSERT_EQ(1800U, VD_3[1].Count);
+  EXPECT_EQ(StringRef((const char *)VD_3[0].Value, 7), StringRef("callee3"));
+  EXPECT_EQ(2000U, VD_3[0].Count);
+  EXPECT_EQ(StringRef((const char *)VD_3[1].Value, 7), StringRef("callee2"));
+  EXPECT_EQ(1800U, VD_3[1].Count);
+
+  ASSERT_EQ(4U, Record.getNumValueSites(IPVK_VTableTarget));
+  ASSERT_EQ(5U, Record.getNumValueDataForSite(IPVK_VTableTarget, 0));
+  ASSERT_EQ(4U, Record.getNumValueDataForSite(IPVK_VTableTarget, 1));
+  ASSERT_EQ(3U, Record.getNumValueDataForSite(IPVK_VTableTarget, 2));
+  ASSERT_EQ(2U, Record.getNumValueDataForSite(IPVK_VTableTarget, 3));
+
+  std::unique_ptr<InstrProfValueData[]> VD0(
+      Record.getValueForSite(IPVK_VTableTarget, 0));
+  llvm::sort(&VD0[0], &VD0[5], Cmp);
+  EXPECT_EQ(VD0[0].Value, getCalleeAddress(vtable2));
+  EXPECT_EQ(VD0[0].Count, 1000U);
+  EXPECT_EQ(VD0[1].Value, getCalleeAddress(vtable3));
+  EXPECT_EQ(VD0[1].Count, 500U);
+  EXPECT_EQ(VD0[2].Value, getCalleeAddress(vtable1));
+  EXPECT_EQ(VD0[2].Count, 400U);
+  EXPECT_EQ(VD0[3].Value, getCalleeAddress(vtable4));
+  EXPECT_EQ(VD0[3].Count, 300U);
+  EXPECT_EQ(VD0[4].Value, getCalleeAddress(vtable5));
+  EXPECT_EQ(VD0[4].Count, 100U);
+
+  std::unique_ptr<InstrProfValueData[]> VD1(
+      Record.getValueForSite(IPVK_VTableTarget, 1));
+  llvm::sort(&VD1[0], &VD1[4], Cmp);
+  EXPECT_EQ(VD1[0].Value, getCalleeAddress(vtable2));
+  EXPECT_EQ(VD1[0].Count, 2500U);
+  EXPECT_EQ(VD1[1].Value, getCalleeAddress(vtable1));
+  EXPECT_EQ(VD1[1].Count, 1300U);
+  EXPECT_EQ(VD1[2].Value, getCalleeAddress(vtable3));
+  EXPECT_EQ(VD1[2].Count, 1000U);
+  EXPECT_EQ(VD1[3].Value, getCalleeAddress(vtable5));
+  EXPECT_EQ(VD1[3].Count, 800U);
+
+  std::unique_ptr<InstrProfValueData[]> VD2(
+      Record.getValueForSite(IPVK_VTableTarget, 2));
+  llvm::sort(&VD2[0], &VD2[3], Cmp);
+  EXPECT_EQ(VD2[0].Value, getCalleeAddress(vtable4));
+  EXPECT_EQ(VD2[0].Count, 5500U);
+  EXPECT_EQ(VD2[1].Value, getCalleeAddress(vtable3));
+  EXPECT_EQ(VD2[1].Count, 1000U);
+  EXPECT_EQ(VD2[2].Value, getCalleeAddress(vtable6));
+  EXPECT_EQ(VD2[2].Count, 800U);
+
+  std::unique_ptr<InstrProfValueData[]> VD3(
+      Record.getValueForSite(IPVK_VTableTarget, 3));
+  llvm::sort(&VD3[0], &VD3[2], Cmp);
+  EXPECT_EQ(VD3[0].Value, getCalleeAddress(vtable3));
+  EXPECT_EQ(VD3[0].Count, 2000U);
+  EXPECT_EQ(VD3[1].Value, getCalleeAddress(vtable2));
+  EXPECT_EQ(VD3[1].Count, 1800U);
 }
 
 TEST(ValueProfileReadWriteTest, symtab_mapping) {
@@ -1132,10 +1387,27 @@ TEST(ValueProfileReadWriteTest, symtab_mapping) {
   Symtab.mapAddress(uint64_t(callee4), 0x4000ULL);
   // Missing mapping for callee5
 
+  auto getVTableStartAddr = [](const uint64_t *vtable) -> uint64_t {
+    return uint64_t(vtable);
+  };
+  auto getVTableEndAddr = [](const uint64_t *vtable) -> uint64_t {
+    return uint64_t(vtable) + 16;
+  };
+  // vtable1, vtable2, vtable3, vtable4 get mapped; vtable5, vtable6 are not
+  // mapped.
+  Symtab.mapVTableAddress(getVTableStartAddr(vtable1),
+                          getVTableEndAddr(vtable1), MD5Hash("vtable1"));
+  Symtab.mapVTableAddress(getVTableStartAddr(vtable2),
+                          getVTableEndAddr(vtable2), MD5Hash("vtable2"));
+  Symtab.mapVTableAddress(getVTableStartAddr(vtable3),
+                          getVTableEndAddr(vtable3), MD5Hash("vtable3"));
+  Symtab.mapVTableAddress(getVTableStartAddr(vtable4),
+                          getVTableEndAddr(vtable4), MD5Hash("vtable4"));
+
   VPData->deserializeTo(Record, &Symtab);
 
   // Now read data from Record and sanity check the data
-  ASSERT_EQ(5U, Record.getNumValueSites(IPVK_IndirectCallTarget));
+  ASSERT_EQ(6U, Record.getNumValueSites(IPVK_IndirectCallTarget));
   ASSERT_EQ(5U, Record.getNumValueDataForSite(IPVK_IndirectCallTarget, 0));
 
   auto Cmp = [](const InstrProfValueData &VD1, const InstrProfValueData &VD2) {
@@ -1153,6 +1425,74 @@ TEST(ValueProfileReadWriteTest, symtab_mapping) {
 
   // callee5 does not have a mapped value -- default to 0.
   ASSERT_EQ(VD_0[4].Value, 0ULL);
+
+  // Sanity check the vtable value data
+  ASSERT_EQ(4U, Record.getNumValueSites(IPVK_VTableTarget));
+
+  {
+    // The first vtable site.
+    std::unique_ptr<InstrProfValueData[]> VD(
+        Record.getValueForSite(IPVK_VTableTarget, 0));
+    ASSERT_EQ(5U, Record.getNumValueDataForSite(IPVK_VTableTarget, 0));
+    llvm::sort(&VD[0], &VD[5], Cmp);
+    EXPECT_EQ(1000U, VD[0].Count);
+    EXPECT_EQ(VD[0].Value, MD5Hash("vtable2"));
+    EXPECT_EQ(500U, VD[1].Count);
+    EXPECT_EQ(VD[1].Value, MD5Hash("vtable3"));
+    EXPECT_EQ(VD[2].Value, MD5Hash("vtable1"));
+    EXPECT_EQ(400U, VD[2].Count);
+    EXPECT_EQ(VD[3].Value, MD5Hash("vtable4"));
+    EXPECT_EQ(300U, VD[3].Count);
+
+    // vtable5 isn't mapped -- default to 0.
+    EXPECT_EQ(VD[4].Value, 0U);
+    EXPECT_EQ(VD[4].Count, 100U);
+  }
+
+  {
+    // The second vtable site.
+    std::unique_ptr<InstrProfValueData[]> VD(
+        Record.getValueForSite(IPVK_VTableTarget, 1));
+    ASSERT_EQ(4, Record.getNumValueDataForSite(IPVK_VTableTarget, 1));
+    llvm::sort(&VD[0], &VD[4], Cmp);
+    EXPECT_EQ(VD[0].Value, MD5Hash("vtable2"));
+    EXPECT_EQ(2500U, VD[0].Count);
+    EXPECT_EQ(VD[1].Value, MD5Hash("vtable1"));
+    EXPECT_EQ(1300U, VD[1].Count);
+
+    EXPECT_EQ(VD[2].Value, MD5Hash("vtable3"));
+    EXPECT_EQ(1000U, VD[2].Count);
+    // vtable5 isn't mapped -- default to 0.
+    EXPECT_EQ(VD[3].Value, 0U);
+    EXPECT_EQ(800U, VD[3].Count);
+  }
+
+  {
+    // The third vtable site.
+    std::unique_ptr<InstrProfValueData[]> VD(
+        Record.getValueForSite(IPVK_VTableTarget, 2));
+    ASSERT_EQ(3, Record.getNumValueDataForSite(IPVK_VTableTarget, 2));
+    llvm::sort(&VD[0], &VD[3], Cmp);
+    EXPECT_EQ(5500U, VD[0].Count);
+    EXPECT_EQ(VD[0].Value, MD5Hash("vtable4"));
+    EXPECT_EQ(1000U, VD[1].Count);
+    EXPECT_EQ(VD[1].Value, MD5Hash("vtable3"));
+    // vtable6 isn't mapped -- default to 0.
+    EXPECT_EQ(VD[2].Value, 0U);
+    EXPECT_EQ(800U, VD[2].Count);
+  }
+
+  {
+    // The fourth vtable site.
+    std::unique_ptr<InstrProfValueData[]> VD(
+        Record.getValueForSite(IPVK_VTableTarget, 3));
+    ASSERT_EQ(2, Record.getNumValueDataForSite(IPVK_VTableTarget, 3));
+    llvm::sort(&VD[0], &VD[2], Cmp);
+    EXPECT_EQ(2000U, VD[0].Count);
+    EXPECT_EQ(VD[0].Value, MD5Hash("vtable3"));
+    EXPECT_EQ(1800U, VD[1].Count);
+    EXPECT_EQ(VD[1].Value, MD5Hash("vtable2"));
+  }
 }
 
 TEST_P(MaybeSparseInstrProfTest, get_max_function_count) {

>From 66dbbfef52bdc092cbd4ed619bba38c003f6063d Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Thu, 8 Feb 2024 09:07:27 -0800
Subject: [PATCH 02/16] [InstrProf] Add vtables with type metadata into symtab
 to look it up with GUID

---
 llvm/include/llvm/ProfileData/InstrProf.h    | 19 +++++
 llvm/lib/ProfileData/InstrProf.cpp           | 87 ++++++++++++++------
 llvm/unittests/ProfileData/InstrProfTest.cpp | 55 +++++++++++++
 3 files changed, 138 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 53108a093bf4d..6e799cf8aa273 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -487,8 +487,25 @@ class InstrProfSymtab {
     return "** External Symbol **";
   }
 
+  // Returns the canonical name of the given PGOName by stripping the name
+  // suffixes that begin with ".". If MayHaveUniqueSuffix is true, the
+  // ".__uniq." suffix is kept in the canonical name.
+  StringRef getCanonicalName(StringRef PGOName, bool MayHaveUniqueSuffix);
+
+  // Add the function into the symbol table, by creating the following
+  // map entries:
+  // - <MD5Hash(PGOFuncName), PGOFuncName>
+  // - <MD5Hash(PGOFuncName), F>
+  // - <MD5Hash(getCanonicalName(PGOFuncName)), F>
   Error addFuncWithName(Function &F, StringRef PGOFuncName);
 
+  // Add the vtable into the symbol table, by creating the following
+  // map entries:
+  // - <MD5Hash(PGOVTableName), PGOVTableName>
+  // - <MD5Hash(PGOVTableName), V>
+  // - <MD5Hash(getCanonicalName(PGOVTableName)), V>
+  Error addVTableWithName(GlobalVariable &V, StringRef PGOVTableName);
+
   // If the symtab is created by a series of calls to \c addFuncName, \c
   // finalizeSymtab needs to be called before looking up function names.
   // This is required because the underlying map is a vector (for space
@@ -543,6 +560,7 @@ class InstrProfSymtab {
   Error create(const FuncNameIterRange &FuncIterRange,
                const VTableNameIterRange &VTableIterRange);
 
+  // Map the MD5 of the symbol name to the name.
   Error addSymbolName(StringRef SymbolName) {
     if (SymbolName.empty())
       return make_error<InstrProfError>(instrprof_error::malformed,
@@ -665,6 +683,7 @@ void InstrProfSymtab::finalizeSymtab() {
   if (Sorted)
     return;
   llvm::sort(MD5NameMap, less_first());
+  llvm::sort(MD5VTableMap, less_first());
   llvm::sort(MD5FuncMap, less_first());
   llvm::sort(AddrToMD5Map, less_first());
   AddrToMD5Map.erase(std::unique(AddrToMD5Map.begin(), AddrToMD5Map.end()),
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 9ebcba10c860f..a09a2bb0ba77c 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -480,7 +480,9 @@ Error InstrProfSymtab::create(Module &M, bool InLTO) {
     Types.clear();
     G.getMetadata(LLVMContext::MD_type, Types);
     if (!Types.empty()) {
-      MD5VTableMap.emplace_back(G.getGUID(), &G);
+      if (Error E = addVTableWithName(
+              G, getIRPGOObjectName(G, InLTO, /* PGONameMetadata */ nullptr)))
+        return E;
     }
   }
   Sorted = false;
@@ -488,6 +490,30 @@ Error InstrProfSymtab::create(Module &M, bool InLTO) {
   return Error::success();
 }
 
+Error InstrProfSymtab::addVTableWithName(GlobalVariable &VTable,
+                                         StringRef VTablePGOName) {
+  if (Error E = addVTableName(VTablePGOName))
+    return E;
+
+  MD5VTableMap.emplace_back(GlobalValue::getGUID(VTablePGOName), &VTable);
+
+  // NOTE: `-funique-internal-linkage-names` doesn't uniquify vtables, so no
+  // need to check ".__uniq."
+
+  // If a local-linkage vtable is promoted to have external linkage in ThinLTO,
+  // it will have `.llvm.` in its name. Use the name before externalization.
+  StringRef CanonicalName =
+      getCanonicalName(VTablePGOName, /* MayHaveUniqueSuffix= */ false);
+  if (CanonicalName != VTablePGOName) {
+    if (Error E = addVTableName(CanonicalName))
+      return E;
+
+    MD5VTableMap.emplace_back(GlobalValue::getGUID(CanonicalName), &VTable);
+  }
+
+  return Error::success();
+}
+
 /// \c NameStrings is a string composed of one of more possibly encoded
 /// sub-strings. The substrings are separated by 0 or more zero bytes. This
 /// method decodes the string and calls `NameCallback` for each substring.
@@ -560,35 +586,50 @@ Error InstrProfSymtab::initVTableNamesFromCompressedStrings(
       std::bind(&InstrProfSymtab::addVTableName, this, std::placeholders::_1));
 }
 
-Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName) {
-  if (Error E = addFuncName(PGOFuncName))
-    return E;
-  MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+StringRef InstrProfSymtab::getCanonicalName(StringRef PGOName,
+                                            bool MayHaveUniqueSuffix) {
+  size_t pos = 0;
   // In ThinLTO, local function may have been promoted to global and have
   // suffix ".llvm." added to the function name. We need to add the
   // stripped function name to the symbol table so that we can find a match
   // from profile.
   //
-  // We may have other suffixes similar as ".llvm." which are needed to
-  // be stripped before the matching, but ".__uniq." suffix which is used
-  // to differentiate internal linkage functions in different modules
-  // should be kept. Now this is the only suffix with the pattern ".xxx"
-  // which is kept before matching.
-  const std::string UniqSuffix = ".__uniq.";
-  auto pos = PGOFuncName.find(UniqSuffix);
-  // Search '.' after ".__uniq." if ".__uniq." exists, otherwise
-  // search '.' from the beginning.
-  if (pos != std::string::npos)
-    pos += UniqSuffix.length();
-  else
-    pos = 0;
-  pos = PGOFuncName.find('.', pos);
-  if (pos != std::string::npos && pos != 0) {
-    StringRef OtherFuncName = PGOFuncName.substr(0, pos);
-    if (Error E = addFuncName(OtherFuncName))
+  // ".__uniq." suffix is used to differentiate internal linkage functions in
+  // different modules and should be kept. Now this is the only suffix with the
+  // pattern ".xxx" which is kept before matching; other suffixes similar to
+  // ".llvm." will be stripped.
+  if (MayHaveUniqueSuffix) {
+    const std::string UniqSuffix = ".__uniq.";
+    pos = PGOName.find(UniqSuffix);
+    if (pos != StringRef::npos)
+      pos += UniqSuffix.length();
+    else
+      pos = 0;
+  }
+
+  // Search '.' after ".__uniq." if ".__uniq." exists, otherwise search '.' from
+  // the beginning.
+  pos = PGOName.find('.', pos);
+  if (pos != StringRef::npos && pos != 0)
+    return PGOName.substr(0, pos);
+
+  return PGOName;
+}
+
+Error InstrProfSymtab::addFuncWithName(Function &F, StringRef PGOFuncName) {
+  if (Error E = addFuncName(PGOFuncName))
+    return E;
+  MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+
+  StringRef CanonicalName =
+      getCanonicalName(PGOFuncName, /* MayHaveUniqueSuffix= */ true);
+
+  if (CanonicalName != PGOFuncName) {
+    if (Error E = addFuncName(CanonicalName))
       return E;
-    MD5FuncMap.emplace_back(Function::getGUID(OtherFuncName), &F);
+    MD5FuncMap.emplace_back(Function::getGUID(CanonicalName), &F);
   }
+
   return Error::success();
 }
 
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index 4b99195c1b859..edde544660e45 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
@@ -1605,6 +1607,44 @@ TEST(SymtabTest, instr_prof_symtab_module_test) {
   Function::Create(FTy, Function::WeakODRLinkage, "Wblah", M.get());
   Function::Create(FTy, Function::WeakODRLinkage, "Wbar", M.get());
 
+  // [ptr, ptr, ptr]
+  ArrayType *VTableArrayType = ArrayType::get(
+      PointerType::get(Ctx, M->getDataLayout().getDefaultGlobalsAddressSpace()),
+      3);
+  Constant *Int32TyNull =
+      llvm::ConstantExpr::getNullValue(PointerType::getUnqual(Ctx));
+  SmallVector<llvm::Type *, 1> tys = {VTableArrayType};
+  StructType *VTableType = llvm::StructType::get(Ctx, tys);
+
+  // Create a vtable definition with external linkage.
+  GlobalVariable *ExternalGV = new llvm::GlobalVariable(
+      *M, VTableType, /* isConstant= */ true,
+      llvm::GlobalValue::ExternalLinkage,
+      llvm::ConstantStruct::get(
+          VTableType, {llvm::ConstantArray::get(
+                          VTableArrayType,
+                          {Int32TyNull, Int32TyNull,
+                           Function::Create(FTy, Function::ExternalLinkage,
+                                            "VFuncInExternalGV", M.get())})}),
+      "ExternalGV");
+
+  // Create a vtable definition with local (internal) linkage.
+  GlobalVariable *LocalGV = new llvm::GlobalVariable(
+      *M, VTableType, /* isConstant= */ true,
+      llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantStruct::get(
+          VTableType,
+          {llvm::ConstantArray::get(
+              VTableArrayType, {Int32TyNull, Int32TyNull,
+                                Function::Create(FTy, Function::ExternalLinkage,
+                                                 "VFuncInLocalGV", M.get())})}),
+      "LocalGV");
+
+  // Add type metadata for the test data, since vtables with type metadata are
+  // added to symtab.
+  ExternalGV->addTypeMetadata(16, MDString::get(Ctx, "ExternalGV"));
+  LocalGV->addTypeMetadata(16, MDString::get(Ctx, "LocalGV"));
+
   InstrProfSymtab ProfSymtab;
   EXPECT_THAT_ERROR(ProfSymtab.create(*M), Succeeded());
 
@@ -1626,6 +1666,21 @@ TEST(SymtabTest, instr_prof_symtab_module_test) {
     EXPECT_EQ(StringRef(PGOName), PGOFuncName);
     EXPECT_THAT(PGOFuncName.str(), EndsWith(Funcs[I].str()));
   }
+
+  StringRef VTables[] = {"ExternalGV", "LocalGV"};
+  for (StringRef VTableName : VTables) {
+    GlobalVariable *GV =
+        M->getGlobalVariable(VTableName, /* AllowInternal=*/true);
+
+    // Test that ProfSymtab returns the expected name given a hash.
+    std::string IRPGOName = getPGOName(*GV);
+    uint64_t GUID = IndexedInstrProf::ComputeHash(IRPGOName);
+    EXPECT_EQ(IRPGOName, ProfSymtab.getFuncOrVarName(GUID));
+    EXPECT_EQ(VTableName, getParsedIRPGOName(IRPGOName).second);
+
+    // Test that ProfSymtab returns the expected global variable
+    EXPECT_EQ(GV, ProfSymtab.getGlobalVariable(GUID));
+  }
 }
 
 // Testing symtab serialization and creator/deserialization interface

>From 7ebae253ab1808bca328453f68af2b595d07176e Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Thu, 8 Feb 2024 11:32:50 -0800
Subject: [PATCH 03/16] [NFC][CallPromotionUtils]Extract a helper function
 versionCallSiteWithCond from versionCallSite

---
 .../Transforms/Utils/CallPromotionUtils.cpp   | 36 +++++++++++--------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 4e84927f1cfc9..d0cf0792eface 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -188,10 +188,9 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 /// Predicate and clone the given call site.
 ///
 /// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition compares the call site's called value to the given
-/// callee. The original call site is moved into the "else" block, and a clone
-/// of the call site is placed in the "then" block. The cloned instruction is
-/// returned.
+/// site. The "if" condition is specified by `Cond`.
+/// The original call site is moved into the "else" block, and a clone of the
+/// call site is placed in the "then" block. The cloned instruction is returned.
 ///
 /// For example, the call instruction below:
 ///
@@ -202,7 +201,6 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 /// Is replace by the following:
 ///
 ///   orig_bb:
-///     %cond = icmp eq i32 ()* %ptr, @func
 ///     br i1 %cond, %then_bb, %else_bb
 ///
 ///   then_bb:
@@ -232,7 +230,6 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 /// Is replace by the following:
 ///
 ///   orig_bb:
-///     %cond = icmp eq i32 ()* %ptr, @func
 ///     br i1 %cond, %then_bb, %else_bb
 ///
 ///   then_bb:
@@ -267,7 +264,6 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 /// Is replaced by the following:
 ///
 ///   cond_bb:
-///     %cond = icmp eq i32 ()* %ptr, @func
 ///     br i1 %cond, %then_bb, %orig_bb
 ///
 ///   then_bb:
@@ -280,19 +276,13 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 ///     ; The original call instruction stays in its original block.
 ///     %t0 = musttail call i32 %ptr()
 ///     ret %t0
-CallBase &llvm::versionCallSite(CallBase &CB, Value *Callee,
-                                MDNode *BranchWeights) {
+static CallBase &versionCallSiteWithCond(CallBase &CB, Value *Cond,
+                                         MDNode *BranchWeights) {
 
   IRBuilder<> Builder(&CB);
   CallBase *OrigInst = &CB;
   BasicBlock *OrigBlock = OrigInst->getParent();
 
-  // Create the compare. The called value and callee must have the same type to
-  // be compared.
-  if (CB.getCalledOperand()->getType() != Callee->getType())
-    Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
-  auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
-
   if (OrigInst->isMustTailCall()) {
     // Create an if-then structure. The original instruction stays in its block,
     // and a clone of the original instruction is placed in the "then" block.
@@ -380,6 +370,22 @@ CallBase &llvm::versionCallSite(CallBase &CB, Value *Callee,
   return *NewInst;
 }
 
+// Predicate and clone the given call site using condition `CB.callee ==
+// Callee`. See the comment on `versionCallSiteWithCond` for the transformation.
+CallBase &llvm::versionCallSite(CallBase &CB, Value *Callee,
+                                MDNode *BranchWeights) {
+
+  IRBuilder<> Builder(&CB);
+
+  // Create the compare. The called value and callee must have the same type to
+  // be compared.
+  if (CB.getCalledOperand()->getType() != Callee->getType())
+    Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
+  auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
+
+  return versionCallSiteWithCond(CB, Cond, BranchWeights);
+}
+
 bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
                             const char **FailureReason) {
   assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");

>From ac5dc1bf77b67cbf0aa5e2c8fb6a7ce0080fb501 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Sat, 10 Feb 2024 12:03:25 -0800
Subject: [PATCH 04/16] [CallPromotionUtils]Implement conditional indirect call
 promotion with vtable-based comparison

---
 .../Transforms/Utils/CallPromotionUtils.h     |  50 ++++++-
 .../Transforms/Utils/CallPromotionUtils.cpp   |  64 ++++++++-
 .../Utils/CallPromotionUtilsTest.cpp          | 127 ++++++++++++++++++
 3 files changed, 233 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
index fcb384ec36133..5f3a71206876c 100644
--- a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
@@ -14,10 +14,17 @@
 #ifndef LLVM_TRANSFORMS_UTILS_CALLPROMOTIONUTILS_H
 #define LLVM_TRANSFORMS_UTILS_CALLPROMOTIONUTILS_H
 
+#include <cstdint>
+
+#include "llvm/ADT/ArrayRef.h"
+
 namespace llvm {
+class Constant;
 class CallBase;
 class CastInst;
 class Function;
+class GlobalVariable;
+class Instruction;
 class MDNode;
 class Value;
 
@@ -41,7 +48,9 @@ bool isLegalToPromote(const CallBase &CB, Function *Callee,
 CallBase &promoteCall(CallBase &CB, Function *Callee,
                       CastInst **RetBitCast = nullptr);
 
-/// Promote the given indirect call site to conditionally call \p Callee.
+/// Promote the given indirect call site to conditionally call \p Callee. The
+/// promoted direct call instruction is predicated on `CB.getCalledOperand() ==
+/// Callee`.
 ///
 /// This function creates an if-then-else structure at the location of the call
 /// site. The original call site is moved into the "else" block. A clone of the
@@ -51,6 +60,31 @@ CallBase &promoteCall(CallBase &CB, Function *Callee,
 CallBase &promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
                                     MDNode *BranchWeights = nullptr);
 
+/// This is similar to `promoteCallWithIfThenElse` except that the condition to
+/// promote a virtual call is that \p VPtr is the same as any of \p
+/// AddressPoints.
+///
+/// This function is expected to be used on virtual calls (a subset of indirect
+/// calls). \p VPtr is the virtual table address stored in the objects, and
+/// \p AddressPoints contains address points of vtables to be compared with.
+///
+/// It's the responsibility of caller to guarantee the transformation
+/// correctness (by specifying \p VPtr and \p AddressPoints properly).
+///
+/// This function doesn't sink the address-calculation instructions of indirect
+/// callee to the indirect call fallback. The subsequent passes (e.g.
+/// inst-combine) should sink them if possible and handle the sink of debug
+/// intrinsics together.
+CallBase &promoteCallWithVTableCmp(CallBase &CB, Instruction *VPtr,
+                                   Function *Callee,
+                                   ArrayRef<Constant *> AddressPoints,
+                                   MDNode *BranchWeights);
+
+/// Returns a constant representing the vtable's address point specified by the
+/// offset. Caller should ensure \p AddressPointOffset is valid.
+Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
+                                      uint32_t AddressPointOffset);
+
 /// Try to promote (devirtualize) a virtual call on an Alloca. Return true on
 /// success.
 ///
@@ -74,13 +108,17 @@ CallBase &promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
 ///
 bool tryPromoteCall(CallBase &CB);
 
+/// Predicate and clone the given call site using the given condition.
+CallBase &versionCallSiteWithCond(CallBase &CB, Value *Cond,
+                                  MDNode *BranchWeights);
+
 /// Predicate and clone the given call site.
 ///
-/// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition compares the call site's called value to the given
-/// callee. The original call site is moved into the "else" block, and a clone
-/// of the call site is placed in the "then" block. The cloned instruction is
-/// returned.
+/// This function creates an if-then-else structure at the location of the
+/// call site. The "if" condition compares the call site's called value to
+/// the given callee. The original call site is moved into the "else" block,
+/// and a clone of the call site is placed in the "then" block. The cloned
+/// instruction is returned.
 CallBase &versionCallSite(CallBase &CB, Value *Callee, MDNode *BranchWeights);
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index d0cf0792eface..ea855b9a4d841 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -12,9 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -185,6 +187,24 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
     U->replaceUsesOfWith(&CB, Cast);
 }
 
+// Returns the or result of all icmp instructions.
+static Value *getOrResult(const SmallVector<Value *, 2> &ICmps,
+                          IRBuilder<> &Builder) {
+  assert(!ICmps.empty() && "Must have at least one icmp instructions");
+  if (ICmps.size() == 1)
+    return ICmps[0];
+
+  SmallVector<Value *, 2> OrResults;
+  int i = 0, NumICmp = ICmps.size();
+  for (i = 0; i + 1 < NumICmp; i += 2)
+    OrResults.push_back(Builder.CreateOr(ICmps[i], ICmps[i + 1], "icmp-or"));
+
+  if (i < NumICmp)
+    OrResults.push_back(ICmps[i]);
+
+  return getOrResult(OrResults, Builder);
+}
+
 /// Predicate and clone the given call site.
 ///
 /// This function creates an if-then-else structure at the location of the call
@@ -276,8 +296,8 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 ///     ; The original call instruction stays in its original block.
 ///     %t0 = musttail call i32 %ptr()
 ///     ret %t0
-static CallBase &versionCallSiteWithCond(CallBase &CB, Value *Cond,
-                                         MDNode *BranchWeights) {
+CallBase &llvm::versionCallSiteWithCond(CallBase &CB, Value *Cond,
+                                        MDNode *BranchWeights) {
 
   IRBuilder<> Builder(&CB);
   CallBase *OrigInst = &CB;
@@ -565,6 +585,46 @@ CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
   return promoteCall(NewInst, Callee);
 }
 
+Constant *llvm::getVTableAddressPointOffset(GlobalVariable *VTable,
+                                            uint32_t AddressPointOffset) {
+  Module &M = *VTable->getParent();
+  const DataLayout &DL = M.getDataLayout();
+  LLVMContext &Context = M.getContext();
+  Type *VTableType = VTable->getValueType();
+  assert(AddressPointOffset < DL.getTypeAllocSize(VTableType) &&
+         "Out-of-bound access");
+  APInt AddressPointOffsetAPInt(32, AddressPointOffset, false);
+  SmallVector<APInt> Indices =
+      DL.getGEPIndicesForOffset(VTableType, AddressPointOffsetAPInt);
+  SmallVector<llvm::Constant *> GEPIndices;
+  for (const auto &Index : Indices)
+    GEPIndices.push_back(llvm::ConstantInt::get(Type::getInt32Ty(Context),
+                                                Index.getZExtValue()));
+
+  return ConstantExpr::getInBoundsGetElementPtr(VTable->getValueType(), VTable,
+                                                GEPIndices);
+}
+
+CallBase &llvm::promoteCallWithVTableCmp(CallBase &CB, Instruction *VPtr,
+                                         Function *Callee,
+                                         ArrayRef<Constant *> AddressPoints,
+                                         MDNode *BranchWeights) {
+  assert(!AddressPoints.empty() && "Caller should guarantee");
+  IRBuilder<> Builder(&CB);
+  SmallVector<Value *, 2> ICmps;
+  for (auto &AddressPoint : AddressPoints)
+    ICmps.push_back(Builder.CreateICmpEQ(VPtr, AddressPoint));
+
+  Value *Cond = getOrResult(ICmps, Builder);
+
+  // Version the indirect call site. If Cond is true, 'NewInst' will be
+  // executed, otherwise the original call site will be executed.
+  CallBase &NewInst = versionCallSiteWithCond(CB, Cond, BranchWeights);
+
+  // Promote 'NewInst' so that it directly calls the desired function.
+  return promoteCall(NewInst, Callee);
+}
+
 bool llvm::tryPromoteCall(CallBase &CB) {
   assert(!CB.getCalledFunction());
   Module *M = CB.getCaller()->getParent();
diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
index eff8e27d36d64..c57abb54e4684 100644
--- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
@@ -8,9 +8,12 @@
 
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
@@ -368,3 +371,127 @@ declare %struct2 @_ZN4Impl3RunEv(%class.Impl* %this)
   bool IsPromoted = tryPromoteCall(*CI);
   EXPECT_FALSE(IsPromoted);
 }
+
+TEST(CallPromotionUtilsTest, getVTableAddressPointOffset) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C,
+                                      R"IR(
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV8Derived2 = constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base35func3Ev], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base25func2Ev], [4 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }
+
+declare i32 @_ZN5Base15func1Ev(ptr)
+declare i32 @_ZN5Base25func2Ev(ptr)
+declare i32 @_ZN5Base15func0Ev(ptr)
+declare void @_ZN5Base35func3Ev(ptr)
+)IR");
+  GlobalVariable *GV = M->getGlobalVariable("_ZTV8Derived2");
+
+  for (auto [AddressPointOffset, Index] :
+       {std::pair{16, 0}, {40, 1}, {64, 2}}) {
+    Constant *AddressPoint =
+        getVTableAddressPointOffset(GV, AddressPointOffset);
+
+    ConstantExpr *GEP = dyn_cast<ConstantExpr>(AddressPoint);
+    ASSERT_TRUE(GEP);
+    SmallVector<Constant *> Indices = {
+        llvm::ConstantInt::get(Type::getInt32Ty(C), 0U),
+        llvm::ConstantInt::get(Type::getInt32Ty(C), Index),
+        llvm::ConstantInt::get(Type::getInt32Ty(C), 2U)};
+    EXPECT_EQ(GEP, ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(),
+                                                          GV, Indices));
+  }
+}
+
+TEST(CallPromotionUtilsTest, promoteCallWithVTableCmp) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C,
+                                      R"IR(
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV5Base1 = constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }, !type !0
+ at _ZTV8Derived1 = constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev], [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base25func2Ev] }, !type !1, !type !2, !type !3
+ at _ZTV5Base2 = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base25func2Ev] }, !type !2
+ at _ZTV8Derived2 = constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base35func3Ev], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base25func2Ev], [4 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }, !type !4, !type !5, !type !6, !type !7
+ at _ZTV5Base3 = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base35func3Ev] }, !type !6
+
+define i32 @testfunc(ptr %d) {
+entry:
+  %vtable = load ptr, ptr %d, !prof !8
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS5Base1")
+  tail call void @llvm.assume(i1 %0)
+  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  %1 = load ptr, ptr %vfn
+  %call = tail call i32 %1(ptr %d), !prof !9
+  ret i32 %call
+}
+
+define i32 @_ZN5Base15func1Ev(ptr %this) {
+entry:
+  ret i32 2
+}
+
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+declare i32 @_ZN5Base25func2Ev(ptr)
+declare i32 @_ZN5Base15func0Ev(ptr)
+declare void @_ZN5Base35func3Ev(ptr)
+
+!0 = !{i64 16, !"_ZTS5Base1"}
+!1 = !{i64 16, !"_ZTS5Base1"}
+!2 = !{i64 48, !"_ZTS5Base2"}
+!3 = !{i64 16, !"_ZTS8Derived1"}
+!4 = !{i64 64, !"_ZTS5Base1"}
+!5 = !{i64 40, !"_ZTS5Base2"}
+!6 = !{i64 16, !"_ZTS5Base3"}
+!7 = !{i64 16, !"_ZTS8Derived2"}
+!8 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 800, i64 5035968517245772950, i64 500, i64 3215870116411581797, i64 300}
+!9 = !{!"VP", i32 0, i64 1600, i64 6804820478065511155, i64 1600})IR");
+
+  Function *F = M->getFunction("testfunc");
+  ASSERT_TRUE(F);
+  CallInst *CI = dyn_cast<CallInst>(&*std::next(F->front().rbegin()));
+  ASSERT_TRUE(CI && CI->isIndirectCall());
+
+  LoadInst *FuncPtr = dyn_cast<LoadInst>(CI->getCalledOperand());
+  ASSERT_TRUE(FuncPtr);
+
+  GetElementPtrInst *GEP =
+      dyn_cast<GetElementPtrInst>(FuncPtr->getPointerOperand());
+  ASSERT_TRUE(GEP);
+
+  LoadInst *VPtr = dyn_cast<LoadInst>(&*F->front().begin());
+
+  Function *Callee = M->getFunction("_ZN5Base15func1Ev");
+
+  // Create the constant and the branch weights
+  SmallVector<Constant *, 3> VTableAddressPoints;
+
+  for (auto &[VTableName, AddressPointOffset] : {std::pair{"_ZTV5Base1", 16},
+                                                 {"_ZTV8Derived1", 16},
+                                                 {"_ZTV8Derived2", 64}})
+    VTableAddressPoints.push_back(getVTableAddressPointOffset(
+        M->getGlobalVariable(VTableName), AddressPointOffset));
+
+  MDBuilder MDB(C);
+  MDNode *BranchWeights = MDB.createBranchWeights(1600, 0);
+
+  size_t OrigEntryBBSize = F->front().size();
+
+  // Tests that promoted direct call is returned.
+  CallBase &DirectCB = promoteCallWithVTableCmp(
+      *CI, VPtr, Callee, VTableAddressPoints, BranchWeights);
+  EXPECT_EQ(DirectCB.getCalledOperand(), Callee);
+
+  // Tests that GEP and FuncPtr are not sunk; they stay in the entry block.
+  BasicBlock *EntryBB = &F->front();
+  EXPECT_EQ(EntryBB, GEP->getParent());
+  EXPECT_EQ(EntryBB, FuncPtr->getParent());
+
+  // Promotion inserts 3 icmp instructions and 2 or instructions, and removes
+  // 1 call instruction from the entry block.
+  EXPECT_EQ(F->front().size(), OrigEntryBBSize + 4);
+}

>From 29d9cd2f128da0adde011a0a8362ec252104c901 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Sat, 10 Feb 2024 15:21:49 -0800
Subject: [PATCH 05/16] [TypeProf][IndirectCallPromotion]Implement vtable-based
 transformation

---
 .../Analysis/IndirectCallPromotionAnalysis.h  |   2 +-
 .../IndirectCallPromotionAnalysis.cpp         |   6 +-
 .../Instrumentation/IndirectCallPromotion.cpp | 391 +++++++++++++++++-
 .../Transforms/PGOProfile/icp_vtable_cmp.ll   | 206 +++++++++
 .../PGOProfile/icp_vtable_invoke.ll           | 201 +++++++++
 .../PGOProfile/icp_vtable_tail_call.ll        |  92 +++++
 6 files changed, 876 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
 create mode 100644 llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll

diff --git a/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h b/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h
index 8a05e913a9106..eda672d7d50ee 100644
--- a/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h
+++ b/llvm/include/llvm/Analysis/IndirectCallPromotionAnalysis.h
@@ -57,7 +57,7 @@ class ICallPromotionAnalysis {
   ///
   /// The returned array space is owned by this class, and overwritten on
   /// subsequent calls.
-  ArrayRef<InstrProfValueData>
+  MutableArrayRef<InstrProfValueData>
   getPromotionCandidatesForInstruction(const Instruction *I, uint32_t &NumVals,
                                        uint64_t &TotalCount,
                                        uint32_t &NumCandidates);
diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index ab53717eb889a..643c155ba6d7e 100644
--- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -87,7 +87,7 @@ uint32_t ICallPromotionAnalysis::getProfitablePromotionCandidates(
   return I;
 }
 
-ArrayRef<InstrProfValueData>
+MutableArrayRef<InstrProfValueData>
 ICallPromotionAnalysis::getPromotionCandidatesForInstruction(
     const Instruction *I, uint32_t &NumVals, uint64_t &TotalCount,
     uint32_t &NumCandidates) {
@@ -96,8 +96,8 @@ ICallPromotionAnalysis::getPromotionCandidatesForInstruction(
                                ValueDataArray.get(), NumVals, TotalCount);
   if (!Res) {
     NumCandidates = 0;
-    return ArrayRef<InstrProfValueData>();
+    return MutableArrayRef<InstrProfValueData>();
   }
   NumCandidates = getProfitablePromotionCandidates(I, NumVals, TotalCount);
-  return ArrayRef<InstrProfValueData>(ValueDataArray.get(), NumVals);
+  return MutableArrayRef<InstrProfValueData>(ValueDataArray.get(), NumVals);
 }
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 6a44a32bb34dc..85af3d7cc56b7 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -13,13 +13,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
 #include "llvm/Analysis/IndirectCallVisitor.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
@@ -51,6 +54,8 @@ using namespace llvm;
 STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
 STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
 
+extern cl::opt<unsigned> MaxNumVTableAnnotations;
+
 // Command line option to disable indirect-call promotion with the default as
 // false. This is for debug purpose.
 static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
@@ -103,13 +108,71 @@ static cl::opt<bool>
     ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
                  cl::desc("Dump IR after transformation happens"));
 
+// This option is meant to be used by LLVM regression tests to exercise the
+// transformation that compares vtables.
+// TODO: ICP pass will do cost-benefit analysis between function-based
+// comparison and vtable-based comparison and choose one of the two
+// transformations.
+static cl::opt<bool> ICPEnableVTableCmp(
+    "icp-enable-vtable-cmp", cl::init(false), cl::Hidden,
+    cl::desc("If ThinLTO and WPD is enabled and this option is true, "
+             "indirect-call promotion pass will compare vtables rather than "
+             "functions for speculative devirtualization of virtual calls."
+             " If set to false, indirect-call promotion pass will always "
+             "compare functions."));
+
 namespace {
 
+using VTableAddressPointOffsetValMap =
+    SmallDenseMap<const GlobalVariable *, SmallDenseMap<int, Constant *, 4>, 8>;
+
+// A struct to collect type information for a virtual call site.
+struct VirtualCallSiteInfo {
+  // The offset from the address point to virtual function in the vtable.
+  uint64_t FunctionOffset;
+  // The instruction that computes the address point of vtable.
+  Instruction *VPtr;
+  // The compatible type used in LLVM type intrinsics.
+  StringRef CompatibleTypeStr;
+};
+
+// The key is a virtual call, and value is its type information.
+using VirtualCallSiteTypeInfoMap =
+    SmallDenseMap<const CallBase *, VirtualCallSiteInfo, 8>;
+
+// Given the list of compatible type metadata for a vtable and one specified
+// type, returns the address point offset of the type if any.
+static std::optional<uint64_t>
+getCompatibleTypeOffset(const ArrayRef<MDNode *> &Types,
+                        StringRef CompatibleType) {
+  if (Types.empty()) {
+    return std::nullopt;
+  }
+  std::optional<uint64_t> Offset;
+  // Find the offset whose type string equals the one in the llvm.type.test
+  // intrinsic.
+  for (MDNode *Type : Types) {
+    auto TypeIDMetadata = Type->getOperand(1).get();
+    if (auto *TypeId = dyn_cast<MDString>(TypeIDMetadata)) {
+      StringRef TypeStr = TypeId->getString();
+      if (TypeStr != CompatibleType) {
+        continue;
+      }
+      Offset = cast<ConstantInt>(
+                   cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+                   ->getZExtValue();
+      break;
+    }
+  }
+  return Offset;
+}
+
 // Promote indirect calls to conditional direct calls, keeping track of
 // thresholds.
 class IndirectCallPromoter {
 private:
   Function &F;
+  Module &M;
 
   // Symtab that maps indirect call profile values to function names and
   // defines.
@@ -117,6 +180,11 @@ class IndirectCallPromoter {
 
   const bool SamplePGO;
 
+  // A map from a virtual call to its type information.
+  const VirtualCallSiteTypeInfoMap &VirtualCSInfo;
+
+  VTableAddressPointOffsetValMap &VTableAddressPointOffsetVal;
+
   OptimizationRemarkEmitter &ORE;
 
   // A struct that records the direct target and it's call count.
@@ -124,9 +192,17 @@ class IndirectCallPromoter {
     Function *const TargetFunction;
     const uint64_t Count;
 
+    uint64_t FunctionOffset;
+
+    SmallVector<std::pair<uint64_t, uint64_t>, 2> VTableGUIDAndCounts;
+
+    SmallVector<Constant *, 2> AddressPoints;
+
     PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
   };
 
+  using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t, 4>;
+
   // Check if the indirect-call call site should be promoted. Return the number
   // of promotions. Inst is the candidate indirect call, ValueDataRef
   // contains the array of value profile data for profiled targets,
@@ -134,7 +210,8 @@ class IndirectCallPromoter {
   // NumCandidates is the number of candidate entries in ValueDataRef.
   std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
       const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
-      uint64_t TotalCount, uint32_t NumCandidates);
+      uint64_t TotalCount, uint32_t NumCandidates,
+      VTableGUIDCountsMap &VTableGUIDCounts);
 
   // Promote a list of targets for one indirect-call callsite by comparing
   // indirect callee with functions. Returns true if there are IR
@@ -144,10 +221,33 @@ class IndirectCallPromoter {
       uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
       uint32_t NumCandidates);
 
+  bool tryToPromoteWithVTableCmp(
+      CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+      uint64_t TotalFuncCount, uint32_t NumCandidates,
+      MutableArrayRef<InstrProfValueData> ICallProfDataRef,
+      VTableGUIDCountsMap &VTableGUIDCounts);
+
+  void
+  tryGetVTableInfos(const CallBase &CB,
+                    const SmallDenseMap<Function *, int, 4> &CalleeIndexMap,
+                    VTableGUIDCountsMap &VTableGUIDCounts,
+                    std::vector<PromotionCandidate> &Candidates);
+
+  Constant *getOrCreateVTableAddressPointVar(GlobalVariable *GV,
+                                             uint64_t AddressPointOffset);
+
+  bool isProfitableToCompareVTables(
+      const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount);
+
 public:
-  IndirectCallPromoter(Function &Func, InstrProfSymtab *Symtab, bool SamplePGO,
-                       OptimizationRemarkEmitter &ORE)
-      : F(Func), Symtab(Symtab), SamplePGO(SamplePGO), ORE(ORE) {}
+  IndirectCallPromoter(
+      Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO,
+      const VirtualCallSiteTypeInfoMap &VirtualCSInfo,
+      VTableAddressPointOffsetValMap &VTableAddressPointOffsetVal,
+      OptimizationRemarkEmitter &ORE)
+      : F(Func), M(M), Symtab(Symtab), SamplePGO(SamplePGO),
+        VirtualCSInfo(VirtualCSInfo),
+        VTableAddressPointOffsetVal(VTableAddressPointOffsetVal), ORE(ORE) {}
   IndirectCallPromoter(const IndirectCallPromoter &) = delete;
   IndirectCallPromoter &operator=(const IndirectCallPromoter &) = delete;
 
@@ -161,9 +261,12 @@ class IndirectCallPromoter {
 std::vector<IndirectCallPromoter::PromotionCandidate>
 IndirectCallPromoter::getPromotionCandidatesForCallSite(
     const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
-    uint64_t TotalCount, uint32_t NumCandidates) {
+    uint64_t TotalCount, uint32_t NumCandidates,
+    VTableGUIDCountsMap &VTableGUIDCounts) {
   std::vector<PromotionCandidate> Ret;
 
+  SmallDenseMap<Function *, int, 4> CalleeIndexMap;
+
   LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
                     << " Num_targets: " << ValueDataRef.size()
                     << " Num_candidates: " << NumCandidates << "\n");
@@ -237,30 +340,114 @@ IndirectCallPromoter::getPromotionCandidatesForCallSite(
       break;
     }
 
+    CalleeIndexMap[TargetFunction] = Ret.size();
     Ret.push_back(PromotionCandidate(TargetFunction, Count));
+
     TotalCount -= Count;
   }
+
+  if (!ICPEnableVTableCmp)
+    return Ret;
+
+  tryGetVTableInfos(CB, CalleeIndexMap, VTableGUIDCounts, Ret);
+
+  return Ret;
+}
+
+Constant *IndirectCallPromoter::getOrCreateVTableAddressPointVar(
+    GlobalVariable *GV, uint64_t AddressPointOffset) {
+  Constant *Var = VTableAddressPointOffsetVal[GV][AddressPointOffset];
+  if (Var != nullptr)
+    return Var;
+  Constant *Ret = getVTableAddressPointOffset(GV, AddressPointOffset);
+  VTableAddressPointOffsetVal[GV][AddressPointOffset] = Ret;
   return Ret;
 }
 
+void IndirectCallPromoter::tryGetVTableInfos(
+    const CallBase &CB, const SmallDenseMap<Function *, int, 4> &CalleeIndexMap,
+    VTableGUIDCountsMap &GUIDCountsMap,
+    std::vector<PromotionCandidate> &Candidates) {
+  if (!ICPEnableVTableCmp)
+    return;
+
+  auto Iter = VirtualCSInfo.find(&CB);
+  if (Iter == VirtualCSInfo.end())
+    return;
+
+  auto &VirtualCallInfo = Iter->second;
+
+  uint32_t ActualNumValueData = 0;
+
+  uint64_t TotalVTableCount = 0;
+  auto VTableValueDataArray = getValueProfDataFromInst(
+      *VirtualCallInfo.VPtr, IPVK_VTableTarget, MaxNumVTableAnnotations,
+      ActualNumValueData, TotalVTableCount);
+
+  if (VTableValueDataArray.get() == nullptr)
+    return;
+
+  SmallVector<MDNode *, 2> Types; // type metadata associated with a vtable.
+  // Compute the functions and counts contributed by each vtable.
+  for (size_t j = 0; j < ActualNumValueData; j++) {
+    uint64_t VTableVal = VTableValueDataArray[j].Value;
+    GUIDCountsMap[VTableVal] = VTableValueDataArray[j].Count;
+    GlobalVariable *VTableVariable = Symtab->getGlobalVariable(VTableVal);
+    if (!VTableVariable) {
+      LLVM_DEBUG(dbgs() << "\tCannot find vtable definition for " << VTableVal
+                        << "\n");
+      continue;
+    }
+
+    Types.clear();
+    VTableVariable->getMetadata(LLVMContext::MD_type, Types);
+    std::optional<uint64_t> MaybeAddressPointOffset =
+        getCompatibleTypeOffset(Types, VirtualCallInfo.CompatibleTypeStr);
+    if (!MaybeAddressPointOffset)
+      continue;
+
+    const uint64_t AddressPointOffset = *MaybeAddressPointOffset;
+
+    Function *Callee = nullptr;
+
+    std::tie(Callee, std::ignore) = getFunctionAtVTableOffset(
+        VTableVariable, AddressPointOffset + VirtualCallInfo.FunctionOffset,
+        *(F.getParent()));
+    if (!Callee)
+      continue;
+
+    auto CalleeIndexIter = CalleeIndexMap.find(Callee);
+    if (CalleeIndexIter == CalleeIndexMap.end())
+      continue;
+
+    auto &Candidate = Candidates[CalleeIndexIter->second];
+    Candidate.VTableGUIDAndCounts.push_back(
+        {VTableVal, VTableValueDataArray[j].Count});
+    Candidate.AddressPoints.push_back(
+        getOrCreateVTableAddressPointVar(VTableVariable, AddressPointOffset));
+  }
+}
+
+static MDNode *getBranchWeights(LLVMContext &Context, uint64_t IfCount,
+                                uint64_t ElseCount) {
+  MDBuilder MDB(Context);
+  uint64_t Scale = calculateCountScale(std::max(IfCount, ElseCount));
+  return MDB.createBranchWeights(scaleBranchCount(IfCount, Scale),
+                                 scaleBranchCount(ElseCount, Scale));
+}
+
 CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
                                          uint64_t Count, uint64_t TotalCount,
                                          bool AttachProfToDirectCall,
                                          OptimizationRemarkEmitter *ORE) {
-
-  uint64_t ElseCount = TotalCount - Count;
-  uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
-  uint64_t Scale = calculateCountScale(MaxCount);
-  MDBuilder MDB(CB.getContext());
-  MDNode *BranchWeights = MDB.createBranchWeights(
-      scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
+  MDNode *BranchWeights =
+      getBranchWeights(CB.getContext(), Count, TotalCount - Count);
 
   CallBase &NewInst =
       promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
 
-  if (AttachProfToDirectCall) {
+  if (AttachProfToDirectCall)
     setBranchWeights(NewInst, {static_cast<uint32_t>(Count)});
-  }
 
   using namespace ore;
 
@@ -304,6 +491,80 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
   return Changed;
 }
 
+bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
+    CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+    uint64_t TotalFuncCount, uint32_t NumCandidates,
+    MutableArrayRef<InstrProfValueData> ICallProfDataRef,
+    VTableGUIDCountsMap &VTableGUIDCounts) {
+  Instruction *VPtr = VirtualCSInfo.at(&CB).VPtr;
+  // Counts moved to each promoted direct call; 64-bit to match IfCount.
+  SmallVector<uint64_t, 4> PromotedFuncCount;
+  for (const auto &Candidate : Candidates) {
+    uint64_t IfCount = 0;
+    // FIXME: Skip vtables with cold count in the comparison.
+    for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts) {
+      IfCount += Count;
+      VTableGUIDCounts[GUID] -= Count;
+    }
+
+    promoteCallWithVTableCmp(
+        CB, VPtr, Candidate.TargetFunction, Candidate.AddressPoints,
+        getBranchWeights(CB.getContext(), IfCount, TotalFuncCount - IfCount));
+
+    PromotedFuncCount.push_back(IfCount);
+
+    TotalFuncCount -= IfCount;
+    NumOfPGOICallPromotion++;
+  }
+
+  if (PromotedFuncCount.empty())
+    return false;
+
+  // A comparator that sorts value profile data in descending order of count.
+  auto Cmp = [](const InstrProfValueData &LHS, const InstrProfValueData &RHS) {
+    return LHS.Count > RHS.Count;
+  };
+
+  CB.setMetadata(LLVMContext::MD_prof, nullptr);
+  // Update indirect call value profiles if total count of the call site is not
+  // zero.
+  if (TotalFuncCount != 0) {
+    for (size_t I = 0; I < PromotedFuncCount.size(); I++)
+      ICallProfDataRef[I].Count -= PromotedFuncCount[I];
+
+    llvm::sort(ICallProfDataRef.begin(), ICallProfDataRef.end(), Cmp);
+
+    // Locate the first <target, count> pair whose (unsigned) count is zero.
+    auto UB = llvm::upper_bound(
+        ICallProfDataRef, 0U,
+        [](uint64_t Count, const InstrProfValueData &ProfData) {
+          return ProfData.Count <= Count;
+        });
+
+    ArrayRef<InstrProfValueData> VDs(ICallProfDataRef.begin(), UB);
+    annotateValueSite(M, CB, VDs, TotalFuncCount, IPVK_IndirectCallTarget,
+                      NumCandidates);
+  }
+  // Update vtable profile metadata with the counts left after promotion.
+  VPtr->setMetadata(LLVMContext::MD_prof, nullptr);
+  std::vector<InstrProfValueData> VTableValueProfiles;
+  uint64_t TotalVTableCount = 0;
+  for (auto [GUID, Count] : VTableGUIDCounts) {
+    if (Count == 0)
+      continue;
+
+    VTableValueProfiles.push_back({GUID, Count});
+    TotalVTableCount += Count;
+  }
+  llvm::sort(VTableValueProfiles, Cmp);
+
+  annotateValueSite(M, *VPtr, VTableValueProfiles, TotalVTableCount,
+                    IPVK_VTableTarget, VTableValueProfiles.size());
+
+  // At least one candidate was promoted, so the IR has changed.
+  return true;
+}
+
 // Traverse all the indirect-call callsite and get the value profile
 // annotation to perform indirect-call promotion.
 bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
@@ -317,14 +578,96 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
     if (!NumCandidates ||
         (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
       continue;
+    VTableGUIDCountsMap VTableGUIDCounts;
     auto PromotionCandidates = getPromotionCandidatesForCallSite(
-        *CB, ICallProfDataRef, TotalCount, NumCandidates);
-    Changed |= tryToPromoteWithFuncCmp(*CB, PromotionCandidates, TotalCount,
-                                       ICallProfDataRef, NumCandidates);
+        *CB, ICallProfDataRef, TotalCount, NumCandidates, VTableGUIDCounts);
+
+    if (isProfitableToCompareVTables(PromotionCandidates, TotalCount))
+      Changed |= tryToPromoteWithVTableCmp(*CB, PromotionCandidates, TotalCount,
+                                           NumCandidates, ICallProfDataRef,
+                                           VTableGUIDCounts);
+    else
+      Changed |= tryToPromoteWithFuncCmp(*CB, PromotionCandidates, TotalCount,
+                                         ICallProfDataRef, NumCandidates);
   }
   return Changed;
 }
 
+bool IndirectCallPromoter::isProfitableToCompareVTables(
+    const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount) {
+  if (!ICPEnableVTableCmp)
+    return false;
+
+  // FIXME: Implement cost-benefit analysis in a follow-up change.
+  return true;
+}
+
+static void
+computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
+                                  VirtualCallSiteTypeInfoMap &VirtualCSInfo) {
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
+    return FAM.getResult<DominatorTreeAnalysis>(F);
+  };
+
+  // Right now only llvm.type.test is used to find out virtual call sites.
+  // With ThinLTO and whole-program-devirtualization, llvm.type.test and
+  // llvm.public.type.test are emitted, and llvm.public.type.test is either
+  // refined to llvm.type.test or dropped before indirect-call-promotion pass.
+  //
+  // FIXME: For fullLTO with VFE, `llvm.type.checked.load intrinsic` is emitted.
+  // Find out virtual calls by looking at users of llvm.type.checked.load in
+  // that case.
+  Function *TypeTestFunc =
+      M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+
+  if (!TypeTestFunc || TypeTestFunc->use_empty())
+    return;
+
+  // Iterate all type.test calls and find all indirect calls.
+  for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
+    auto *CI = dyn_cast<CallInst>(U.getUser());
+    if (!CI)
+      continue;
+    // Skip the call unless its second operand is metadata wrapped as a value.
+    auto *TypeMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+    if (!TypeMDVal)
+      continue;
+
+    auto *CompatibleTypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+    if (!CompatibleTypeId)
+      continue;
+
+    StringRef CompatibleTypeStr = CompatibleTypeId->getString();
+
+    // Find out all devirtualizable call sites given a llvm.type.test intrinsic
+    // call.
+    SmallVector<DevirtCallSite, 1> DevirtCalls;
+    SmallVector<CallInst *, 1> Assumes;
+    auto &DT = LookupDomTree(*CI->getFunction());
+    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
+
+    // For each devirtualizable call, record the function offset, the vtable
+    // pointer instruction and the compatible type id of this type test.
+    for (auto &DevirtCall : DevirtCalls) {
+      CallBase &CB = DevirtCall.CB;
+      // This is the offset from the address point offset to the virtual
+      // function.
+      uint64_t Offset = DevirtCall.Offset;
+
+      // Given an indirect call, try to find the instruction which loads a
+      // pointer to virtual table.
+      Instruction *VTablePtr =
+          PGOIndirectCallVisitor::tryGetVTableInstruction(&CB);
+
+      if (!VTablePtr)
+        continue;
+
+      VirtualCSInfo[&CB] = {Offset, VTablePtr, CompatibleTypeStr};
+    }
+  }
+}
+
 // A wrapper function that does the actual work.
 static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
                                  bool SamplePGO, ModuleAnalysisManager &MAM) {
@@ -337,6 +680,17 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
     return false;
   }
   bool Changed = false;
+  VirtualCallSiteTypeInfoMap VirtualCSInfo;
+
+  computeVirtualCallSiteTypeInfoMap(M, MAM, VirtualCSInfo);
+
+  // This map records states across functions in an LLVM IR module.
+  // IndirectCallPromoter processes one
+  // function at a time and updates this map with new entries the first time
+  // the entry is needed in the module; the subsequent functions could re-use
+  // map entries inserted when processing prior functions.
+  VTableAddressPointOffsetValMap VTableAddressPointOffsetVal;
+
   for (auto &F : M) {
     if (F.isDeclaration() || F.hasOptNone())
       continue;
@@ -345,7 +699,8 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
         MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
     auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-    IndirectCallPromoter CallPromoter(F, &Symtab, SamplePGO, ORE);
+    IndirectCallPromoter CallPromoter(F, M, &Symtab, SamplePGO, VirtualCSInfo,
+                                      VTableAddressPointOffsetVal, ORE);
     bool FuncChanged = CallPromoter.processFunction(PSI);
     if (ICPDUMPAFTER && FuncChanged) {
       LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
new file mode 100644
index 0000000000000..75eda4b66be02
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
@@ -0,0 +1,206 @@
+
+; RUN: opt < %s -passes=pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-FUNC
+
+; Invoke instcombine after pgo-icall-prom so the address calculation instructions for virtual calls get sunk into the basic block for indirect fallback.
+; RUN: opt < %s -passes='pgo-icall-prom,instcombine' -icp-enable-vtable-cmp -S | FileCheck %s --check-prefix=ICALL-VTABLE
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV4Base = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0
+ at _ZTV8Derived1 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN8Derived15func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0, !type !1
+ at _ZTV8Derived2 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN8Derived25func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0, !type !2
+ at _ZTV8Derived3 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN8Derived35func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0, !type !3
+
+; Test the IR transformation from function-based indirect-call promotion and vtable-based indirect-call promotion.
+
+; The tested function has one function candidate which comes from one vtable.
+define i32 @test_one_function_one_vtable(ptr %d) {
+; ICALL-FUNC-LABEL: define i32 @test_one_function_one_vtable(
+; ICALL-FUNC-SAME: ptr [[D:%.*]]) {
+; ICALL-FUNC-NEXT:  entry:
+; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF4:![0-9]+]]
+; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-FUNC-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
+; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
+; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func2Ev
+; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5:![0-9]+]]
+; ICALL-FUNC:       if.true.direct_targ:
+; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr [[D]])
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP:%.*]]
+; ICALL-FUNC:       if.false.orig_indirect:
+; ICALL-FUNC-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP1]](ptr [[D]])
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
+; ICALL-FUNC:       if.end.icp:
+; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-FUNC-NEXT:    ret i32 [[TMP4]]
+;
+; ICALL-VTABLE-LABEL: define i32 @test_one_function_one_vtable(
+; ICALL-VTABLE-SAME: ptr [[D:%.*]]) {
+; ICALL-VTABLE-NEXT:  entry:
+; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived2, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    br i1 [[TMP1]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4:![0-9]+]]
+; ICALL-VTABLE:       if.true.direct_targ:
+; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr nonnull [[D]])
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP:%.*]]
+; ICALL-VTABLE:       if.false.orig_indirect:
+; ICALL-VTABLE-NEXT:    [[VFN:%.*]] = getelementptr inbounds i8, ptr [[VTABLE]], i64 8
+; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VFN]], align 8
+; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP3]](ptr nonnull [[D]])
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
+; ICALL-VTABLE:       if.end.icp:
+; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP2]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-VTABLE-NEXT:    ret i32 [[TMP4]]
+;
+entry:
+  %vtable = load ptr, ptr %d, !prof !4
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
+  tail call void @llvm.assume(i1 %0)
+  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  %1 = load ptr, ptr %vfn
+  %call = tail call i32 %1(ptr %d), !prof !5
+  ret i32 %call
+}
+
+; The tested function has one function candidate which comes from two vtables.
+define i32 @test_one_function_two_vtables(ptr %d) {
+; ICALL-FUNC-LABEL: define i32 @test_one_function_two_vtables(
+; ICALL-FUNC-SAME: ptr [[D:%.*]]) {
+; ICALL-FUNC-NEXT:  entry:
+; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF6:![0-9]+]]
+; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-FUNC-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
+; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
+; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func2Ev
+; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5]]
+; ICALL-FUNC:       if.true.direct_targ:
+; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr [[D]])
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP:%.*]]
+; ICALL-FUNC:       if.false.orig_indirect:
+; ICALL-FUNC-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP1]](ptr [[D]])
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
+; ICALL-FUNC:       if.end.icp:
+; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-FUNC-NEXT:    ret i32 [[TMP4]]
+;
+; ICALL-VTABLE-LABEL: define i32 @test_one_function_two_vtables(
+; ICALL-VTABLE-SAME: ptr [[D:%.*]]) {
+; ICALL-VTABLE-NEXT:  entry:
+; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived1, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived2, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    [[ICMP_OR:%.*]] = or i1 [[TMP1]], [[TMP2]]
+; ICALL-VTABLE-NEXT:    br i1 [[ICMP_OR]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4]]
+; ICALL-VTABLE:       if.true.direct_targ:
+; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr nonnull [[D]])
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP:%.*]]
+; ICALL-VTABLE:       if.false.orig_indirect:
+; ICALL-VTABLE-NEXT:    [[VFN:%.*]] = getelementptr inbounds i8, ptr [[VTABLE]], i64 8
+; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[VFN]], align 8
+; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP4]](ptr nonnull [[D]])
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
+; ICALL-VTABLE:       if.end.icp:
+; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-VTABLE-NEXT:    ret i32 [[TMP5]]
+;
+entry:
+  %vtable = load ptr, ptr %d, !prof !6
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
+  tail call void @llvm.assume(i1 %0)
+  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  %1 = load ptr, ptr %vfn
+  %call = tail call i32 %1(ptr %d), !prof !5
+  ret i32 %call
+}
+
+; The tested function has one function candidate which comes from three vtables.
+define i32 @test_one_function_three_vtables(ptr %d) {
+; ICALL-FUNC-LABEL: define i32 @test_one_function_three_vtables(
+; ICALL-FUNC-SAME: ptr [[D:%.*]]) {
+; ICALL-FUNC-NEXT:  entry:
+; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF7:![0-9]+]]
+; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-FUNC-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
+; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
+; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func2Ev
+; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5]]
+; ICALL-FUNC:       if.true.direct_targ:
+; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr [[D]])
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP:%.*]]
+; ICALL-FUNC:       if.false.orig_indirect:
+; ICALL-FUNC-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP1]](ptr [[D]])
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
+; ICALL-FUNC:       if.end.icp:
+; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-FUNC-NEXT:    ret i32 [[TMP4]]
+;
+; ICALL-VTABLE-LABEL: define i32 @test_one_function_three_vtables(
+; ICALL-VTABLE-SAME: ptr [[D:%.*]]) {
+; ICALL-VTABLE-NEXT:  entry:
+; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived1, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived2, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    [[ICMP_OR:%.*]] = or i1 [[TMP1]], [[TMP2]]
+; ICALL-VTABLE-NEXT:    [[ICMP_OR1:%.*]] = or i1 [[ICMP_OR]], [[TMP3]]
+; ICALL-VTABLE-NEXT:    br i1 [[ICMP_OR1]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4]]
+; ICALL-VTABLE:       if.true.direct_targ:
+; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr nonnull [[D]])
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP:%.*]]
+; ICALL-VTABLE:       if.false.orig_indirect:
+; ICALL-VTABLE-NEXT:    [[VFN:%.*]] = getelementptr inbounds i8, ptr [[VTABLE]], i64 8
+; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[VFN]], align 8
+; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP5]](ptr nonnull [[D]])
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
+; ICALL-VTABLE:       if.end.icp:
+; ICALL-VTABLE-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP4]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-VTABLE-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  %vtable = load ptr, ptr %d, !prof !7
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
+  tail call void @llvm.assume(i1 %0)
+  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  %1 = load ptr, ptr %vfn
+  %call = tail call i32 %1(ptr %d), !prof !5
+  ret i32 %call
+}
+
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1 noundef)
+declare i32 @_ZN4Base5func1Ei(ptr, i32)
+declare i32 @_ZN8Derived15func1Ei(ptr, i32)
+declare i32 @_ZN8Derived25func1Ei(ptr, i32)
+declare i32 @_ZN8Derived35func1Ei(ptr, i32)
+
+define i32 @_ZN4Base5func2Ev(ptr %this) {
+entry:
+  ret i32 0
+}
+
+!0 = !{i64 16, !"_ZTS4Base"}
+!1 = !{i64 16, !"_ZTS8Derived1"}
+!2 = !{i64 16, !"_ZTS8Derived2"}
+!3 = !{i64 16, !"_ZTS8Derived3"}
+!4 = !{!"VP", i32 2, i64 1600, i64 5035968517245772950, i64 1600}
+!5 = !{!"VP", i32 0, i64 1600, i64 -3104805163612457913, i64 1600}
+!6 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 1000, i64 5035968517245772950, i64 600}
+!7 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 600, i64 5035968517245772950, i64 550, i64 1960855528937986108, i64 450}
+
+; ICALL-FUNC: [[PROF4]] = !{!"VP", i32 2, i64 1600, i64 5035968517245772950, i64 1600}
+; ICALL-FUNC: [[PROF5]] = !{!"branch_weights", i32 1600, i32 0}
+; ICALL-FUNC: [[PROF6]] = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 1000, i64 5035968517245772950, i64 600}
+; ICALL-FUNC: [[PROF7]] = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 600, i64 5035968517245772950, i64 550, i64 1960855528937986108, i64 450}
+
+; ICALL-VTABLE: [[PROF4]] = !{!"branch_weights", i32 1600, i32 0}
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
new file mode 100644
index 0000000000000..a2924420fd2a0
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=pgo-icall-prom -S  | FileCheck %s --check-prefix=ICALL-FUNC
+; RUN: opt < %s -passes='pgo-icall-prom,instcombine' -icp-enable-vtable-cmp -S | FileCheck %s --check-prefix=ICALL-VTABLE
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.Error = type { i8 }
+
+ at _ZTI5Error = dso_local constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr null, i64 2), ptr null }
+ at _ZTV4Base = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base10get_ticketEv] }, !type !0, !type !1
+ at _ZTV7Derived = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived10get_ticketEv] }, !type !0, !type !1, !type !2, !type !3
+
+ at .str = private unnamed_addr constant [15 x i8] c"out of tickets\00"
+
+define i32 @_Z4testP4Base(ptr %b) personality ptr @__gxx_personality_v0 {
+; ICALL-FUNC-LABEL: define i32 @_Z4testP4Base(
+; ICALL-FUNC-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 {
+; ICALL-FUNC-NEXT:  entry:
+; ICALL-FUNC-NEXT:    [[E:%.*]] = alloca [[CLASS_ERROR:%.*]], align 8
+; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[B]], align 8, !prof [[PROF4:![0-9]+]]
+; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN7Derived10get_ticketEv
+; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5:![0-9]+]]
+; ICALL-FUNC:       if.true.direct_targ:
+; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = invoke i32 @_ZN7Derived10get_ticketEv(ptr [[B]])
+; ICALL-FUNC-NEXT:            to label [[IF_END_ICP:%.*]] unwind label [[LPAD:%.*]]
+; ICALL-FUNC:       if.false.orig_indirect:
+; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base10get_ticketEv
+; ICALL-FUNC-NEXT:    br i1 [[TMP4]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[IF_FALSE_ORIG_INDIRECT2:%.*]], !prof [[PROF6:![0-9]+]]
+; ICALL-FUNC:       if.true.direct_targ1:
+; ICALL-FUNC-NEXT:    [[TMP5:%.*]] = invoke i32 @_ZN4Base10get_ticketEv(ptr [[B]])
+; ICALL-FUNC-NEXT:            to label [[IF_END_ICP3:%.*]] unwind label [[LPAD]]
+; ICALL-FUNC:       if.false.orig_indirect2:
+; ICALL-FUNC-NEXT:    [[CALL:%.*]] = invoke i32 [[TMP1]](ptr [[B]])
+; ICALL-FUNC-NEXT:            to label [[IF_END_ICP3]] unwind label [[LPAD]]
+; ICALL-FUNC:       if.end.icp3:
+; ICALL-FUNC-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT2]] ], [ [[TMP5]], [[IF_TRUE_DIRECT_TARG1]] ]
+; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
+; ICALL-FUNC:       if.end.icp:
+; ICALL-FUNC-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP6]], [[IF_END_ICP3]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-FUNC-NEXT:    br label %try.cont
+; ICALL-FUNC:       lpad:
+
+;
+; ICALL-VTABLE-LABEL: define i32 @_Z4testP4Base(
+; ICALL-VTABLE-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 {
+; ICALL-VTABLE-NEXT:  entry:
+; ICALL-VTABLE-NEXT:    [[E:%.*]] = alloca [[CLASS_ERROR:%.*]], align 8
+; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[B]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4:![0-9]+]]
+; ICALL-VTABLE:       if.true.direct_targ:
+; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = invoke i32 @_ZN7Derived10get_ticketEv(ptr nonnull [[B]])
+; ICALL-VTABLE-NEXT:            to label [[IF_END_ICP:%.*]] unwind label [[LPAD:%.*]]
+; ICALL-VTABLE:       if.false.orig_indirect:
+; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    br i1 [[TMP4]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[IF_FALSE_ORIG_INDIRECT2:%.*]], !prof [[PROF5:![0-9]+]]
+; ICALL-VTABLE:       if.true.direct_targ1:
+; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = invoke i32 @_ZN4Base10get_ticketEv(ptr nonnull [[B]])
+; ICALL-VTABLE-NEXT:            to label [[IF_END_ICP3:%.*]] unwind label [[LPAD]]
+; ICALL-VTABLE:       if.false.orig_indirect2:
+; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = invoke i32 [[TMP1]](ptr nonnull [[B]])
+; ICALL-VTABLE-NEXT:            to label [[IF_END_ICP3]] unwind label [[LPAD]]
+; ICALL-VTABLE:       if.end.icp3:
+; ICALL-VTABLE-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT2]] ], [ [[TMP5]], [[IF_TRUE_DIRECT_TARG1]] ]
+; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
+; ICALL-VTABLE:       if.end.icp:
+; ICALL-VTABLE-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP6]], [[IF_END_ICP3]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
+; ICALL-VTABLE-NEXT:    br label %try.cont
+; ICALL-VTABLE:       lpad:
+;
+entry:
+  %e = alloca %class.Error
+  %vtable = load ptr, ptr %b, !prof !4
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
+  tail call void @llvm.assume(i1 %0)
+  %1 = load ptr, ptr %vtable
+  %call = invoke i32 %1(ptr %b)
+  to label %try.cont unwind label %lpad, !prof !5
+
+lpad:
+  %2 = landingpad { ptr, i32 }
+  cleanup
+  catch ptr @_ZTI5Error
+  %3 = extractvalue { ptr, i32 } %2, 1
+  %4 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTI5Error)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %ehcleanup
+
+catch:
+  %5 = extractvalue { ptr, i32 } %2, 0
+
+  %call3 = invoke i32 @_ZN5Error10error_codeEv(ptr nonnull align 1 dereferenceable(1) %e)
+  to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  %ret.0 = phi i32 [ %call3, %invoke.cont2 ], [ %call, %entry ]
+  ret i32 %ret.0
+
+lpad1:
+  %6 = landingpad { ptr, i32 }
+  cleanup
+  invoke void @__cxa_end_catch()
+  to label %invoke.cont4 unwind label %terminate.lpad
+
+invoke.cont4:
+  br label %ehcleanup
+
+ehcleanup:
+  %lpad.val7.merged = phi { ptr, i32 } [ %6, %invoke.cont4 ], [ %2, %lpad ]
+  resume { ptr, i32 } %lpad.val7.merged
+
+terminate.lpad:
+  %7 = landingpad { ptr, i32 }
+  catch ptr null
+  %8 = extractvalue { ptr, i32 } %7, 0
+  unreachable
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1 noundef)
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(ptr)
+
+declare i32 @_ZN5Error10error_codeEv(ptr nonnull align 1 dereferenceable(1))
+
+declare void @__cxa_end_catch()
+
+define i32 @_ZN4Base10get_ticketEv(ptr %this) align 2 personality ptr @__gxx_personality_v0 {
+entry:
+  %call = tail call i32 @_Z13get_ticket_idv()
+  %cmp.not = icmp eq i32 %call, -1
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  ret i32 %call
+
+if.end:
+  %exception = tail call ptr @__cxa_allocate_exception(i64 1)
+  invoke void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1) %exception, ptr nonnull @.str, i32 1)
+  to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  unreachable
+
+lpad:
+  %0 = landingpad { ptr, i32 }
+  cleanup
+  resume { ptr, i32 } %0
+}
+
+define i32 @_ZN7Derived10get_ticketEv(ptr %this) align 2 personality ptr @__gxx_personality_v0 {
+entry:
+  %call = tail call i32 @_Z13get_ticket_idv()
+  %cmp.not = icmp eq i32 %call, -1
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:
+  ret i32 %call
+
+if.end:
+  %exception = tail call ptr @__cxa_allocate_exception(i64 1)
+  invoke void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1) %exception, ptr nonnull @.str, i32 2)
+  to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  unreachable
+
+lpad:
+  %0 = landingpad { ptr, i32 }
+  cleanup
+  resume { ptr, i32 } %0
+}
+
+declare i32 @_Z13get_ticket_idv()
+declare ptr @__cxa_allocate_exception(i64)
+declare void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1), ptr, i32)
+
+!0 = !{i64 16, !"_ZTS4Base"}
+!1 = !{i64 16, !"_ZTSM4BaseFivE.virtual"}
+!2 = !{i64 16, !"_ZTS7Derived"}
+!3 = !{i64 16, !"_ZTSM7DerivedFivE.virtual"}
+!4 = !{!"VP", i32 2, i64 1600, i64 13870436605473471591, i64 900, i64 1960855528937986108, i64 700}
+!5 = !{!"VP", i32 0, i64 1600, i64 14811317294552474744, i64 900, i64 9261744921105590125, i64 700}
+
+; ICALL-FUNC: [[PROF4]] = !{!"VP", i32 2, i64 1600, i64 -4576307468236080025, i64 900, i64 1960855528937986108, i64 700}
+; ICALL-FUNC: [[PROF5]] = !{!"branch_weights", i32 900, i32 700}
+; ICALL-FUNC: [[PROF6]] = !{!"branch_weights", i32 700, i32 0}
+
+; ICALL-VTABLE: [[PROF4]] = !{!"branch_weights", i32 900, i32 700}
+; ICALL-VTABLE: [[PROF5]] = !{!"branch_weights", i32 700, i32 0}
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
new file mode 100644
index 0000000000000..94ed588c5458d
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt < %s -passes=pgo-icall-prom -pass-remarks=pgo-icall-prom -S 2>&1 | FileCheck %s --check-prefix=ICALL-FUNC
+; RUN: opt < %s -passes='pgo-icall-prom,instcombine' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -S 2>&1 | FileCheck %s --check-prefix=ICALL-VTABLE
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV7Derived = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived5func1Eii] }, align 8, !type !0, !type !1, !type !2, !type !3
+ at _ZTV4Base = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Eii] }, align 8, !type !0, !type !1
+
+define i32 @test_tail_call(ptr %ptr, i32 %a, i32 %b) {
+; ICALL-FUNC-LABEL: define i32 @test_tail_call(
+; ICALL-FUNC-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; ICALL-FUNC-NEXT:  entry:
+; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[PTR]], align 8, !prof [[PROF4:![0-9]+]]
+; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN7Derived5func1Eii
+; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[TMP4:%.*]], !prof [[PROF5:![0-9]+]]
+; ICALL-FUNC:       if.true.direct_targ:
+; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = musttail call i32 @_ZN7Derived5func1Eii(ptr [[PTR]], i32 [[A]], i32 [[B]])
+; ICALL-FUNC-NEXT:    ret i32 [[TMP3]]
+; ICALL-FUNC:       4:
+; ICALL-FUNC-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func1Eii
+; ICALL-FUNC-NEXT:    br i1 [[TMP5]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[TMP7:%.*]], !prof [[PROF6:![0-9]+]]
+; ICALL-FUNC:       if.true.direct_targ1:
+; ICALL-FUNC-NEXT:    [[TMP6:%.*]] = musttail call i32 @_ZN4Base5func1Eii(ptr [[PTR]], i32 [[A]], i32 [[B]])
+; ICALL-FUNC-NEXT:    ret i32 [[TMP6]]
+; ICALL-FUNC:       7:
+; ICALL-FUNC-NEXT:    [[CALL:%.*]] = musttail call i32 [[TMP1]](ptr [[PTR]], i32 [[A]], i32 [[B]])
+; ICALL-FUNC-NEXT:    ret i32 [[CALL]]
+;
+; ICALL-VTABLE-LABEL: define i32 @test_tail_call(
+; ICALL-VTABLE-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; ICALL-VTABLE-NEXT:  entry:
+; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[PTR]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[TMP4:%.*]], !prof [[PROF4:![0-9]+]]
+; ICALL-VTABLE:       if.true.direct_targ:
+; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = musttail call i32 @_ZN7Derived5func1Eii(ptr nonnull [[PTR]], i32 [[A]], i32 [[B]])
+; ICALL-VTABLE-NEXT:    ret i32 [[TMP3]]
+; ICALL-VTABLE:       4:
+; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2)
+; ICALL-VTABLE-NEXT:    br i1 [[TMP5]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[TMP7:%.*]], !prof [[PROF5:![0-9]+]]
+; ICALL-VTABLE:       if.true.direct_targ1:
+; ICALL-VTABLE-NEXT:    [[TMP6:%.*]] = musttail call i32 @_ZN4Base5func1Eii(ptr nonnull [[PTR]], i32 [[A]], i32 [[B]])
+; ICALL-VTABLE-NEXT:    ret i32 [[TMP6]]
+; ICALL-VTABLE:       7:
+; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = musttail call i32 [[TMP1]](ptr nonnull [[PTR]], i32 [[A]], i32 [[B]])
+; ICALL-VTABLE-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %vtable = load ptr, ptr %ptr, !prof !4
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
+  tail call void @llvm.assume(i1 %0)
+  %1 = load ptr, ptr %vtable
+  %call = musttail call i32 %1(ptr %ptr, i32 %a, i32 %b), !prof !5
+  ret i32 %call
+}
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+define i32 @_ZN7Derived5func1Eii(ptr %this, i32 %a, i32 %b) {
+entry:
+  %sub = sub nsw i32 %a, %b
+  ret i32 %sub
+}
+
+define i32 @_ZN4Base5func1Eii(ptr %this, i32 %a, i32 %b) {
+entry:
+  %add = add nsw i32 %b, %a
+  ret i32 %add
+}
+
+
+!0 = !{i64 16, !"_ZTS4Base"}
+!1 = !{i64 16, !"_ZTSM4BaseFiiiE.virtual"}
+!2 = !{i64 16, !"_ZTS7Derived"}
+!3 = !{i64 16, !"_ZTSM7DerivedFiiiE.virtual"}
+!4 = !{!"VP", i32 2, i64 1600, i64 13870436605473471591, i64 900, i64 1960855528937986108, i64 700}
+!5 = !{!"VP", i32 0, i64 1600, i64 7889036118036845314, i64 900, i64 10495086226207060333, i64 700}
+
+; ICALL-FUNC: [[PROF4]] = !{!"VP", i32 2, i64 1600, i64 -4576307468236080025, i64 900, i64 1960855528937986108, i64 700}
+; ICALL-FUNC: [[PROF5]] = !{!"branch_weights", i32 900, i32 700}
+; ICALL-FUNC: [[PROF6]] = !{!"branch_weights", i32 700, i32 0}
+
+; ICALL-VTABLE: [[PROF4]] = !{!"branch_weights", i32 900, i32 700}
+; ICALL-VTABLE: [[PROF5]] = !{!"branch_weights", i32 700, i32 0}

>From ff3c219db4d9d925af282e9f579fb9bd55985508 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Wed, 22 May 2024 12:18:20 -0700
Subject: [PATCH 06/16] [TypeProf][InstrFDO]Implement more efficient comparison
 sequence for indirect-call-promotion with vtable profiles.

Clang's `-fwhole-program-vtables` is required for this optimization to
take place. If `-fwhole-program-vtables` is not enabled, this change is
a no-op.

Function-comparison (before):

VTable-comparison (after):

Key changes:
1. Find virtual calls and the vtables they come from.
   - The ICP pass relies on the type intrinsics `llvm.type.test` and
     `llvm.public.type.test` to find virtual calls and their
     compatible vtables, and relies on type metadata to find the address
     point (offset) for comparison.
2. The ICP pass does cost-benefit analysis and compares vtables only
   when both of the following conditions are met:
   1) The function-addressing and vtable-load instructions can be sunk
      to the indirect fallback, and the indirect fallback is a cold
      block.
   2) The number of vtables for a function candidate is within an
      (option-specified) threshold.
3. Sink the function-addressing and vtable-load instructions to the
   indirect fallback.
   - The sink helper functions are simplified versions of
     `InstCombinerImpl::tryToSinkInstruction`.
   - The helper functions that handle debug intrinsics are copied from
     `InstCombinerImpl::tryToSinkInstructionDbgValues` and
     `InstCombinerImpl::tryToSinkInstructionDbgVariableRecords` into
     Transforms/Utils/Local.cpp. Ideally only one copy should exist
     for inst-combine, ICP and other passes.
4. Keep value profiles updated.
   1) Update vtable value profiles after inlining.
   2) For either function-based comparison or vtable-based comparison,
      update both vtable and indirect-call value profiles.
---
 .../Linux/instrprof-vtable-value-prof.cpp     | 104 +--
 .../llvm/Analysis/IndirectCallVisitor.h       |   3 +
 llvm/include/llvm/Transforms/Utils/Local.h    |   9 +
 .../Instrumentation/IndirectCallPromotion.cpp | 595 ++++++++++++------
 .../Transforms/Utils/CallPromotionUtils.cpp   |  20 +-
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |  36 +-
 llvm/lib/Transforms/Utils/Local.cpp           | 184 ++++++
 .../Transforms/Inline/update_invoke_prof.ll   |  74 ++-
 .../Transforms/Inline/update_value_profile.ll |  54 +-
 .../Transforms/PGOProfile/icp_vtable_cmp.ll   | 299 ++++-----
 .../PGOProfile/icp_vtable_invoke.ll           | 192 ++----
 .../PGOProfile/icp_vtable_tail_call.ll        |  83 +--
 12 files changed, 963 insertions(+), 690 deletions(-)

diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index e51805bdf923c..73921adcc0c15 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -5,59 +5,61 @@
 // ld.lld: error: /lib/../lib64/Scrt1.o: ABI version 1 is not supported
 // UNSUPPORTED: ppc && host-byteorder-big-endian
 
-// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -g -fprofile-generate=. -mllvm -enable-vtable-value-profiling %s -o %t-test
-// RUN: env LLVM_PROFILE_FILE=%t-test.profraw %t-test
+// RUN: rm -rf %t && mkdir %t && cd %t
+
+// RUN: %clangxx_pgogen -fuse-ld=lld -O2 -fprofile-generate=. -mllvm -enable-vtable-value-profiling %s -o test
+// RUN: env LLVM_PROFILE_FILE=test.profraw ./test
 
 // Show vtable profiles from raw profile.
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-test.profraw | FileCheck %s --check-prefixes=COMMON,RAW
+// RUN: llvm-profdata show --function=main --ic-targets --show-vtables test.profraw | FileCheck %s --check-prefixes=COMMON,RAW
 
 // Generate indexed profile from raw profile and show the data.
-// RUN: llvm-profdata merge %t-test.profraw -o %t-test.profdata
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-test.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED
+// RUN: llvm-profdata merge test.profraw -o test.profdata
+// RUN: llvm-profdata show --function=main --ic-targets --show-vtables test.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED
 
 // Generate text profile from raw and indexed profiles respectively and show the data.
-// RUN: llvm-profdata merge --text %t-test.profraw -o %t-raw.proftext
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text %t-raw.proftext | FileCheck %s --check-prefix=ICTEXT
-// RUN: llvm-profdata merge --text %t-test.profdata -o %t-indexed.proftext
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text %t-indexed.proftext | FileCheck %s --check-prefix=ICTEXT
+// RUN: llvm-profdata merge --text test.profraw -o raw.proftext
+// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text raw.proftext | FileCheck %s --check-prefix=ICTEXT
+// RUN: llvm-profdata merge --text test.profdata -o indexed.proftext
+// RUN: llvm-profdata show --function=main --ic-targets --show-vtables --text indexed.proftext | FileCheck %s --check-prefix=ICTEXT
 
 // Generate indexed profile from text profiles and show the data
-// RUN: llvm-profdata merge --binary %t-raw.proftext -o %t-text.profraw
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-text.profraw | FileCheck %s --check-prefixes=COMMON,INDEXED
-// RUN: llvm-profdata merge --binary %t-indexed.proftext -o %t-text.profdata
-// RUN: llvm-profdata show --function=main --ic-targets --show-vtables %t-text.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED
+// RUN: llvm-profdata merge --binary raw.proftext -o text.profraw
+// RUN: llvm-profdata show --function=main --ic-targets --show-vtables text.profraw | FileCheck %s --check-prefixes=COMMON,INDEXED
+// RUN: llvm-profdata merge --binary indexed.proftext -o text.profdata
+// RUN: llvm-profdata show --function=main --ic-targets --show-vtables text.profdata | FileCheck %s --check-prefixes=COMMON,INDEXED
 
 // COMMON: Counters:
 // COMMON-NEXT:  main:
-// COMMON-NEXT:  Hash: 0x0f9a16fe6d398548
-// COMMON-NEXT:  Counters: 2
+// COMMON-NEXT:  Hash: 0x068617320ec408a0
+// COMMON-NEXT:  Counters: 4
 // COMMON-NEXT:  Indirect Call Site Count: 2
 // COMMON-NEXT:  Number of instrumented vtables: 2
 // RAW:  Indirect Target Results:
-// RAW-NEXT:       [  0, _ZN8Derived15func1Eii,        250 ] (25.00%)
-// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func1Eii,        750 ] (75.00%)
-// RAW-NEXT:       [  1, _ZN8Derived15func2Eii,        250 ] (25.00%)
-// RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func2Eii,        750 ] (75.00%)
+// RAW-NEXT:       [  0, _ZN8Derived14funcEii,        50 ] (25.00%)
+// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
+// RAW-NEXT:       [  1, _ZN8Derived1D0Ev,        250 ] (25.00%)
+// RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
 // RAW-NEXT:  VTable Results:
-// RAW-NEXT:       [  0, _ZTV8Derived1,        250 ] (25.00%)
-// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
+// RAW-NEXT:       [  0, _ZTV8Derived1,        50 ] (25.00%)
+// RAW-NEXT:       [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
 // RAW-NEXT:       [  1, _ZTV8Derived1,        250 ] (25.00%)
 // RAW-NEXT:       [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
 // INDEXED:     Indirect Target Results:
-// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func1Eii,        750 ] (75.00%)
-// INDEXED-NEXT:         [  0, _ZN8Derived15func1Eii,        250 ] (25.00%)
-// INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func2Eii,        750 ] (75.00%)
-// INDEXED-NEXT:         [  1, _ZN8Derived15func2Eii,        250 ] (25.00%)
+// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii,        150 ] (75.00%)
+// INDEXED-NEXT:         [  0, _ZN8Derived14funcEii,        50 ] (25.00%)
+// INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev,        750 ] (75.00%)
+// INDEXED-NEXT:         [  1, _ZN8Derived1D0Ev,        250 ] (25.00%)
 // INDEXED-NEXT:     VTable Results:
-// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
-// INDEXED-NEXT:         [  0, _ZTV8Derived1,        250 ] (25.00%)
+// INDEXED-NEXT:         [  0, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        150 ] (75.00%)
+// INDEXED-NEXT:         [  0, _ZTV8Derived1,        50 ] (25.00%)
 // INDEXED-NEXT:         [  1, {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E,        750 ] (75.00%)
 // INDEXED-NEXT:         [  1, _ZTV8Derived1,        250 ] (25.00%)
 // COMMON: Instrumentation level: IR  entry_first = 0
 // COMMON-NEXT: Functions shown: 1
-// COMMON-NEXT: Total functions: 6
+// COMMON-NEXT: Total functions: 7
 // COMMON-NEXT: Maximum function count: 1000
-// COMMON-NEXT: Maximum internal block count: 250
+// COMMON-NEXT: Maximum internal block count: 1000
 // COMMON-NEXT: Statistics for indirect call sites profile:
 // COMMON-NEXT:   Total number of sites: 2
 // COMMON-NEXT:   Total number of sites with values: 2
@@ -76,11 +78,13 @@
 // ICTEXT: :ir
 // ICTEXT: main
 // ICTEXT: # Func Hash:
-// ICTEXT: 1124236338992350536
+// ICTEXT: 470088714870327456
 // ICTEXT: # Num Counters:
-// ICTEXT: 2
+// ICTEXT: 4
 // ICTEXT: # Counter Values:
 // ICTEXT: 1000
+// ICTEXT: 1000
+// ICTEXT: 200
 // ICTEXT: 1
 // ICTEXT: # Num Value Kinds:
 // ICTEXT: 2
@@ -89,41 +93,50 @@
 // ICTEXT: # NumValueSites:
 // ICTEXT: 2
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func1Eii:750
-// ICTEXT: _ZN8Derived15func1Eii:250
+// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived24funcEii:150
+// ICTEXT: _ZN8Derived14funcEii:50
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived25func2Eii:750
-// ICTEXT: _ZN8Derived15func2Eii:250
+// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZN12_GLOBAL__N_18Derived2D0Ev:750
+// ICTEXT: _ZN8Derived1D0Ev:250
 // ICTEXT: # ValueKind = IPVK_VTableTarget:
 // ICTEXT: 2
 // ICTEXT: # NumValueSites:
 // ICTEXT: 2
 // ICTEXT: 2
-// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
-// ICTEXT: _ZTV8Derived1:250
+// ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:150
+// ICTEXT: _ZTV8Derived1:50
 // ICTEXT: 2
 // ICTEXT: {{.*}}instrprof-vtable-value-prof.cpp;_ZTVN12_GLOBAL__N_18Derived2E:750
 // ICTEXT: _ZTV8Derived1:250
 
+// Test indirect call promotion transformation using vtable profiles.
+// RUN: %clangxx -fprofile-use=test.profdata -fuse-ld=lld -flto=thin -fwhole-program-vtables -O2 -mllvm -enable-vtable-value-profiling -mllvm -icp-enable-vtable-cmp -Rpass=pgo-icall-prom %s 2>&1 | FileCheck %s --check-prefix=REMARK --implicit-check-not="!VP"
+
+// REMARK: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, compare 1 vtables and sink 1 instructions
+// REMARK: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, compare 1 vtables and sink 1 instructions
+// REMARK: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, compare 1 vtables and sink 2 instructions
+// REMARK: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, compare 1 vtables and sink 2 instructions
+
 #include <cstdio>
 #include <cstdlib>
 class Base {
 public:
-  virtual int func1(int a, int b) = 0;
-  virtual int func2(int a, int b) = 0;
+  virtual int func(int a, int b) = 0;
+
+  virtual ~Base() {};
 };
 class Derived1 : public Base {
 public:
-  int func1(int a, int b) override { return a + b; }
+  int func(int a, int b) override { return a * b; }
 
-  int func2(int a, int b) override { return a * b; }
+  ~Derived1() {}
 };
 namespace {
 class Derived2 : public Base {
 public:
-  int func1(int a, int b) override { return a - b; }
+  int func(int a, int b) override { return a * (a - b); }
 
-  int func2(int a, int b) override { return a * (a - b); }
+  ~Derived2() {}
 };
 } // namespace
 __attribute__((noinline)) Base *createType(int a) {
@@ -140,7 +153,10 @@ int main(int argc, char **argv) {
     int a = rand();
     int b = rand();
     Base *ptr = createType(i);
-    sum += ptr->func1(a, b) + ptr->func2(b, a);
+    if (i % 5 == 0)
+      sum += ptr->func(b, a);
+
+    delete ptr;
   }
   printf("sum is %d\n", sum);
   return 0;
diff --git a/llvm/include/llvm/Analysis/IndirectCallVisitor.h b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
index 66c972572b06c..f070e83c41689 100644
--- a/llvm/include/llvm/Analysis/IndirectCallVisitor.h
+++ b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
@@ -37,6 +37,9 @@ struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
   // A heuristic is used to find the address feeding instructions.
   static Instruction *tryGetVTableInstruction(CallBase *CB) {
     assert(CB != nullptr && "Caller guaranteed");
+    if (!CB->isIndirectCall())
+      return nullptr;
+
     LoadInst *LI = dyn_cast<LoadInst>(CB->getCalledOperand());
 
     if (LI != nullptr) {
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 6937ec8dfd21c..5535a722a40fe 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -316,6 +316,15 @@ void salvageDebugInfoForDbgValues(Instruction &I,
                                   ArrayRef<DbgVariableIntrinsic *> Insns,
                                   ArrayRef<DbgVariableRecord *> DPInsns);
 
+void tryToSinkInstructionDbgValues(
+    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
+    BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers);
+
+void tryToSinkInstructionDPValues(
+    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
+    BasicBlock *DestBlock,
+    SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords);
+
 /// Given an instruction \p I and DIExpression \p DIExpr operating on
 /// it, append the effects of \p I to the DIExpression operand list
 /// \p Ops, or return \p nullptr if it cannot be salvaged.
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index a121cf74e71a0..4de0aaef8d7ca 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -40,6 +41,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <memory>
@@ -110,9 +112,6 @@ static cl::opt<bool>
 
 // This option is meant to be used by LLVM regression test and test the
 // transformation that compares vtables.
-// TODO: ICP pass will do cost-benefit analysis between function-based
-// comparison and vtable-based comparison and choose one of the two
-// transformations.
 static cl::opt<bool> ICPEnableVTableCmp(
     "icp-enable-vtable-cmp", cl::init(false), cl::Hidden,
     cl::desc("If ThinLTO and WPD is enabled and this option is true, "
@@ -121,6 +120,15 @@ static cl::opt<bool> ICPEnableVTableCmp(
              " If set to false, indirect-call promotion pass will always "
              "compare functions."));
 
+static cl::opt<float>
+    ICPVTableCountPercentage("icp-vtable-count-percentage", cl::init(0.99),
+                             cl::Hidden,
+                             cl::desc("Percentage of vtable count to compare"));
+
+static cl::opt<int> ICPNumAdditionalVTableLast(
+    "icp-num-additional-vtable-last", cl::init(0), cl::Hidden,
+    cl::desc("The number of additional instruction for the last candidate"));
+
 namespace {
 
 using VTableAddressPointOffsetValMap =
@@ -140,31 +148,22 @@ struct VirtualCallSiteInfo {
 using VirtualCallSiteTypeInfoMap =
     SmallDenseMap<const CallBase *, VirtualCallSiteInfo, 8>;
 
-// Given the list of compatible type metadata for a vtable and one specified
-// type, returns the address point offset of the type if any.
+// Find the offset where type string is `CompatibleType`.
 static std::optional<uint64_t>
-getCompatibleTypeOffset(const ArrayRef<MDNode *> &Types,
+getCompatibleTypeOffset(const GlobalVariable &VTableVar,
                         StringRef CompatibleType) {
-  if (Types.empty()) {
-    return std::nullopt;
-  }
-  std::optional<uint64_t> Offset;
-  // find the offset where type string is equal to the one in llvm.type.test
-  // intrinsic
-  for (MDNode *Type : Types) {
-    auto TypeIDMetadata = Type->getOperand(1).get();
-    if (auto *TypeId = dyn_cast<MDString>(TypeIDMetadata)) {
-      StringRef TypeStr = TypeId->getString();
-      if (TypeStr != CompatibleType) {
-        continue;
-      }
-      Offset = cast<ConstantInt>(
-                   cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
-                   ->getZExtValue();
-      break;
-    }
-  }
-  return Offset;
+  SmallVector<MDNode *, 2> Types; // type metadata associated with a vtable.
+  VTableVar.getMetadata(LLVMContext::MD_type, Types);
+
+  for (MDNode *Type : Types)
+    if (auto *TypeId = dyn_cast<MDString>(Type->getOperand(1).get());
+        TypeId && TypeId->getString() == CompatibleType)
+
+      return cast<ConstantInt>(
+                 cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+          ->getZExtValue();
+
+  return std::nullopt;
 }
 
 // Returns a constant representing the vtable's address point specified by the
@@ -182,6 +181,123 @@ static Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
       llvm::ConstantInt::get(Type::getInt32Ty(Context), AddressPointOffset));
 }
 
+// Returns the basic block in which `Inst` is used by `UserInst`.
+static BasicBlock *getUserBasicBlock(Instruction *Inst, unsigned int OperandNo,
+                                     Instruction *UserInst) {
+  if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+    return PN->getIncomingBlock(
+        PHINode::getIncomingValueNumForOperand(OperandNo));
+
+  return UserInst->getParent();
+}
+
+// `DestBB` is a suitable basic block to sink `Inst` into when the following
+// conditions are true:
+// 1) `Inst->getParent()` is the sole predecessor of `DestBB`. This way `DestBB`
+//    is dominated by `Inst->getParent()` and we don't need to sink across a
+//    critical edge.
+// 2) `Inst` has users and all users are in `DestBB`.
+static bool isDestBBSuitableForSink(Instruction *Inst, BasicBlock *DestBB) {
+  BasicBlock *BB = Inst->getParent();
+  assert(Inst->getParent() != DestBB &&
+         BB->getTerminator()->getNumSuccessors() == 2 &&
+         "Caller should guarantee");
+  // Do not sink across a critical edge for simplicity.
+  if (DestBB->getUniquePredecessor() != BB)
+    return false;
+
+  // Now we know BB dominates DestBB.
+  BasicBlock *UserBB = nullptr;
+  for (Use &Use : Inst->uses()) {
+    User *User = Use.getUser();
+    // Do checked cast since IR verifier guarantees that the user of an
+    // instruction must be an instruction. See `Verifier::visitInstruction`.
+    Instruction *UserInst = cast<Instruction>(User);
+    // We can sink debug or pseudo instructions together with Inst.
+    if (UserInst->isDebugOrPseudoInst())
+      continue;
+    UserBB = getUserBasicBlock(Inst, Use.getOperandNo(), UserInst);
+    // Do not sink if Inst is used in a basic block that is not DestBB.
+    // TODO: Sink to the common dominator of all user blocks.
+    if (UserBB != DestBB)
+      return false;
+  }
+  return UserBB != nullptr;
+}
+
+// For the virtual call dispatch sequence, try to sink vtable load instructions
+// to the cold indirect call fallback.
+static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+  assert(!I->isTerminator());
+  if (!isDestBBSuitableForSink(I, DestBlock))
+    return false;
+
+  assert(DestBlock->getUniquePredecessor() == I->getParent());
+  BasicBlock *SrcBlock = I->getParent();
+
+  // Do not move control-flow-involving, volatile loads, vaarg, etc.
+  // Do not sink static or dynamic alloca instructions. Static allocas must
+  // remain in the entry block, and dynamic allocas must not be sunk in between
+  // a stacksave / stackrestore pair, which would incorrectly shorten its
+  // lifetime.
+  if (isa<PHINode>(I) || I->isEHPad() || I->mayThrow() || !I->willReturn() ||
+      isa<AllocaInst>(I))
+    return false;
+
+  // Do not sink inline asm, non-mergeable, or convergent call instructions.
+  if (const auto *C = dyn_cast<CallBase>(I))
+    if (C->isInlineAsm() || C->cannotMerge() || C->isConvergent())
+      return false;
+
+  // Do not move an instruction that may write to memory.
+  if (I->mayWriteToMemory())
+    return false;
+
+  // We can only sink load instructions if there is nothing between the load and
+  // the end of block that could change the value.
+  if (I->mayReadFromMemory()) {
+    // We know that SrcBlock is the unique predecessor of DestBlock.
+    for (BasicBlock::iterator Scan = std::next(I->getIterator()),
+                              E = I->getParent()->end();
+         Scan != E; ++Scan)
+      if (Scan->mayWriteToMemory())
+        return false;
+  }
+
+  BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
+  I->moveBefore(*DestBlock, InsertPos);
+
+  // Also sink all related debug uses from the source basic block. Otherwise we
+  // get debug use before the def. Attempt to salvage debug uses first, to
+  // maximise the range variables have location for. If we cannot salvage, then
+  // mark the location undef: we know it was supposed to receive a new location
+  // here, but that computation has been sunk.
+  SmallVector<DbgVariableIntrinsic *> DbgUsers;
+  SmallVector<DbgVariableRecord *> DPValues;
+  findDbgUsers(DbgUsers, I, &DPValues);
+  if (!DbgUsers.empty())
+    tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers);
+  if (!DPValues.empty())
+    tryToSinkInstructionDPValues(I, InsertPos, SrcBlock, DestBlock, DPValues);
+  return true;
+}
+
+// Try to sink instructions after VPtr to the indirect call fallback.
+// Returns the number of sunk IR instructions.
+static int tryToSinkInstructions(Instruction *VPtr,
+                                 BasicBlock *IndirectCallBB) {
+  BasicBlock *OriginalBB = VPtr->getParent();
+
+  int SinkCount = 0;
+  // FIXME: Find a way to bail out of the loop.
+  for (Instruction &I :
+       llvm::make_early_inc_range(llvm::drop_begin(llvm::reverse(*OriginalBB))))
+    if (tryToSinkInstruction(&I, IndirectCallBB))
+      SinkCount++;
+
+  return SinkCount;
+}
+
 // Promote indirect calls to conditional direct calls, keeping track of
 // thresholds.
 class IndirectCallPromoter {
@@ -189,6 +305,8 @@ class IndirectCallPromoter {
   Function &F;
   Module &M;
 
+  ProfileSummaryInfo *PSI = nullptr;
+
   // Symtab that maps indirect call profile values to function names and
   // defines.
   InstrProfSymtab *const Symtab;
@@ -207,10 +325,9 @@ class IndirectCallPromoter {
     Function *const TargetFunction;
     const uint64_t Count;
 
+    // The byte offset of TargetFunction starting from the vtable address point.
     uint64_t FunctionOffset;
-
     SmallVector<std::pair<uint64_t, uint64_t>, 2> VTableGUIDAndCounts;
-
     SmallVector<Constant *, 2> AddressPoints;
 
     PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
@@ -225,42 +342,55 @@ class IndirectCallPromoter {
   // NumCandidates is the number of candidate entries in ValueDataRef.
   std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
       const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
-      uint64_t TotalCount, uint32_t NumCandidates,
-      VTableGUIDCountsMap &VTableGUIDCounts);
+      uint64_t TotalCount, uint32_t NumCandidates);
 
   // Promote a list of targets for one indirect-call callsite by comparing
   // indirect callee with functions. Returns true if there are IR
   // transformations and false otherwise.
   bool tryToPromoteWithFuncCmp(
-      CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
-      uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
-      uint32_t NumCandidates);
+      CallBase &CB, Instruction *VPtr,
+      const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount,
+      ArrayRef<InstrProfValueData> ICallProfDataRef, uint32_t NumCandidates,
+      VTableGUIDCountsMap &VTableGUIDCounts);
 
+  // Promote a list of targets for one indirect call by comparing vtables with
+  // functions. Returns true if there are IR transformations and false
+  // otherwise.
   bool tryToPromoteWithVTableCmp(
-      CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+      CallBase &CB, Instruction *VPtr,
+      const std::vector<PromotionCandidate> &Candidates,
       uint64_t TotalFuncCount, uint32_t NumCandidates,
       MutableArrayRef<InstrProfValueData> ICallProfDataRef,
       VTableGUIDCountsMap &VTableGUIDCounts);
 
-  void
-  tryGetVTableInfos(const CallBase &CB,
-                    const SmallDenseMap<Function *, int, 4> &CalleeIndexMap,
-                    VTableGUIDCountsMap &VTableGUIDCounts,
-                    std::vector<PromotionCandidate> &Candidates);
+  // Returns true if it's profitable to compare vtables.
+  bool isProfitableToCompareVTables(
+      const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount);
+
+  // Populate `VTableGUIDCounts` with vtable GUIDs and their counts, and each
+  // candidate with vtable information. Returns the vtable instruction, or
+  // nullptr when there is none.
+  Instruction *computeVTableInfos(const CallBase *CB,
+                                  VTableGUIDCountsMap &VTableGUIDCounts,
+                                  std::vector<PromotionCandidate> &Candidates);
 
   Constant *getOrCreateVTableAddressPointVar(GlobalVariable *GV,
                                              uint64_t AddressPointOffset);
 
-  bool isProfitableToCompareVTables(
-      const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount);
+  void updateFuncValueProfiles(CallBase &CB, ArrayRef<InstrProfValueData> VDs,
+                               uint64_t Sum, uint32_t MaxMDCount);
+
+  void updateVPtrValueProfiles(Instruction *VPtr,
+                               VTableGUIDCountsMap &VTableGUIDCounts);
 
 public:
   IndirectCallPromoter(
-      Function &Func, Module &M, InstrProfSymtab *Symtab, bool SamplePGO,
+      Function &Func, Module &M, ProfileSummaryInfo *PSI,
+      InstrProfSymtab *Symtab, bool SamplePGO,
       const VirtualCallSiteTypeInfoMap &VirtualCSInfo,
       VTableAddressPointOffsetValMap &VTableAddressPointOffsetVal,
       OptimizationRemarkEmitter &ORE)
-      : F(Func), M(M), Symtab(Symtab), SamplePGO(SamplePGO),
+      : F(Func), M(M), PSI(PSI), Symtab(Symtab), SamplePGO(SamplePGO),
         VirtualCSInfo(VirtualCSInfo),
         VTableAddressPointOffsetVal(VTableAddressPointOffsetVal), ORE(ORE) {}
   IndirectCallPromoter(const IndirectCallPromoter &) = delete;
@@ -276,12 +406,9 @@ class IndirectCallPromoter {
 std::vector<IndirectCallPromoter::PromotionCandidate>
 IndirectCallPromoter::getPromotionCandidatesForCallSite(
     const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
-    uint64_t TotalCount, uint32_t NumCandidates,
-    VTableGUIDCountsMap &VTableGUIDCounts) {
+    uint64_t TotalCount, uint32_t NumCandidates) {
   std::vector<PromotionCandidate> Ret;
 
-  SmallDenseMap<Function *, int, 4> CalleeIndexMap;
-
   LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
                     << " Num_targets: " << ValueDataRef.size()
                     << " Num_candidates: " << NumCandidates << "\n");
@@ -355,82 +482,71 @@ IndirectCallPromoter::getPromotionCandidatesForCallSite(
       break;
     }
 
-    CalleeIndexMap[TargetFunction] = Ret.size();
     Ret.push_back(PromotionCandidate(TargetFunction, Count));
-
     TotalCount -= Count;
   }
 
-  if (!ICPEnableVTableCmp)
-    return Ret;
-
-  tryGetVTableInfos(CB, CalleeIndexMap, VTableGUIDCounts, Ret);
-
   return Ret;
 }
 
 Constant *IndirectCallPromoter::getOrCreateVTableAddressPointVar(
     GlobalVariable *GV, uint64_t AddressPointOffset) {
-  Constant *Var = VTableAddressPointOffsetVal[GV][AddressPointOffset];
-  if (Var != nullptr)
-    return Var;
-  Constant *Ret = getVTableAddressPointOffset(GV, AddressPointOffset);
-  VTableAddressPointOffsetVal[GV][AddressPointOffset] = Ret;
-  return Ret;
+  auto [Iter, Inserted] =
+      VTableAddressPointOffsetVal[GV].try_emplace(AddressPointOffset, nullptr);
+  if (Inserted)
+    Iter->second = getVTableAddressPointOffset(GV, AddressPointOffset);
+  return Iter->second;
 }
 
-void IndirectCallPromoter::tryGetVTableInfos(
-    const CallBase &CB, const SmallDenseMap<Function *, int, 4> &CalleeIndexMap,
-    VTableGUIDCountsMap &GUIDCountsMap,
+Instruction *IndirectCallPromoter::computeVTableInfos(
+    const CallBase *CB, VTableGUIDCountsMap &GUIDCountsMap,
     std::vector<PromotionCandidate> &Candidates) {
   if (!ICPEnableVTableCmp)
-    return;
+    return nullptr;
 
-  auto Iter = VirtualCSInfo.find(&CB);
+  // Only virtual calls have virtual call site info.
+  auto Iter = VirtualCSInfo.find(CB);
   if (Iter == VirtualCSInfo.end())
-    return;
+    return nullptr;
 
-  auto &VirtualCallInfo = Iter->second;
+  const auto &VirtualCallInfo = Iter->second;
+  Instruction *VPtr = VirtualCallInfo.VPtr;
 
-  uint32_t ActualNumValueData = 0;
+  SmallDenseMap<Function *, int, 4> CalleeIndexMap;
+  for (size_t I = 0; I < Candidates.size(); I++)
+    CalleeIndexMap[Candidates[I].TargetFunction] = I;
 
+  uint32_t ActualNumValueData = 0;
   uint64_t TotalVTableCount = 0;
   auto VTableValueDataArray = getValueProfDataFromInst(
       *VirtualCallInfo.VPtr, IPVK_VTableTarget, MaxNumVTableAnnotations,
       ActualNumValueData, TotalVTableCount);
-
   if (VTableValueDataArray.get() == nullptr)
-    return;
+    return VPtr;
 
-  SmallVector<MDNode *, 2> Types; // type metadata associated with a vtable.
   // Compute the functions and counts from by each vtable.
   for (size_t j = 0; j < ActualNumValueData; j++) {
     uint64_t VTableVal = VTableValueDataArray[j].Value;
     GUIDCountsMap[VTableVal] = VTableValueDataArray[j].Count;
-    GlobalVariable *VTableVariable = Symtab->getGlobalVariable(VTableVal);
-    if (!VTableVariable) {
+    GlobalVariable *VTableVar = Symtab->getGlobalVariable(VTableVal);
+    if (!VTableVar) {
       LLVM_DEBUG(dbgs() << "\tCannot find vtable definition for " << VTableVal
-                        << "\n");
+                        << "; maybe the vtable isn't imported\n");
       continue;
     }
 
-    Types.clear();
-    VTableVariable->getMetadata(LLVMContext::MD_type, Types);
     std::optional<uint64_t> MaybeAddressPointOffset =
-        getCompatibleTypeOffset(Types, VirtualCallInfo.CompatibleTypeStr);
+        getCompatibleTypeOffset(*VTableVar, VirtualCallInfo.CompatibleTypeStr);
     if (!MaybeAddressPointOffset)
       continue;
 
     const uint64_t AddressPointOffset = *MaybeAddressPointOffset;
 
     Function *Callee = nullptr;
-
     std::tie(Callee, std::ignore) = getFunctionAtVTableOffset(
-        VTableVariable, AddressPointOffset + VirtualCallInfo.FunctionOffset,
-        *(F.getParent()));
+        VTableVar, AddressPointOffset + VirtualCallInfo.FunctionOffset, M);
     if (!Callee)
       continue;
-
     auto CalleeIndexIter = CalleeIndexMap.find(Callee);
     if (CalleeIndexIter == CalleeIndexMap.end())
       continue;
@@ -439,8 +555,10 @@ void IndirectCallPromoter::tryGetVTableInfos(
     Candidate.VTableGUIDAndCounts.push_back(
         {VTableVal, VTableValueDataArray[j].Count});
     Candidate.AddressPoints.push_back(
-        getOrCreateVTableAddressPointVar(VTableVariable, AddressPointOffset));
+        getOrCreateVTableAddressPointVar(VTableVar, AddressPointOffset));
   }
+
+  return VPtr;
 }
 
 static MDNode *getBranchWeights(LLVMContext &Context, uint64_t IfCount,
@@ -478,54 +596,122 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
 
 // Promote indirect-call to conditional direct-call for one callsite.
 bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
-    CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
-    uint64_t TotalCount, ArrayRef<InstrProfValueData> ICallProfDataRef,
-    uint32_t NumCandidates) {
+    CallBase &CB, Instruction *VPtr,
+    const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount,
+    ArrayRef<InstrProfValueData> ICallProfDataRef, uint32_t NumCandidates,
+    VTableGUIDCountsMap &VTableGUIDCounts) {
   uint32_t NumPromoted = 0;
 
   for (const auto &C : Candidates) {
-    uint64_t Count = C.Count;
-    pgo::promoteIndirectCall(CB, C.TargetFunction, Count, TotalCount, SamplePGO,
-                             &ORE);
-    assert(TotalCount >= Count);
-    TotalCount -= Count;
+    uint64_t FuncCount = C.Count;
+    pgo::promoteIndirectCall(CB, C.TargetFunction, FuncCount, TotalCount,
+                             SamplePGO, &ORE);
+    assert(TotalCount >= FuncCount);
+    TotalCount -= FuncCount;
     NumOfPGOICallPromotion++;
     NumPromoted++;
+
+    if (!ICPEnableVTableCmp || C.VTableGUIDAndCounts.empty())
+      continue;
+
+    // Update VTableGUIDCounts
+    uint64_t SumVTableCount = 0;
+    for (const auto &[GUID, VTableCount] : C.VTableGUIDAndCounts)
+      SumVTableCount += VTableCount;
+
+    for (const auto &[GUID, VTableCount] : C.VTableGUIDAndCounts) {
+      APInt APFuncCount((unsigned)128, FuncCount, false /*signed*/);
+      APFuncCount *= VTableCount;
+      VTableGUIDCounts[GUID] -= APFuncCount.udiv(SumVTableCount).getZExtValue();
+    }
   }
+  if (NumPromoted == 0)
+    return false;
 
-  const bool Changed = (NumPromoted != 0);
+  assert(NumPromoted <= ICallProfDataRef.size() &&
+         "Number of promoted functions should not be greater than the number "
+         "of values in profile metadata");
+
+  // Update value profiles on the indirect call.
+  // TODO: Handle profile update properly when Clang `-fstrict-vtable-pointers`
+  // is enabled and a vtable is used to load multiple virtual functions.
+  updateFuncValueProfiles(CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
+                          NumCandidates);
+  // Update value profiles on the vtable pointer if it exists.
+  if (VPtr)
+    updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
+  return true;
+}
 
-  if (Changed) {
-    CB.setMetadata(LLVMContext::MD_prof, nullptr);
+void IndirectCallPromoter::updateFuncValueProfiles(
+    CallBase &CB, ArrayRef<InstrProfValueData> CallVDs, uint64_t TotalCount,
+    uint32_t MaxMDCount) {
+  // First clear the existing !prof.
+  CB.setMetadata(LLVMContext::MD_prof, nullptr);
+  // Annotate the remaining value profiles if counter is not zero.
+  if (TotalCount != 0)
+    annotateValueSite(M, CB, CallVDs, TotalCount, IPVK_IndirectCallTarget,
+                      MaxMDCount);
+}
+
+void IndirectCallPromoter::updateVPtrValueProfiles(
+    Instruction *VPtr, VTableGUIDCountsMap &VTableGUIDCounts) {
+  VPtr->setMetadata(LLVMContext::MD_prof, nullptr);
+  std::vector<InstrProfValueData> VTableValueProfiles;
+  uint64_t TotalVTableCount = 0;
+  for (auto [GUID, Count] : VTableGUIDCounts) {
+    if (Count == 0)
+      continue;
 
-    if (TotalCount != 0)
-      annotateValueSite(*F.getParent(), CB, ICallProfDataRef.slice(NumPromoted),
-                        TotalCount, IPVK_IndirectCallTarget, NumCandidates);
+    VTableValueProfiles.push_back({GUID, Count});
+    TotalVTableCount += Count;
   }
+  llvm::sort(VTableValueProfiles,
+             [](const InstrProfValueData &LHS, const InstrProfValueData &RHS) {
+               return LHS.Count > RHS.Count;
+             });
 
-  return Changed;
+  annotateValueSite(M, *VPtr, VTableValueProfiles, TotalVTableCount,
+                    IPVK_VTableTarget, VTableValueProfiles.size());
 }
 
 bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
-    CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
-    uint64_t TotalFuncCount, uint32_t NumCandidates,
+    CallBase &CB, Instruction *VPtr,
+    const std::vector<PromotionCandidate> &Candidates, uint64_t TotalFuncCount,
+    uint32_t NumCandidates,
     MutableArrayRef<InstrProfValueData> ICallProfDataRef,
     VTableGUIDCountsMap &VTableGUIDCounts) {
-  Instruction *VPtr = VirtualCSInfo.at(&CB).VPtr;
-
-  SmallVector<int, 4> PromotedFuncCount;
+  SmallVector<uint64_t, 4> PromotedFuncCount;
+  // TODO: Explain the branch accuracy (-fstrict-vtable-pointer) with a
+  // compiler-rt test.
   for (const auto &Candidate : Candidates) {
     uint64_t IfCount = 0;
-    // FIXME: Skip vtables with cold count in the comparison.
     for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts) {
       IfCount += Count;
       VTableGUIDCounts[GUID] -= Count;
     }
 
+    BasicBlock *OriginalBB = CB.getParent();
     promoteCallWithVTableCmp(
         CB, VPtr, Candidate.TargetFunction, Candidate.AddressPoints,
         getBranchWeights(CB.getContext(), IfCount, TotalFuncCount - IfCount));
 
+    int SinkCount = tryToSinkInstructions(
+        PromotedFuncCount.empty() ? VPtr : OriginalBB->getFirstNonPHI(),
+        CB.getParent());
+
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
+             << "Promote indirect call to "
+             << ore::NV("DirectCallee", Candidate.TargetFunction)
+             << " with count " << ore::NV("Count", Candidate.Count)
+             << " out of " << ore::NV("TotalCount", TotalFuncCount)
+             << ", compare "
+             << ore::NV("VTable", Candidate.VTableGUIDAndCounts.size())
+             << " vtables and sink " << ore::NV("SinkCount", SinkCount)
+             << " instructions";
+    });
+
     PromotedFuncCount.push_back(IfCount);
 
     TotalFuncCount -= IfCount;
@@ -535,48 +721,27 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
   if (PromotedFuncCount.empty())
     return false;
 
-  // A comparator that sorts value profile data descendingly.
-  auto Cmp = [](const InstrProfValueData &LHS, const InstrProfValueData &RHS) {
-    return LHS.Count > RHS.Count;
-  };
-
-  CB.setMetadata(LLVMContext::MD_prof, nullptr);
-  // Update indirect call value profiles if total count of the call site is not
-  // zero.
-  if (TotalFuncCount != 0) {
-    for (size_t I = 0; I < PromotedFuncCount.size(); I++)
-      ICallProfDataRef[I].Count -= PromotedFuncCount[I];
-
-    llvm::sort(ICallProfDataRef.begin(), ICallProfDataRef.end(), Cmp);
-
-    // Locate the first <target, count> pair where the count is zero or less.
-    auto UB = llvm::upper_bound(
-        ICallProfDataRef, 0U,
-        [](uint64_t Count, const InstrProfValueData &ProfData) {
-          return ProfData.Count <= Count;
-        });
-
-    ArrayRef<InstrProfValueData> VDs(ICallProfDataRef.begin(), UB);
-    annotateValueSite(M, CB, VDs, TotalFuncCount, IPVK_IndirectCallTarget,
-                      NumCandidates);
-  }
-
-  VPtr->setMetadata(LLVMContext::MD_prof, nullptr);
-  std::vector<InstrProfValueData> VTableValueProfiles;
-  uint64_t TotalVTableCount = 0;
-  for (auto [GUID, Count] : VTableGUIDCounts) {
-    if (Count == 0)
-      continue;
-
-    VTableValueProfiles.push_back({GUID, Count});
-    TotalVTableCount += Count;
-  }
-  llvm::sort(VTableValueProfiles, Cmp);
-
-  annotateValueSite(M, *VPtr, VTableValueProfiles, TotalVTableCount,
-                    IPVK_VTableTarget, VTableValueProfiles.size());
-
-  // Update vtable profile metadata
+  // Update value profiles for 'CB' and 'VPtr', assuming that each 'CB' has a
+  // distinct 'VPtr'.
+  // TODO: Handle profile update properly when Clang `-fstrict-vtable-pointers`
+  // is enabled and a vtable is used to load multiple virtual functions.
+  for (size_t I = 0; I < PromotedFuncCount.size(); I++)
+    ICallProfDataRef[I].Count -=
+        std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count);
+  // Sort value profiles by count in descending order.
+  llvm::sort(ICallProfDataRef.begin(), ICallProfDataRef.end(),
+             [](const InstrProfValueData &LHS, const InstrProfValueData &RHS) {
+               return LHS.Count > RHS.Count;
+             });
+  // Drop the <target-value, count> pair if count is not greater than zero.
+  ArrayRef<InstrProfValueData> VDs(
+      ICallProfDataRef.begin(),
+      llvm::upper_bound(ICallProfDataRef, 0U,
+                        [](uint64_t Count, const InstrProfValueData &ProfData) {
+                          return ProfData.Count <= Count;
+                        }));
+  updateFuncValueProfiles(CB, VDs, TotalFuncCount, NumCandidates);
+  updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
   return true;
 }
 
@@ -593,27 +758,59 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
     if (!NumCandidates ||
         (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
       continue;
-    VTableGUIDCountsMap VTableGUIDCounts;
+
     auto PromotionCandidates = getPromotionCandidatesForCallSite(
-        *CB, ICallProfDataRef, TotalCount, NumCandidates, VTableGUIDCounts);
+        *CB, ICallProfDataRef, TotalCount, NumCandidates);
+
+    VTableGUIDCountsMap VTableGUIDCounts;
+    Instruction *VPtr =
+        computeVTableInfos(CB, VTableGUIDCounts, PromotionCandidates);
 
     if (isProfitableToCompareVTables(PromotionCandidates, TotalCount))
-      Changed |= tryToPromoteWithVTableCmp(*CB, PromotionCandidates, TotalCount,
-                                           NumCandidates, ICallProfDataRef,
-                                           VTableGUIDCounts);
+      Changed |= tryToPromoteWithVTableCmp(*CB, VPtr, PromotionCandidates,
+                                           TotalCount, NumCandidates,
+                                           ICallProfDataRef, VTableGUIDCounts);
     else
-      Changed |= tryToPromoteWithFuncCmp(*CB, PromotionCandidates, TotalCount,
-                                         ICallProfDataRef, NumCandidates);
+      Changed |= tryToPromoteWithFuncCmp(*CB, VPtr, PromotionCandidates,
+                                         TotalCount, ICallProfDataRef,
+                                         NumCandidates, VTableGUIDCounts);
   }
   return Changed;
 }
 
+// TODO: Return false if the function addressing and vtable load instructions
+// cannot be sunk to the indirect fallback.
 bool IndirectCallPromoter::isProfitableToCompareVTables(
     const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount) {
-  if (!ICPEnableVTableCmp)
+  if (!ICPEnableVTableCmp || Candidates.empty())
+    return false;
+  uint64_t RemainingVTableCount = TotalCount;
+  for (size_t I = 0; I < Candidates.size(); I++) {
+    auto &Candidate = Candidates[I];
+    uint64_t VTableSumCount = 0;
+    for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
+      VTableSumCount += Count;
+
+    if (VTableSumCount < Candidate.Count * ICPVTableCountPercentage)
+      return false;
+
+    RemainingVTableCount -= Candidate.Count;
+
+    int NumAdditionalVTable = 0;
+    if (I == Candidates.size() - 1)
+      NumAdditionalVTable = ICPNumAdditionalVTableLast;
+
+    int ActualNumAdditionalInst = Candidate.AddressPoints.size() - 1;
+    if (ActualNumAdditionalInst > NumAdditionalVTable) {
+      return false;
+    }
+  }
+
+  // If the indirect fallback is not cold, don't compare vtables.
+  if (PSI && PSI->hasProfileSummary() &&
+      !PSI->isColdCount(RemainingVTableCount))
     return false;
 
-  // FIXME: Implement cost-benefit analysis in a follow-up change.
   return true;
 }
 
@@ -625,6 +822,45 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
     return FAM.getResult<DominatorTreeAnalysis>(F);
   };
 
+  auto compute = [&](Function *Func) {
+    if (!Func || Func->use_empty())
+      return;
+    // Iterate all type.test calls and find all indirect calls.
+    // TODO: Add llvm.public.type.test
+    for (Use &U : llvm::make_early_inc_range(Func->uses())) {
+      auto *CI = dyn_cast<CallInst>(U.getUser());
+      if (!CI)
+        continue;
+      auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
+      if (!TypeMDVal)
+        continue;
+      auto *CompatibleTypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+      if (!CompatibleTypeId)
+        continue;
+
+      // Find out all devirtualizable call sites given a llvm.type.test
+      // intrinsic call.
+      SmallVector<DevirtCallSite, 1> DevirtCalls;
+      SmallVector<CallInst *, 1> Assumes;
+      auto &DT = LookupDomTree(*CI->getFunction());
+      findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
+
+      // type-id, offset from the address point
+      // combined with type metadata to compute function offset
+      for (auto &DevirtCall : DevirtCalls) {
+        CallBase &CB = DevirtCall.CB;
+        // Given an indirect call, try to find the instruction which loads a
+        // pointer to virtual table.
+        Instruction *VTablePtr =
+            PGOIndirectCallVisitor::tryGetVTableInstruction(&CB);
+        if (!VTablePtr)
+          continue;
+        VirtualCSInfo[&CB] = {DevirtCall.Offset, VTablePtr,
+                              CompatibleTypeId->getString()};
+      }
+    }
+  };
+
   // Right now only llvm.type.test is used to find out virtual call sites.
   // With ThinLTO and whole-program-devirtualization, llvm.type.test and
   // llvm.public.type.test are emitted, and llvm.public.type.test is either
@@ -636,51 +872,11 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
   Function *TypeTestFunc =
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
 
-  if (!TypeTestFunc || TypeTestFunc->use_empty())
-    return;
-
-  // Iterate all type.test calls and find all indirect calls.
-  for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
-    auto *CI = dyn_cast<CallInst>(U.getUser());
-    if (!CI)
-      continue;
-
-    auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
-    if (!TypeMDVal)
-      continue;
-
-    auto *CompatibleTypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
-    if (!CompatibleTypeId)
-      continue;
+  compute(TypeTestFunc);
 
-    StringRef CompatibleTypeStr = CompatibleTypeId->getString();
-
-    // Find out all devirtualizable call sites given a llvm.type.test intrinsic
-    // call.
-    SmallVector<DevirtCallSite, 1> DevirtCalls;
-    SmallVector<CallInst *, 1> Assumes;
-    auto &DT = LookupDomTree(*CI->getFunction());
-    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
-
-    // type-id, offset from the address point
-    // combined with type metadata to compute function offset
-    for (auto &DevirtCall : DevirtCalls) {
-      CallBase &CB = DevirtCall.CB;
-      // This is the offset from the address point offset to the virtual
-      // function.
-      uint64_t Offset = DevirtCall.Offset;
-
-      // Given an indirect call, try find the instruction which loads a pointer
-      // to virtual table.
-      Instruction *VTablePtr =
-          PGOIndirectCallVisitor::tryGetVTableInstruction(&CB);
-
-      if (!VTablePtr)
-        continue;
-
-      VirtualCSInfo[&CB] = {Offset, VTablePtr, CompatibleTypeStr};
-    }
-  }
+  Function *PublicTypeTestFunc =
+      M.getFunction(Intrinsic::getName(Intrinsic::public_type_test));
+  compute(PublicTypeTestFunc);
 }
 
 // A wrapper function that does the actual work.
@@ -714,7 +910,8 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
         MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
     auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-    IndirectCallPromoter CallPromoter(F, M, &Symtab, SamplePGO, VirtualCSInfo,
+    IndirectCallPromoter CallPromoter(F, M, PSI, &Symtab, SamplePGO,
+                                      VirtualCSInfo,
                                       VTableAddressPointOffsetVal, ORE);
     bool FuncChanged = CallPromoter.processFunction(PSI);
     if (ICPDUMPAFTER && FuncChanged) {
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 5ad612eb9a8a5..dda80d419999d 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -187,24 +187,6 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
     U->replaceUsesOfWith(&CB, Cast);
 }
 
-// Returns the or result of all icmp instructions.
-static Value *getOrResult(const SmallVector<Value *, 2> &ICmps,
-                          IRBuilder<> &Builder) {
-  assert(!ICmps.empty() && "Must have at least one icmp instructions");
-  if (ICmps.size() == 1)
-    return ICmps[0];
-
-  SmallVector<Value *, 2> OrResults;
-  int i = 0, NumICmp = ICmps.size();
-  for (i = 0; i + 1 < NumICmp; i += 2)
-    OrResults.push_back(Builder.CreateOr(ICmps[i], ICmps[i + 1], "icmp-or"));
-
-  if (i < NumICmp)
-    OrResults.push_back(ICmps[i]);
-
-  return getOrResult(OrResults, Builder);
-}
-
 /// Predicate and clone the given call site.
 ///
 /// This function creates an if-then-else structure at the location of the call
@@ -393,7 +375,7 @@ static CallBase &versionCallSiteWithCond(CallBase &CB, Value *Cond,
   return *NewInst;
 }
 
-// Predicate and clone the given call site usingc condition `CB.callee ==
+// Predicate and clone the given call site using condition `CB.callee ==
 // Callee`. See the comment `versionCallSiteWithCond` for the transformation.
 CallBase &llvm::versionCallSite(CallBase &CB, Value *Callee,
                                 MDNode *BranchWeights) {
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 82daaedaa0e81..308a07ddf8d2e 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ObjCARCAnalysisUtils.h"
@@ -30,8 +31,8 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/Argument.h"
+#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
@@ -55,6 +56,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -1967,11 +1969,23 @@ void llvm::updateProfileCallee(
     uint64_t CloneEntryCount = PriorEntryCount - NewEntryCount;
     for (auto Entry : *VMap) {
       if (isa<CallInst>(Entry.first))
-        if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+        if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second)) {
           CI->updateProfWeight(CloneEntryCount, PriorEntryCount);
+
+          Instruction *VPtr =
+              PGOIndirectCallVisitor::tryGetVTableInstruction(CI);
+          if (VPtr)
+            scaleProfData(*VPtr, CloneEntryCount, PriorEntryCount);
+        }
       if (isa<InvokeInst>(Entry.first))
-        if (auto *II = dyn_cast_or_null<InvokeInst>(Entry.second))
+        if (auto *II = dyn_cast_or_null<InvokeInst>(Entry.second)) {
           II->updateProfWeight(CloneEntryCount, PriorEntryCount);
+
+          Instruction *VPtr =
+              PGOIndirectCallVisitor::tryGetVTableInstruction(II);
+          if (VPtr)
+            scaleProfData(*VPtr, CloneEntryCount, PriorEntryCount);
+        }
     }
   }
 
@@ -1982,10 +1996,22 @@ void llvm::updateProfileCallee(
       // No need to update the callsite if it is pruned during inlining.
       if (!VMap || VMap->count(&BB))
         for (Instruction &I : BB) {
-          if (CallInst *CI = dyn_cast<CallInst>(&I))
+          if (CallInst *CI = dyn_cast<CallInst>(&I)) {
             CI->updateProfWeight(NewEntryCount, PriorEntryCount);
-          if (InvokeInst *II = dyn_cast<InvokeInst>(&I))
+
+            Instruction *VPtr =
+                PGOIndirectCallVisitor::tryGetVTableInstruction(CI);
+            if (VPtr)
+              scaleProfData(*VPtr, NewEntryCount, PriorEntryCount);
+          }
+          if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
             II->updateProfWeight(NewEntryCount, PriorEntryCount);
+
+            Instruction *VPtr =
+                PGOIndirectCallVisitor::tryGetVTableInstruction(II);
+            if (VPtr)
+              scaleProfData(*VPtr, NewEntryCount, PriorEntryCount);
+          }
         }
   }
 }
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index f3cd3104c3128..305770d2b7c91 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2538,6 +2538,190 @@ Value *getSalvageOpsForIcmpOp(ICmpInst *Icmp, uint64_t CurrentLocOps,
   return Icmp->getOperand(0);
 }
 
+/// Sink the debug intrinsic users of \p I along with it: clone the last user
+/// per variable found in \p SrcBlock to \p InsertPos, and salvage every user
+/// outside \p DestBlock. Nothing is changed if no user gets cloned.
+void llvm::tryToSinkInstructionDbgValues(
+    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
+    BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers) {
+  // For all debug values in the destination block, the sunk instruction
+  // will still be available, so they do not need to be dropped.
+  SmallVector<DbgVariableIntrinsic *, 2> DbgUsersToSalvage;
+  for (auto &DbgUser : DbgUsers)
+    if (DbgUser->getParent() != DestBlock)
+      DbgUsersToSalvage.push_back(DbgUser);
+
+  // Visit the users living in SrcBlock in reverse program order, as we only
+  // want to clone the last appearing debug intrinsic for each given variable.
+  SmallVector<DbgVariableIntrinsic *, 2> DbgUsersToSink;
+  for (DbgVariableIntrinsic *DVI : DbgUsersToSalvage)
+    if (DVI->getParent() == SrcBlock)
+      DbgUsersToSink.push_back(DVI);
+  llvm::sort(DbgUsersToSink,
+             [](auto *A, auto *B) { return B->comesBefore(A); });
+
+  SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
+  SmallSet<DebugVariable, 4> SunkVariables;
+  for (auto *User : DbgUsersToSink) {
+    // A dbg.declare must not be cloned since there can only be one per
+    // variable fragment; it stays put because the sunk instruction is not an
+    // alloca (otherwise we could not be here).
+    if (isa<DbgDeclareInst>(User))
+      continue;
+
+    DebugVariable DbgUserVariable =
+        DebugVariable(User->getVariable(), User->getExpression(),
+                      User->getDebugLoc()->getInlinedAt());
+
+    if (!SunkVariables.insert(DbgUserVariable).second)
+      continue;
+
+    // Leave dbg.assign intrinsics in their original positions and there should
+    // be no need to insert a clone.
+    if (isa<DbgAssignIntrinsic>(User))
+      continue;
+
+    DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
+    LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
+  }
+
+  // Perform salvaging without the clones, then sink the clones.
+  if (!DIIClones.empty()) {
+    salvageDebugInfoForDbgValues(*I, DbgUsersToSalvage, {});
+    // The clones are in reverse order of original appearance, reverse again to
+    // maintain the original order.
+    for (auto &DIIClone : llvm::reverse(DIIClones)) {
+      DIIClone->insertBefore(&*InsertPos);
+      LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
+    }
+  }
+}
+
+void llvm::tryToSinkInstructionDPValues(
+    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
+    BasicBlock *DestBlock,
+    SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+  // Collect the DbgVariableRecords outside DestBlock as salvage candidates.
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecordsToSalvage;
+  for (auto &DVR : DbgVariableRecords)
+    if (DVR->getParent() != DestBlock)
+      DbgVariableRecordsToSalvage.push_back(DVR);
+
+  // Fetch a second collection, of DbgVariableRecords in the source block that
+  // we're going to sink.
+  SmallVector<DbgVariableRecord *> DbgVariableRecordsToSink;
+  for (DbgVariableRecord *DVR : DbgVariableRecordsToSalvage)
+    if (DVR->getParent() == SrcBlock)
+      DbgVariableRecordsToSink.push_back(DVR);
+
+  // Sort DbgVariableRecords in reverse order of their position in the block.
+  // This is a partial order: DbgVariableRecords attached to different
+  // instructions are ordered by the instruction order, but those attached to
+  // the same instruction compare equal, so stable_sort keeps their order.
+  auto Order = [](DbgVariableRecord *A, DbgVariableRecord *B) -> bool {
+    return B->getInstruction()->comesBefore(A->getInstruction());
+  };
+  llvm::stable_sort(DbgVariableRecordsToSink, Order);
+
+  // If there are two assignments to the same variable attached to the same
+  // instruction, the ordering between the two assignments is important. Scan
+  // for this (rare) case and establish which is the last assignment.
+  using InstVarPair = std::pair<const Instruction *, DebugVariable>;
+  SmallDenseMap<InstVarPair, DbgVariableRecord *> FilterOutMap;
+  if (DbgVariableRecordsToSink.size() > 1) {
+    SmallDenseMap<InstVarPair, unsigned> CountMap;
+    // Count how many assignments to each variable there are per instruction.
+    for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
+      DebugVariable DbgUserVariable =
+          DebugVariable(DVR->getVariable(), DVR->getExpression(),
+                        DVR->getDebugLoc()->getInlinedAt());
+      CountMap[std::make_pair(DVR->getInstruction(), DbgUserVariable)] += 1;
+    }
+
+    // If any instruction carries more than one assignment to a variable, add
+    // the pair to FilterOutMap to record that it needs extra filtering.
+    SmallPtrSet<const Instruction *, 4> DupSet;
+    for (auto It : CountMap) {
+      if (It.second > 1) {
+        FilterOutMap[It.first] = nullptr;
+        DupSet.insert(It.first.first);
+      }
+    }
+
+    // For all instruction/variable pairs needing extra filtering, find the
+    // latest assignment by scanning the instruction's records back to front.
+    for (const Instruction *Inst : DupSet) {
+      for (DbgVariableRecord &DVR :
+           llvm::reverse(filterDbgVars(Inst->getDbgRecordRange()))) {
+        DebugVariable DbgUserVariable =
+            DebugVariable(DVR.getVariable(), DVR.getExpression(),
+                          DVR.getDebugLoc()->getInlinedAt());
+        auto FilterIt =
+            FilterOutMap.find(std::make_pair(Inst, DbgUserVariable));
+        if (FilterIt == FilterOutMap.end())
+          continue;
+        if (FilterIt->second != nullptr)
+          continue;
+        FilterIt->second = &DVR;
+      }
+    }
+  }
+
+  // Perform cloning of the DbgVariableRecords that we plan on sinking, filter
+  // out any duplicate assignments identified above.
+  SmallVector<DbgVariableRecord *, 2> DVRClones;
+  SmallSet<DebugVariable, 4> SunkVariables;
+  for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
+    if (DVR->Type == DbgVariableRecord::LocationType::Declare)
+      continue;
+
+    DebugVariable DbgUserVariable =
+        DebugVariable(DVR->getVariable(), DVR->getExpression(),
+                      DVR->getDebugLoc()->getInlinedAt());
+
+    // For any variable where there were multiple assignments in the same place,
+    // ignore all but the last assignment.
+    if (!FilterOutMap.empty()) {
+      InstVarPair IVP = std::make_pair(DVR->getInstruction(), DbgUserVariable);
+      auto It = FilterOutMap.find(IVP);
+
+      // Filter out everything but the last assignment recorded above.
+      if (It != FilterOutMap.end() && It->second != DVR)
+        continue;
+    }
+
+    if (!SunkVariables.insert(DbgUserVariable).second)
+      continue;
+
+    if (DVR->isDbgAssign())
+      continue;
+
+    DVRClones.emplace_back(DVR->clone());
+    LLVM_DEBUG(dbgs() << "CLONE: " << *DVRClones.back() << '\n');
+  }
+
+  // Perform salvaging without the clones, then sink the clones.
+  if (DVRClones.empty())
+    return;
+
+  salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage);
+
+  // The clones are in reverse order of original appearance. Assert that the
+  // head bit is set on the iterator as we _should_ have received it via
+  // getFirstInsertionPt. Inserting like this will reverse the clone order as
+  // we'll repeatedly insert at the head, such as:
+  //   DVR-3 (third insertion goes here)
+  //   DVR-2 (second insertion goes here)
+  //   DVR-1 (first insertion goes here)
+  //   Any-Prior-DVRs
+  //   InsertPtInst
+  assert(InsertPos.getHeadBit());
+  for (DbgVariableRecord *DVRClone : DVRClones) {
+    InsertPos->getParent()->insertDbgRecordBefore(DVRClone, InsertPos);
+    LLVM_DEBUG(dbgs() << "SINK: " << *DVRClone << '\n');
+  }
+}
+
 Value *llvm::salvageDebugInfoImpl(Instruction &I, uint64_t CurrentLocOps,
                                   SmallVectorImpl<uint64_t> &Ops,
                                   SmallVectorImpl<Value *> &AdditionalValues) {
diff --git a/llvm/test/Transforms/Inline/update_invoke_prof.ll b/llvm/test/Transforms/Inline/update_invoke_prof.ll
index f6b86dfe5bb1b..12eb7dbf418c5 100644
--- a/llvm/test/Transforms/Inline/update_invoke_prof.ll
+++ b/llvm/test/Transforms/Inline/update_invoke_prof.ll
@@ -1,6 +1,7 @@
-; Test that branch weights and value profiles associated with invoke are updated
-; in both caller and callee after inline, but invoke instructions with taken or
-; not taken branch probabilities are not updated.
+; Tests that instructions with value profiles and count-type branch weights are
+; updated in both caller and callee after inline, but invoke instructions with
+; taken or not taken branch probabilities are not updated.
+
 ; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
 
 declare i32 @__gxx_personality_v0(...)
@@ -15,21 +16,23 @@ declare void @callee1(ptr %func)
 
 declare void @callee2(ptr %func)
 
-define void @callee(ptr %func) personality ptr @__gxx_personality_v0 !prof !17 {
+define void @callee(ptr %obj) personality ptr @__gxx_personality_v0 !prof !17 {
+  %vtable = load ptr, ptr %obj, !prof !21
+  %func = load ptr, ptr %vtable
   invoke void %func()
-          to label %next unwind label %lpad, !prof !18
+  to label %next unwind label %lpad, !prof !18
 
 next:
   invoke void @callee1(ptr %func)
-          to label %cont unwind label %lpad, !prof !19
+  to label %cont unwind label %lpad, !prof !19
 
 cont:
   invoke void @callee2(ptr %func)
-          to label %ret unwind label %lpad, !prof !20
+  to label %ret unwind label %lpad, !prof !20
 
 lpad:
   %exn = landingpad {ptr, i32}
-          cleanup
+  cleanup
   unreachable
 
 ret:
@@ -57,26 +60,41 @@ ret:
 !18 = !{!"VP", i32 0, i64 1500, i64 123, i64 900, i64 456, i64 600}
 !19 = !{!"branch_weights", i32 1500}
 !20 = !{!"branch_weights", i32 1234, i32 5678}
+!21 = !{!"VP", i32 2, i64 1500, i64 789, i64 900, i64 321, i64 600}
 
-; CHECK-LABEL: @caller(
-; CHECK:  invoke void %func(
-; CHECK-NEXT: {{.*}} !prof ![[PROF1:[0-9]+]]
-; CHECK:  invoke void @callee1(
-; CHECK-NEXT: {{.*}} !prof ![[PROF2:[0-9]+]]
-; CHECK:  invoke void @callee2(
-; CHECK-NEXT: {{.*}} !prof ![[PROF3:[0-9]+]]
-
-; CHECK-LABL: @callee(
-; CHECK:  invoke void %func(
-; CHECK-NEXT: {{.*}} !prof ![[PROF4:[0-9]+]]
-; CHECK:  invoke void @callee1(
-; CHECK-NEXT: {{.*}} !prof ![[PROF5:[0-9]+]]
-; CHECK:  invoke void @callee2(
-; CHECK-NEXT: {{.*}} !prof ![[PROF3]]
+; CHECK-LABEL: define void @caller(
+; CHECK-SAME: ptr [[FUNC:%.*]]) personality ptr @__gxx_personality_v0 !prof [[PROF14:![0-9]+]] {
+; CHECK-NEXT:    [[VTABLE_I:%.*]] = load ptr, ptr [[FUNC]], align 8, !prof [[PROF15:![0-9]+]]
+; CHECK-NEXT:    [[FUNC_I:%.*]] = load ptr, ptr [[VTABLE_I]], align 8
+; CHECK-NEXT:    invoke void [[FUNC_I]]()
+; CHECK-NEXT:            to label %[[NEXT_I:.*]] unwind label %[[LPAD_I:.*]], !prof [[PROF16:![0-9]+]]
+; CHECK:       [[NEXT_I]]:
+; CHECK-NEXT:    invoke void @callee1(ptr [[FUNC_I]])
+; CHECK-NEXT:            to label %[[CONT_I:.*]] unwind label %[[LPAD_I]], !prof [[PROF17:![0-9]+]]
+; CHECK:       [[CONT_I]]:
+; CHECK-NEXT:    invoke void @callee2(ptr [[FUNC_I]])
+; CHECK-NEXT:            to label %[[CALLEE_EXIT:.*]] unwind label %[[LPAD_I]], !prof [[PROF18:![0-9]+]]
+;
 
+; CHECK-LABEL: define void @callee(
+; CHECK-SAME: ptr [[OBJ:%.*]]) personality ptr @__gxx_personality_v0 !prof [[PROF19:![0-9]+]] {
+; CHECK-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[OBJ]], align 8, !prof [[PROF20:![0-9]+]]
+; CHECK-NEXT:    [[FUNC:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    invoke void [[FUNC]]()
+; CHECK-NEXT:            to label %[[NEXT:.*]] unwind label %[[LPAD:.*]], !prof [[PROF21:![0-9]+]]
+; CHECK:       [[NEXT]]:
+; CHECK-NEXT:    invoke void @callee1(ptr [[FUNC]])
+; CHECK-NEXT:            to label %[[CONT:.*]] unwind label %[[LPAD]], !prof [[PROF22:![0-9]+]]
+; CHECK:       [[CONT]]:
+; CHECK-NEXT:    invoke void @callee2(ptr [[FUNC]])
+; CHECK-NEXT:            to label %[[RET:.*]] unwind label %[[LPAD]], !prof [[PROF18]]
 
-; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 1000, i64 123, i64 600, i64 456, i64 400}
-; CHECK: ![[PROF2]] = !{!"branch_weights", i32 1000}
-; CHECK: ![[PROF3]] = !{!"branch_weights", i32 1234, i32 5678}
-; CHECK: ![[PROF4]] = !{!"VP", i32 0, i64 500, i64 123, i64 300, i64 456, i64 200}
-; CHECK: ![[PROF5]] = !{!"branch_weights", i32 500}
+; CHECK: [[PROF14]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF15]] = !{!"VP", i32 2, i64 1000, i64 789, i64 600, i64 321, i64 400}
+; CHECK: [[PROF16]] = !{!"VP", i32 0, i64 1000, i64 123, i64 600, i64 456, i64 400}
+; CHECK: [[PROF17]] = !{!"branch_weights", i32 1000}
+; CHECK: [[PROF18]] = !{!"branch_weights", i32 1234, i32 5678}
+; CHECK: [[PROF19]] = !{!"function_entry_count", i64 500}
+; CHECK: [[PROF20]] = !{!"VP", i32 2, i64 500, i64 789, i64 300, i64 321, i64 200}
+; CHECK: [[PROF21]] = !{!"VP", i32 0, i64 500, i64 123, i64 300, i64 456, i64 200}
+; CHECK: [[PROF22]] = !{!"branch_weights", i32 500}
diff --git a/llvm/test/Transforms/Inline/update_value_profile.ll b/llvm/test/Transforms/Inline/update_value_profile.ll
index daa95e93b68ec..96aa35fb572de 100644
--- a/llvm/test/Transforms/Inline/update_value_profile.ll
+++ b/llvm/test/Transforms/Inline/update_value_profile.ll
@@ -2,33 +2,33 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; When 'callee' is inlined into caller1 and caller2, the indirect call value
-; profiles of the inlined copy should be scaled based on callers' profiles,
-; and the indirect call value profiles in 'callee' should be updated.
-define i32 @callee(ptr %0, i32 %1) !prof !20 {
+; When 'callee' is inlined into caller1 and caller2, the indirect call and vtable
+; value profiles of the inlined copy should be scaled based on callers' profiles.
+; The indirect call and vtable value profiles in 'callee' should be updated.
+define i32 @callee(ptr %0, i32 %1) !prof !19 {
 ; CHECK-LABEL: define i32 @callee(
 ; CHECK-SAME: ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) !prof [[PROF0:![0-9]+]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8, !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 [[TMP5]](ptr [[TMP0]], i32 [[TMP1]]), !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 [[TMP5]](ptr [[TMP0]], i32 [[TMP1]]), !prof [[PROF2:![0-9]+]]
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
-  %3 = load ptr, ptr %0
+  %3 = load ptr, ptr %0, !prof !15
   %5 = getelementptr inbounds i8, ptr %3, i64 8
   %6 = load ptr, ptr %5
-  %7 = tail call i32 %6(ptr %0, i32 %1), !prof !17
+  %7 = tail call i32 %6(ptr %0, i32 %1), !prof !16
   ret i32 %7
 }
 
-define i32 @caller1(i32 %0) !prof !18 {
+define i32 @caller1(i32 %0) !prof !17 {
 ; CHECK-LABEL: define i32 @caller1(
-; CHECK-SAME: i32 [[TMP0:%.*]]) !prof [[PROF2:![0-9]+]] {
+; CHECK-SAME: i32 [[TMP0:%.*]]) !prof [[PROF3:![0-9]+]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call ptr @_Z10createTypei(i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !prof [[PROF4:![0-9]+]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 [[TMP5]](ptr [[TMP2]], i32 [[TMP0]]), !prof [[PROF3:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 [[TMP5]](ptr [[TMP2]], i32 [[TMP0]]), !prof [[PROF5:![0-9]+]]
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
   %2 = tail call ptr @_Z10createTypei(i32 %0)
@@ -36,14 +36,14 @@ define i32 @caller1(i32 %0) !prof !18 {
   ret i32 %3
 }
 
-define i32 @caller2(i32 %0) !prof !19  {
+define i32 @caller2(i32 %0) !prof !18  {
 ; CHECK-LABEL: define i32 @caller2(
-; CHECK-SAME: i32 [[TMP0:%.*]]) !prof [[PROF4:![0-9]+]] {
+; CHECK-SAME: i32 [[TMP0:%.*]]) !prof [[PROF6:![0-9]+]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call ptr @_Z10createTypei(i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !prof [[PROF7:![0-9]+]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 [[TMP5]](ptr [[TMP2]], i32 [[TMP0]]), !prof [[PROF5:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 [[TMP5]](ptr [[TMP2]], i32 [[TMP0]]), !prof [[PROF8:![0-9]+]]
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
   %2 = tail call ptr @_Z10createTypei(i32 %0)
@@ -67,15 +67,19 @@ declare ptr @_Z10createTypei(i32)
 !12 = !{i32 10000, i64 100, i32 1}
 !13 = !{i32 999000, i64 100, i32 1}
 !14 = !{i32 999999, i64 1, i32 2}
-!17 = !{!"VP", i32 0, i64 1600, i64 123, i64 1000, i64 456, i64 600}
-!18 = !{!"function_entry_count", i64 1000}
-!19 = !{!"function_entry_count", i64 600}
-!20 = !{!"function_entry_count", i64 1700}
+!15 = !{!"VP", i32 2, i64 1600, i64 321, i64 1000, i64 789, i64 600}
+!16 = !{!"VP", i32 0, i64 1600, i64 123, i64 1000, i64 456, i64 600}
+!17 = !{!"function_entry_count", i64 1000}
+!18 = !{!"function_entry_count", i64 600}
+!19 = !{!"function_entry_count", i64 1700}
 ;.
 ; CHECK: [[PROF0]] = !{!"function_entry_count", i64 100}
-; CHECK: [[PROF1]] = !{!"VP", i32 0, i64 94, i64 123, i64 58, i64 456, i64 35}
-; CHECK: [[PROF2]] = !{!"function_entry_count", i64 1000}
-; CHECK: [[PROF3]] = !{!"VP", i32 0, i64 941, i64 123, i64 588, i64 456, i64 352}
-; CHECK: [[PROF4]] = !{!"function_entry_count", i64 600}
-; CHECK: [[PROF5]] = !{!"VP", i32 0, i64 564, i64 123, i64 352, i64 456, i64 211}
+; CHECK: [[PROF1]] = !{!"VP", i32 2, i64 94, i64 321, i64 58, i64 789, i64 35}
+; CHECK: [[PROF2]] = !{!"VP", i32 0, i64 94, i64 123, i64 58, i64 456, i64 35}
+; CHECK: [[PROF3]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF4]] = !{!"VP", i32 2, i64 941, i64 321, i64 588, i64 789, i64 352}
+; CHECK: [[PROF5]] = !{!"VP", i32 0, i64 941, i64 123, i64 588, i64 456, i64 352}
+; CHECK: [[PROF6]] = !{!"function_entry_count", i64 600}
+; CHECK: [[PROF7]] = !{!"VP", i32 2, i64 564, i64 321, i64 352, i64 789, i64 211}
+; CHECK: [[PROF8]] = !{!"VP", i32 0, i64 564, i64 123, i64 352, i64 456, i64 211}
 ;.
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
index 75eda4b66be02..96a2b2360787b 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
@@ -1,206 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 
-; RUN: opt < %s -passes=pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-FUNC
-
-; Invoke instcombine after pgo-icall-prom so the address calculation instructions for virtual calls get sink into the basic block for indirect fallback.
-; RUN: opt < %s -passes='pgo-icall-prom,instcombine' -icp-enable-vtable-cmp -S | FileCheck %s --check-prefix=ICALL-VTABLE
+; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -icp-num-additional-vtable-last=2 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,VTABLE-CMP
+; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -icp-num-additional-vtable-last=0 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,FUNC-CMP
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@_ZTV4Base = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0
-@_ZTV8Derived1 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN8Derived15func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0, !type !1
-@_ZTV8Derived2 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN8Derived25func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0, !type !2
-@_ZTV8Derived3 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN8Derived35func1Ei, ptr @_ZN4Base5func2Ev] }, !type !0, !type !3
+@Base1 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0
+@Base2 = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base2_foo] }, !type !2
+@Base3 = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo] }, !type !6
 
-; Test the IR transformation from function-based indirect-call promotion and vtable-based indirect-call promotion.
+@Derived1 = dso_local constant { [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived1_bar] }, !type !1, !type !2, !type !3
+@Derived2 = dso_local constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo], [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived2_bar] }, !type !4, !type !5, !type !6, !type !7
+@Derived3 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0, !type !8
 
-; The tested function has one function candidate which comes from one vtable.
-define i32 @test_one_function_one_vtable(ptr %d) {
-; ICALL-FUNC-LABEL: define i32 @test_one_function_one_vtable(
-; ICALL-FUNC-SAME: ptr [[D:%.*]]) {
-; ICALL-FUNC-NEXT:  entry:
-; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF4:![0-9]+]]
-; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-FUNC-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
-; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
-; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func2Ev
-; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5:![0-9]+]]
-; ICALL-FUNC:       if.true.direct_targ:
-; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr [[D]])
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP:%.*]]
-; ICALL-FUNC:       if.false.orig_indirect:
-; ICALL-FUNC-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP1]](ptr [[D]])
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
-; ICALL-FUNC:       if.end.icp:
-; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-FUNC-NEXT:    ret i32 [[TMP4]]
-;
-; ICALL-VTABLE-LABEL: define i32 @test_one_function_one_vtable(
-; ICALL-VTABLE-SAME: ptr [[D:%.*]]) {
-; ICALL-VTABLE-NEXT:  entry:
-; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived2, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    br i1 [[TMP1]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4:![0-9]+]]
-; ICALL-VTABLE:       if.true.direct_targ:
-; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr nonnull [[D]])
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP:%.*]]
-; ICALL-VTABLE:       if.false.orig_indirect:
-; ICALL-VTABLE-NEXT:    [[VFN:%.*]] = getelementptr inbounds i8, ptr [[VTABLE]], i64 8
-; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VFN]], align 8
-; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP3]](ptr nonnull [[D]])
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
-; ICALL-VTABLE:       if.end.icp:
-; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP2]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-VTABLE-NEXT:    ret i32 [[TMP4]]
-;
-entry:
-  %vtable = load ptr, ptr %d, !prof !4
-  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
-  tail call void @llvm.assume(i1 %0)
-  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
-  %1 = load ptr, ptr %vfn
-  %call = tail call i32 %1(ptr %d), !prof !5
-  ret i32 %call
-}
+; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived1_bar with count 600 out of 1600, compare 1 vtables and sink 2 instructions
+; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived2_bar with count 500 out of 1000, compare 1 vtables and sink 2 instructions
+; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Base1_bar with count 400 out of 500, compare 2 vtables and sink 2 instructions
 
-; The tested function has one function candidate which comes from two vtables.
-define i32 @test_one_function_two_vtables(ptr %d) {
-; ICALL-FUNC-LABEL: define i32 @test_one_function_two_vtables(
-; ICALL-FUNC-SAME: ptr [[D:%.*]]) {
-; ICALL-FUNC-NEXT:  entry:
-; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF6:![0-9]+]]
-; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-FUNC-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
-; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
-; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func2Ev
-; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5]]
-; ICALL-FUNC:       if.true.direct_targ:
-; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr [[D]])
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP:%.*]]
-; ICALL-FUNC:       if.false.orig_indirect:
-; ICALL-FUNC-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP1]](ptr [[D]])
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
-; ICALL-FUNC:       if.end.icp:
-; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-FUNC-NEXT:    ret i32 [[TMP4]]
+define void @test(ptr %d) {
+; VTABLE-CMP-LABEL: define void @test(
+; VTABLE-CMP-SAME: ptr [[D:%.*]]) {
+; VTABLE-CMP-NEXT:  [[ENTRY:.*:]]
+; VTABLE-CMP-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF9:![0-9]+]]
+; VTABLE-CMP-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"Base1")
+; VTABLE-CMP-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; VTABLE-CMP-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Derived1, i32 40)
+; VTABLE-CMP-NEXT:    br i1 [[TMP1]], label %[[IF_TRUE_DIRECT_TARG:.*]], label %[[IF_FALSE_ORIG_INDIRECT:.*]], !prof [[PROF10:![0-9]+]]
+; VTABLE-CMP:       [[IF_TRUE_DIRECT_TARG]]:
+; VTABLE-CMP-NEXT:    tail call void @Derived1_bar(ptr [[D]])
+; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP:.*]]
+; VTABLE-CMP:       [[IF_FALSE_ORIG_INDIRECT]]:
+; VTABLE-CMP-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Derived2, i32 64)
+; VTABLE-CMP-NEXT:    br i1 [[TMP2]], label %[[IF_TRUE_DIRECT_TARG1:.*]], label %[[IF_FALSE_ORIG_INDIRECT2:.*]], !prof [[PROF11:![0-9]+]]
+; VTABLE-CMP:       [[IF_TRUE_DIRECT_TARG1]]:
+; VTABLE-CMP-NEXT:    tail call void @Derived2_bar(ptr [[D]])
+; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP3:.*]]
+; VTABLE-CMP:       [[IF_FALSE_ORIG_INDIRECT2]]:
+; VTABLE-CMP-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Base1, i32 16)
+; VTABLE-CMP-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Derived3, i32 16)
+; VTABLE-CMP-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
+; VTABLE-CMP-NEXT:    br i1 [[TMP5]], label %[[IF_TRUE_DIRECT_TARG4:.*]], label %[[IF_FALSE_ORIG_INDIRECT5:.*]], !prof [[PROF12:![0-9]+]]
+; VTABLE-CMP:       [[IF_TRUE_DIRECT_TARG4]]:
+; VTABLE-CMP-NEXT:    tail call void @Base1_bar(ptr [[D]])
+; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP6:.*]]
+; VTABLE-CMP:       [[IF_FALSE_ORIG_INDIRECT5]]:
+; VTABLE-CMP-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
+; VTABLE-CMP-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[VFN]], align 8
+; VTABLE-CMP-NEXT:    tail call void [[TMP6]](ptr [[D]])
+; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP6]]
+; VTABLE-CMP:       [[IF_END_ICP6]]:
+; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP3]]
+; VTABLE-CMP:       [[IF_END_ICP3]]:
+; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP]]
+; VTABLE-CMP:       [[IF_END_ICP]]:
+; VTABLE-CMP-NEXT:    ret void
 ;
-; ICALL-VTABLE-LABEL: define i32 @test_one_function_two_vtables(
-; ICALL-VTABLE-SAME: ptr [[D:%.*]]) {
-; ICALL-VTABLE-NEXT:  entry:
-; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived1, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived2, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    [[ICMP_OR:%.*]] = or i1 [[TMP1]], [[TMP2]]
-; ICALL-VTABLE-NEXT:    br i1 [[ICMP_OR]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4]]
-; ICALL-VTABLE:       if.true.direct_targ:
-; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr nonnull [[D]])
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP:%.*]]
-; ICALL-VTABLE:       if.false.orig_indirect:
-; ICALL-VTABLE-NEXT:    [[VFN:%.*]] = getelementptr inbounds i8, ptr [[VTABLE]], i64 8
-; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[VFN]], align 8
-; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP4]](ptr nonnull [[D]])
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
-; ICALL-VTABLE:       if.end.icp:
-; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-VTABLE-NEXT:    ret i32 [[TMP5]]
+; FUNC-CMP-LABEL: define void @test(
+; FUNC-CMP-SAME: ptr [[D:%.*]]) {
+; FUNC-CMP-NEXT:  [[ENTRY:.*:]]
+; FUNC-CMP-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF9:![0-9]+]]
+; FUNC-CMP-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"Base1")
+; FUNC-CMP-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; FUNC-CMP-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
+; FUNC-CMP-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
+; FUNC-CMP-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @Derived1_bar
+; FUNC-CMP-NEXT:    br i1 [[TMP2]], label %[[IF_TRUE_DIRECT_TARG:.*]], label %[[IF_FALSE_ORIG_INDIRECT:.*]], !prof [[PROF10:![0-9]+]]
+; FUNC-CMP:       [[IF_TRUE_DIRECT_TARG]]:
+; FUNC-CMP-NEXT:    tail call void @Derived1_bar(ptr [[D]])
+; FUNC-CMP-NEXT:    br label %[[IF_END_ICP:.*]]
+; FUNC-CMP:       [[IF_FALSE_ORIG_INDIRECT]]:
+; FUNC-CMP-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[TMP1]], @Derived2_bar
+; FUNC-CMP-NEXT:    br i1 [[TMP3]], label %[[IF_TRUE_DIRECT_TARG1:.*]], label %[[IF_FALSE_ORIG_INDIRECT2:.*]], !prof [[PROF11:![0-9]+]]
+; FUNC-CMP:       [[IF_TRUE_DIRECT_TARG1]]:
+; FUNC-CMP-NEXT:    tail call void @Derived2_bar(ptr [[D]])
+; FUNC-CMP-NEXT:    br label %[[IF_END_ICP3:.*]]
+; FUNC-CMP:       [[IF_FALSE_ORIG_INDIRECT2]]:
+; FUNC-CMP-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[TMP1]], @Base1_bar
+; FUNC-CMP-NEXT:    br i1 [[TMP4]], label %[[IF_TRUE_DIRECT_TARG4:.*]], label %[[IF_FALSE_ORIG_INDIRECT5:.*]], !prof [[PROF12:![0-9]+]]
+; FUNC-CMP:       [[IF_TRUE_DIRECT_TARG4]]:
+; FUNC-CMP-NEXT:    tail call void @Base1_bar(ptr [[D]])
+; FUNC-CMP-NEXT:    br label %[[IF_END_ICP6:.*]]
+; FUNC-CMP:       [[IF_FALSE_ORIG_INDIRECT5]]:
+; FUNC-CMP-NEXT:    tail call void [[TMP1]](ptr [[D]])
+; FUNC-CMP-NEXT:    br label %[[IF_END_ICP6]]
+; FUNC-CMP:       [[IF_END_ICP6]]:
+; FUNC-CMP-NEXT:    br label %[[IF_END_ICP3]]
+; FUNC-CMP:       [[IF_END_ICP3]]:
+; FUNC-CMP-NEXT:    br label %[[IF_END_ICP]]
+; FUNC-CMP:       [[IF_END_ICP]]:
+; FUNC-CMP-NEXT:    ret void
 ;
 entry:
-  %vtable = load ptr, ptr %d, !prof !6
-  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
+  %vtable = load ptr, ptr %d, !prof !9
+  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"Base1")
   tail call void @llvm.assume(i1 %0)
   %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
   %1 = load ptr, ptr %vfn
-  %call = tail call i32 %1(ptr %d), !prof !5
-  ret i32 %call
+  tail call void %1(ptr %d), !prof !10
+  ret void
 }
 
-; The tested function has one function candidate which comes from three vtables.
-define i32 @test_one_function_three_vtables(ptr %d) {
-; ICALL-FUNC-LABEL: define i32 @test_one_function_three_vtables(
-; ICALL-FUNC-SAME: ptr [[D:%.*]]) {
-; ICALL-FUNC-NEXT:  entry:
-; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8, !prof [[PROF7:![0-9]+]]
-; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-FUNC-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
-; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VFN]], align 8
-; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func2Ev
-; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5]]
-; ICALL-FUNC:       if.true.direct_targ:
-; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr [[D]])
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP:%.*]]
-; ICALL-FUNC:       if.false.orig_indirect:
-; ICALL-FUNC-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP1]](ptr [[D]])
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
-; ICALL-FUNC:       if.end.icp:
-; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-FUNC-NEXT:    ret i32 [[TMP4]]
-;
-; ICALL-VTABLE-LABEL: define i32 @test_one_function_three_vtables(
-; ICALL-VTABLE-SAME: ptr [[D:%.*]]) {
-; ICALL-VTABLE-NEXT:  entry:
-; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[D]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived1, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV8Derived2, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    [[ICMP_OR:%.*]] = or i1 [[TMP1]], [[TMP2]]
-; ICALL-VTABLE-NEXT:    [[ICMP_OR1:%.*]] = or i1 [[ICMP_OR]], [[TMP3]]
-; ICALL-VTABLE-NEXT:    br i1 [[ICMP_OR1]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4]]
-; ICALL-VTABLE:       if.true.direct_targ:
-; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = tail call i32 @_ZN4Base5func2Ev(ptr nonnull [[D]])
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP:%.*]]
-; ICALL-VTABLE:       if.false.orig_indirect:
-; ICALL-VTABLE-NEXT:    [[VFN:%.*]] = getelementptr inbounds i8, ptr [[VTABLE]], i64 8
-; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[VFN]], align 8
-; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = tail call i32 [[TMP5]](ptr nonnull [[D]])
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
-; ICALL-VTABLE:       if.end.icp:
-; ICALL-VTABLE-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT]] ], [ [[TMP4]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-VTABLE-NEXT:    ret i32 [[TMP6]]
-;
-entry:
-  %vtable = load ptr, ptr %d, !prof !7
-  %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
-  tail call void @llvm.assume(i1 %0)
-  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
-  %1 = load ptr, ptr %vfn
-  %call = tail call i32 %1(ptr %d), !prof !5
-  ret i32 %call
+define void @Base1_bar(ptr %this) {
+  ret void
 }
 
+define void @Derived1_bar(ptr %this) {
+  ret void
+}
 
-declare i1 @llvm.type.test(ptr, metadata)
-declare void @llvm.assume(i1 noundef)
-declare i32 @_ZN4Base5func1Ei(ptr, i32)
-declare i32 @_ZN8Derived15func1Ei(ptr, i32)
-declare i32 @_ZN8Derived25func1Ei(ptr, i32)
-declare i32 @_ZN8Derived35func1Ei(ptr, i32)
-
-define i32 @_ZN4Base5func2Ev(ptr %this) {
-entry:
-  ret i32 0
+define void @Derived2_bar(ptr %this) {
+  ret void
 }
 
-!0 = !{i64 16, !"_ZTS4Base"}
-!1 = !{i64 16, !"_ZTS8Derived1"}
-!2 = !{i64 16, !"_ZTS8Derived2"}
-!3 = !{i64 16, !"_ZTS8Derived3"}
-!4 = !{!"VP", i32 2, i64 1600, i64 5035968517245772950, i64 1600}
-!5 = !{!"VP", i32 0, i64 1600, i64 -3104805163612457913, i64 1600}
-!6 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 1000, i64 5035968517245772950, i64 600}
-!7 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 600, i64 5035968517245772950, i64 550, i64 1960855528937986108, i64 450}
 
-; ICALL-FUNC: [[PROF4]] = !{!"VP", i32 2, i64 1600, i64 5035968517245772950, i64 1600}
-; ICALL-FUNC: [[PROF5]] = !{!"branch_weights", i32 1600, i32 0}
-; ICALL-FUNC: [[PROF6]] = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 1000, i64 5035968517245772950, i64 600}
-; ICALL-FUNC: [[PROF7]] = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 600, i64 5035968517245772950, i64 550, i64 1960855528937986108, i64 450}
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+declare i32 @Base2_foo(ptr)
+declare i32 @Base1_foo(ptr)
+declare void @Base3_foo(ptr)
+
+!0 = !{i64 16, !"Base1"}
+!1 = !{i64 40, !"Base1"}
+!2 = !{i64 16, !"Base2"}
+!3 = !{i64 16, !"Derived1"}
+!4 = !{i64 64, !"Base1"}
+!5 = !{i64 40, !"Base2"}
+!6 = !{i64 16, !"Base3"}
+!7 = !{i64 16, !"Derived2"}
+!8 = !{i64 16, !"Derived3"}
+!9 = !{!"VP", i32 2, i64 1600, i64 -4123858694673519054, i64 600, i64 -7211198353767973908, i64 500, i64 -3574436251470806727, i64 200, i64 6288809125658696740, i64 200, i64 12345678, i64 100}
+!10 = !{!"VP", i32 0, i64 1600, i64 3827408714133779784, i64 600, i64 5837445539218476403, i64 500, i64 -9064955852395570538, i64 400,  i64 56781234, i64 100}
+;.
+; VTABLE-COMMON: [[PROF9]] = !{!"VP", i32 2, i64 100, i64 12345678, i64 100}
+; VTABLE-COMMON: [[PROF10]] = !{!"branch_weights", i32 600, i32 1000}
+; VTABLE-COMMON: [[PROF11]] = !{!"branch_weights", i32 500, i32 500}
+; VTABLE-COMMON: [[PROF12]] = !{!"branch_weights", i32 400, i32 100}
 
-; ICALL-VTABLE: [[PROF4]] = !{!"branch_weights", i32 1600, i32 0}
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
index a2924420fd2a0..e82aa9f14788c 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
@@ -1,144 +1,71 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
-; RUN: opt < %s -passes=pgo-icall-prom -S  | FileCheck %s --check-prefix=ICALL-FUNC
-; RUN: opt < %s -passes='pgo-icall-prom,instcombine' -icp-enable-vtable-cmp -S | FileCheck %s --check-prefix=ICALL-VTABLE
+; RUN: opt < %s -passes='pgo-icall-prom' -icp-enable-vtable-cmp -S | FileCheck %s --check-prefix=VTABLE
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-%class.Error = type { i8 }
-
-@_ZTI5Error = dso_local constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr null, i64 2), ptr null }
 @_ZTV4Base = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base10get_ticketEv] }, !type !0, !type !1
 @_ZTV7Derived = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived10get_ticketEv] }, !type !0, !type !1, !type !2, !type !3
 
-@.str = private unnamed_addr constant [15 x i8] c"out of tickets\00"
-
-define i32 @_Z4testP4Base(ptr %b) personality ptr @__gxx_personality_v0 {
-; ICALL-FUNC-LABEL: define i32 @_Z4testP4Base(
-; ICALL-FUNC-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 {
-; ICALL-FUNC-NEXT:  entry:
-; ICALL-FUNC-NEXT:    [[E:%.*]] = alloca [[CLASS_ERROR:%.*]], align 8
-; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[B]], align 8, !prof [[PROF4:![0-9]+]]
-; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
-; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN7Derived10get_ticketEv
-; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF5:![0-9]+]]
-; ICALL-FUNC:       if.true.direct_targ:
-; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = invoke i32 @_ZN7Derived10get_ticketEv(ptr [[B]])
-; ICALL-FUNC-NEXT:            to label [[IF_END_ICP:%.*]] unwind label [[LPAD:%.*]]
-; ICALL-FUNC:       if.false.orig_indirect:
-; ICALL-FUNC-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base10get_ticketEv
-; ICALL-FUNC-NEXT:    br i1 [[TMP4]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[IF_FALSE_ORIG_INDIRECT2:%.*]], !prof [[PROF6:![0-9]+]]
-; ICALL-FUNC:       if.true.direct_targ1:
-; ICALL-FUNC-NEXT:    [[TMP5:%.*]] = invoke i32 @_ZN4Base10get_ticketEv(ptr [[B]])
-; ICALL-FUNC-NEXT:            to label [[IF_END_ICP3:%.*]] unwind label [[LPAD]]
-; ICALL-FUNC:       if.false.orig_indirect2:
-; ICALL-FUNC-NEXT:    [[CALL:%.*]] = invoke i32 [[TMP1]](ptr [[B]])
-; ICALL-FUNC-NEXT:            to label [[IF_END_ICP3]] unwind label [[LPAD]]
-; ICALL-FUNC:       if.end.icp3:
-; ICALL-FUNC-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT2]] ], [ [[TMP5]], [[IF_TRUE_DIRECT_TARG1]] ]
-; ICALL-FUNC-NEXT:    br label [[IF_END_ICP]]
-; ICALL-FUNC:       if.end.icp:
-; ICALL-FUNC-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP6]], [[IF_END_ICP3]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-FUNC-NEXT:    br label %try.cont
-; ICALL-FUNC:       lpad:
-
-;
-; ICALL-VTABLE-LABEL: define i32 @_Z4testP4Base(
-; ICALL-VTABLE-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 {
-; ICALL-VTABLE-NEXT:  entry:
-; ICALL-VTABLE-NEXT:    [[E:%.*]] = alloca [[CLASS_ERROR:%.*]], align 8
-; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[B]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[IF_FALSE_ORIG_INDIRECT:%.*]], !prof [[PROF4:![0-9]+]]
-; ICALL-VTABLE:       if.true.direct_targ:
-; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = invoke i32 @_ZN7Derived10get_ticketEv(ptr nonnull [[B]])
-; ICALL-VTABLE-NEXT:            to label [[IF_END_ICP:%.*]] unwind label [[LPAD:%.*]]
-; ICALL-VTABLE:       if.false.orig_indirect:
-; ICALL-VTABLE-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    br i1 [[TMP4]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[IF_FALSE_ORIG_INDIRECT2:%.*]], !prof [[PROF5:![0-9]+]]
-; ICALL-VTABLE:       if.true.direct_targ1:
-; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = invoke i32 @_ZN4Base10get_ticketEv(ptr nonnull [[B]])
-; ICALL-VTABLE-NEXT:            to label [[IF_END_ICP3:%.*]] unwind label [[LPAD]]
-; ICALL-VTABLE:       if.false.orig_indirect2:
-; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = invoke i32 [[TMP1]](ptr nonnull [[B]])
-; ICALL-VTABLE-NEXT:            to label [[IF_END_ICP3]] unwind label [[LPAD]]
-; ICALL-VTABLE:       if.end.icp3:
-; ICALL-VTABLE-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], [[IF_FALSE_ORIG_INDIRECT2]] ], [ [[TMP5]], [[IF_TRUE_DIRECT_TARG1]] ]
-; ICALL-VTABLE-NEXT:    br label [[IF_END_ICP]]
-; ICALL-VTABLE:       if.end.icp:
-; ICALL-VTABLE-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP6]], [[IF_END_ICP3]] ], [ [[TMP3]], [[IF_TRUE_DIRECT_TARG]] ]
-; ICALL-VTABLE-NEXT:    br label %try.cont
-; ICALL-VTABLE:       lpad:
+@.str = private constant [15 x i8] c"out of tickets\00"
+
+define i32 @test(ptr %b) personality ptr @__gxx_personality_v0 {
+; VTABLE-LABEL: define i32 @test(
+; VTABLE-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 {
+; VTABLE-NEXT:  [[ENTRY:.*:]]
+; VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[B]], align 8
+; VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; VTABLE-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTV7Derived, i32 16)
+; VTABLE-NEXT:    br i1 [[TMP3]], label %[[IF_TRUE_DIRECT_TARG:.*]], label %[[IF_FALSE_ORIG_INDIRECT:.*]], !prof [[PROF4:![0-9]+]]
+; VTABLE:       [[IF_TRUE_DIRECT_TARG]]:
+; VTABLE-NEXT:    [[TMP2:%.*]] = invoke i32 @_ZN7Derived10get_ticketEv(ptr [[B]])
+; VTABLE-NEXT:            to label %[[IF_END_ICP:.*]] unwind label %[[LPAD:.*]]
+; VTABLE:       [[IF_FALSE_ORIG_INDIRECT]]:
+; VTABLE-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTV4Base, i32 16)
+; VTABLE-NEXT:    br i1 [[TMP4]], label %[[IF_TRUE_DIRECT_TARG1:.*]], label %[[IF_FALSE_ORIG_INDIRECT2:.*]], !prof [[PROF5:![0-9]+]]
+; VTABLE:       [[IF_TRUE_DIRECT_TARG1]]:
+; VTABLE-NEXT:    [[TMP5:%.*]] = invoke i32 @_ZN4Base10get_ticketEv(ptr [[B]])
+; VTABLE-NEXT:            to label %[[IF_END_ICP3:.*]] unwind label %[[LPAD]]
+; VTABLE:       [[IF_FALSE_ORIG_INDIRECT2]]:
+; VTABLE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; VTABLE-NEXT:    [[CALL:%.*]] = invoke i32 [[TMP1]](ptr [[B]])
+; VTABLE-NEXT:            to label %[[IF_END_ICP3]] unwind label %[[LPAD]]
+; VTABLE:       [[IF_END_ICP3]]:
+; VTABLE-NEXT:    [[TMP6:%.*]] = phi i32 [ [[CALL]], %[[IF_FALSE_ORIG_INDIRECT2]] ], [ [[TMP5]], %[[IF_TRUE_DIRECT_TARG1]] ]
+; VTABLE-NEXT:    br label %[[IF_END_ICP]]
+; VTABLE:       [[IF_END_ICP]]:
+; VTABLE-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP6]], %[[IF_END_ICP3]] ], [ [[TMP2]], %[[IF_TRUE_DIRECT_TARG]] ]
+; VTABLE-NEXT:    br label %[[NEXT:.*]]
+; VTABLE:       [[NEXT]]:
+; VTABLE-NEXT:    ret i32 [[TMP7]]
+; VTABLE:       [[LPAD]]:
+; VTABLE-NEXT:    [[EXN:%.*]] = landingpad { ptr, i32 }
+; VTABLE-NEXT:            cleanup
+; VTABLE-NEXT:    unreachable
 ;
 entry:
-  %e = alloca %class.Error
   %vtable = load ptr, ptr %b, !prof !4
   %0 = tail call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS4Base")
   tail call void @llvm.assume(i1 %0)
   %1 = load ptr, ptr %vtable
-  %call = invoke i32 %1(ptr %b)
-  to label %try.cont unwind label %lpad, !prof !5
-
-lpad:
-  %2 = landingpad { ptr, i32 }
-  cleanup
-  catch ptr @_ZTI5Error
-  %3 = extractvalue { ptr, i32 } %2, 1
-  %4 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTI5Error)
-  %matches = icmp eq i32 %3, %4
-  br i1 %matches, label %catch, label %ehcleanup
+  %call = invoke i32 %1(ptr %b) to label %next unwind label %lpad, !prof !5
 
-catch:
-  %5 = extractvalue { ptr, i32 } %2, 0
-
-  %call3 = invoke i32 @_ZN5Error10error_codeEv(ptr nonnull align 1 dereferenceable(1) %e)
-  to label %invoke.cont2 unwind label %lpad1
-
-invoke.cont2:
-  call void @__cxa_end_catch()
-  br label %try.cont
-
-try.cont:
-  %ret.0 = phi i32 [ %call3, %invoke.cont2 ], [ %call, %entry ]
-  ret i32 %ret.0
+next:
+  ret i32 %call
 
-lpad1:
-  %6 = landingpad { ptr, i32 }
+lpad:
+  %exn = landingpad {ptr, i32}
   cleanup
-  invoke void @__cxa_end_catch()
-  to label %invoke.cont4 unwind label %terminate.lpad
-
-invoke.cont4:
-  br label %ehcleanup
-
-ehcleanup:
-  %lpad.val7.merged = phi { ptr, i32 } [ %6, %invoke.cont4 ], [ %2, %lpad ]
-  resume { ptr, i32 } %lpad.val7.merged
-
-terminate.lpad:
-  %7 = landingpad { ptr, i32 }
-  catch ptr null
-  %8 = extractvalue { ptr, i32 } %7, 0
   unreachable
 }
 
-declare i1 @llvm.type.test(ptr, metadata)
-declare void @llvm.assume(i1 noundef)
-declare i32 @__gxx_personality_v0(...)
-declare i32 @llvm.eh.typeid.for(ptr)
-
-declare i32 @_ZN5Error10error_codeEv(ptr nonnull align 1 dereferenceable(1))
-
-declare void @__cxa_end_catch()
+declare void @make_error(ptr, ptr, i32)
+declare i32 @get_ticket_id()
+declare ptr @__cxa_allocate_exception(i64)
 
-define i32 @_ZN4Base10get_ticketEv(ptr %this) align 2 personality ptr @__gxx_personality_v0 {
+define i32 @_ZN4Base10get_ticketEv(ptr %this) personality ptr @__gxx_personality_v0 {
 entry:
-  %call = tail call i32 @_Z13get_ticket_idv()
+  %call = tail call i32 @get_ticket_id()
   %cmp.not = icmp eq i32 %call, -1
   br i1 %cmp.not, label %if.end, label %if.then
 
@@ -147,7 +74,7 @@ if.then:
 
 if.end:
   %exception = tail call ptr @__cxa_allocate_exception(i64 1)
-  invoke void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1) %exception, ptr nonnull @.str, i32 1)
+  invoke void @make_error(ptr %exception, ptr @.str, i32 1)
   to label %invoke.cont unwind label %lpad
 
 invoke.cont:
@@ -159,9 +86,9 @@ lpad:
   resume { ptr, i32 } %0
 }
 
-define i32 @_ZN7Derived10get_ticketEv(ptr %this) align 2 personality ptr @__gxx_personality_v0 {
+define i32 @_ZN7Derived10get_ticketEv(ptr %this) personality ptr @__gxx_personality_v0 {
 entry:
-  %call = tail call i32 @_Z13get_ticket_idv()
+  %call = tail call i32 @get_ticket_id()
   %cmp.not = icmp eq i32 %call, -1
   br i1 %cmp.not, label %if.end, label %if.then
 
@@ -170,7 +97,7 @@ if.then:
 
 if.end:
   %exception = tail call ptr @__cxa_allocate_exception(i64 1)
-  invoke void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1) %exception, ptr nonnull @.str, i32 2)
+  invoke void @make_error(ptr %exception, ptr @.str, i32 2)
   to label %invoke.cont unwind label %lpad
 
 invoke.cont:
@@ -182,9 +109,11 @@ lpad:
   resume { ptr, i32 } %0
 }
 
-declare i32 @_Z13get_ticket_idv()
-declare ptr @__cxa_allocate_exception(i64)
-declare void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1), ptr, i32)
+declare i1 @llvm.type.test(ptr, metadata) #2
+declare void @llvm.assume(i1)
+declare i32 @__gxx_personality_v0(...)
+
+attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 
 !0 = !{i64 16, !"_ZTS4Base"}
 !1 = !{i64 16, !"_ZTSM4BaseFivE.virtual"}
@@ -193,9 +122,6 @@ declare void @_ZN5ErrorC1EPKci(ptr nonnull align 1 dereferenceable(1), ptr, i32)
 !4 = !{!"VP", i32 2, i64 1600, i64 13870436605473471591, i64 900, i64 1960855528937986108, i64 700}
 !5 = !{!"VP", i32 0, i64 1600, i64 14811317294552474744, i64 900, i64 9261744921105590125, i64 700}
 
-; ICALL-FUNC: [[PROF4]] = !{!"VP", i32 2, i64 1600, i64 -4576307468236080025, i64 900, i64 1960855528937986108, i64 700}
-; ICALL-FUNC: [[PROF5]] = !{!"branch_weights", i32 900, i32 700}
-; ICALL-FUNC: [[PROF6]] = !{!"branch_weights", i32 700, i32 0}
-
-; ICALL-VTABLE: [[PROF4]] = !{!"branch_weights", i32 900, i32 700}
-; ICALL-VTABLE: [[PROF5]] = !{!"branch_weights", i32 700, i32 0}
+; VTABLE: [[PROF4]] = !{!"branch_weights", i32 900, i32 700}
+; VTABLE: [[PROF5]] = !{!"branch_weights", i32 700, i32 0}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
index 94ed588c5458d..1dc208c30952e 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
@@ -1,57 +1,36 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
-; RUN: opt < %s -passes=pgo-icall-prom -pass-remarks=pgo-icall-prom -S 2>&1 | FileCheck %s --check-prefix=ICALL-FUNC
-; RUN: opt < %s -passes='pgo-icall-prom,instcombine' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -S 2>&1 | FileCheck %s --check-prefix=ICALL-VTABLE
+; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -S 2>&1 | FileCheck %s --check-prefixes=VTABLE,REMARK
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@_ZTV7Derived = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived5func1Eii] }, align 8, !type !0, !type !1, !type !2, !type !3
-@_ZTV4Base = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Eii] }, align 8, !type !0, !type !1
+; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN7Derived5func1Eii with count 900 out of 1600, compare 1 vtables and sink 1 instructions
+; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN4Base5func1Eii with count 700 out of 700, compare 1 vtables and sink 1 instructions
+
+@_ZTV7Derived = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived5func1Eii] }, align 8, !type !0, !type !1, !type !2, !type !3
+@_ZTV4Base = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Eii] }, align 8, !type !0, !type !1
 
 define i32 @test_tail_call(ptr %ptr, i32 %a, i32 %b) {
-; ICALL-FUNC-LABEL: define i32 @test_tail_call(
-; ICALL-FUNC-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
-; ICALL-FUNC-NEXT:  entry:
-; ICALL-FUNC-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[PTR]], align 8, !prof [[PROF4:![0-9]+]]
-; ICALL-FUNC-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-FUNC-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-FUNC-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
-; ICALL-FUNC-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @_ZN7Derived5func1Eii
-; ICALL-FUNC-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[TMP4:%.*]], !prof [[PROF5:![0-9]+]]
-; ICALL-FUNC:       if.true.direct_targ:
-; ICALL-FUNC-NEXT:    [[TMP3:%.*]] = musttail call i32 @_ZN7Derived5func1Eii(ptr [[PTR]], i32 [[A]], i32 [[B]])
-; ICALL-FUNC-NEXT:    ret i32 [[TMP3]]
-; ICALL-FUNC:       4:
-; ICALL-FUNC-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[TMP1]], @_ZN4Base5func1Eii
-; ICALL-FUNC-NEXT:    br i1 [[TMP5]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[TMP7:%.*]], !prof [[PROF6:![0-9]+]]
-; ICALL-FUNC:       if.true.direct_targ1:
-; ICALL-FUNC-NEXT:    [[TMP6:%.*]] = musttail call i32 @_ZN4Base5func1Eii(ptr [[PTR]], i32 [[A]], i32 [[B]])
-; ICALL-FUNC-NEXT:    ret i32 [[TMP6]]
-; ICALL-FUNC:       7:
-; ICALL-FUNC-NEXT:    [[CALL:%.*]] = musttail call i32 [[TMP1]](ptr [[PTR]], i32 [[A]], i32 [[B]])
-; ICALL-FUNC-NEXT:    ret i32 [[CALL]]
-;
-; ICALL-VTABLE-LABEL: define i32 @test_tail_call(
-; ICALL-VTABLE-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
-; ICALL-VTABLE-NEXT:  entry:
-; ICALL-VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[PTR]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
-; ICALL-VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
-; ICALL-VTABLE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
-; ICALL-VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[TMP4:%.*]], !prof [[PROF4:![0-9]+]]
-; ICALL-VTABLE:       if.true.direct_targ:
-; ICALL-VTABLE-NEXT:    [[TMP3:%.*]] = musttail call i32 @_ZN7Derived5func1Eii(ptr nonnull [[PTR]], i32 [[A]], i32 [[B]])
-; ICALL-VTABLE-NEXT:    ret i32 [[TMP3]]
-; ICALL-VTABLE:       4:
-; ICALL-VTABLE-NEXT:    [[TMP5:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2)
-; ICALL-VTABLE-NEXT:    br i1 [[TMP5]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[TMP7:%.*]], !prof [[PROF5:![0-9]+]]
-; ICALL-VTABLE:       if.true.direct_targ1:
-; ICALL-VTABLE-NEXT:    [[TMP6:%.*]] = musttail call i32 @_ZN4Base5func1Eii(ptr nonnull [[PTR]], i32 [[A]], i32 [[B]])
-; ICALL-VTABLE-NEXT:    ret i32 [[TMP6]]
-; ICALL-VTABLE:       7:
-; ICALL-VTABLE-NEXT:    [[CALL:%.*]] = musttail call i32 [[TMP1]](ptr nonnull [[PTR]], i32 [[A]], i32 [[B]])
-; ICALL-VTABLE-NEXT:    ret i32 [[CALL]]
+; VTABLE-LABEL: define i32 @test_tail_call(
+; VTABLE-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; VTABLE-NEXT:  entry:
+; VTABLE-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[PTR]], align 8
+; VTABLE-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.type.test(ptr [[VTABLE]], metadata !"_ZTS4Base")
+; VTABLE-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
+; VTABLE-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTV7Derived, i32 16)
+; VTABLE-NEXT:    br i1 [[TMP2]], label [[IF_TRUE_DIRECT_TARG:%.*]], label [[TMP4:%.*]], !prof [[PROF4:![0-9]+]]
+; VTABLE:       if.true.direct_targ:
+; VTABLE-NEXT:    [[TMP3:%.*]] = musttail call i32 @_ZN7Derived5func1Eii(ptr [[PTR]], i32 [[A]], i32 [[B]])
+; VTABLE-NEXT:    ret i32 [[TMP3]]
+; VTABLE:       3:
+; VTABLE-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTV4Base, i32 16)
+; VTABLE-NEXT:    br i1 [[TMP4]], label [[IF_TRUE_DIRECT_TARG1:%.*]], label [[TMP7:%.*]], !prof [[PROF5:![0-9]+]]
+; VTABLE:       if.true.direct_targ1:
+; VTABLE-NEXT:    [[TMP6:%.*]] = musttail call i32 @_ZN4Base5func1Eii(ptr [[PTR]], i32 [[A]], i32 [[B]])
+; VTABLE-NEXT:    ret i32 [[TMP6]]
+; VTABLE:       6:
+; VTABLE-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; VTABLE-NEXT:    [[CALL:%.*]] = musttail call i32 [[TMP1]](ptr [[PTR]], i32 [[A]], i32 [[B]])
+; VTABLE-NEXT:    ret i32 [[CALL]]
 ;
 entry:
   %vtable = load ptr, ptr %ptr, !prof !4
@@ -84,9 +63,5 @@ entry:
 !4 = !{!"VP", i32 2, i64 1600, i64 13870436605473471591, i64 900, i64 1960855528937986108, i64 700}
 !5 = !{!"VP", i32 0, i64 1600, i64 7889036118036845314, i64 900, i64 10495086226207060333, i64 700}
 
-; ICALL-FUNC: [[PROF4]] = !{!"VP", i32 2, i64 1600, i64 -4576307468236080025, i64 900, i64 1960855528937986108, i64 700}
-; ICALL-FUNC: [[PROF5]] = !{!"branch_weights", i32 900, i32 700}
-; ICALL-FUNC: [[PROF6]] = !{!"branch_weights", i32 700, i32 0}
-
-; ICALL-VTABLE: [[PROF4]] = !{!"branch_weights", i32 900, i32 700}
-; ICALL-VTABLE: [[PROF5]] = !{!"branch_weights", i32 700, i32 0}
+; VTABLE: [[PROF4]] = !{!"branch_weights", i32 900, i32 700}
+; VTABLE: [[PROF5]] = !{!"branch_weights", i32 700, i32 0}

>From aefda4cd3cef1adf2817ebaafedc580ca5c48900 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Tue, 28 May 2024 14:26:39 -0700
Subject: [PATCH 07/16] Changes: 1. In Transforms/Utils/Local.{h,cpp}, remove
 debug intrinsic helper functions, and make it a TODO to handle debug
 intrinsics when sinking instructions. 2. In compiler-rt test, added lines
 to test that the IR is as expected.

---
 clang/lib/CodeGen/CGVTables.cpp               |   4 +-
 .../Linux/instrprof-vtable-value-prof.cpp     |  32 ++-
 llvm/include/llvm/Transforms/Utils/Local.h    |   9 -
 .../Instrumentation/IndirectCallPromotion.cpp |  17 +-
 llvm/lib/Transforms/Utils/Local.cpp           | 184 ------------------
 5 files changed, 38 insertions(+), 208 deletions(-)

diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index 8d9c22546b420..6cf1c93134c33 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -1344,7 +1344,9 @@ void CodeGenModule::EmitVTableTypeMetadata(const CXXRecordDecl *RD,
 
   ArrayRef<VTableComponent> Comps = VTLayout.vtable_components();
   for (auto AP : AddressPoints) {
-    // Create type metadata for the address point.
+    // llvm::errs() << VTable->getName() << "\n";
+    // llvm::errs() << AP.Offset << "\t" << AP.TypeName << "\n";
+    //  Create type metadata for the address point.
     AddVTableTypeMetadata(VTable, ComponentWidth * AP.Offset, AP.Base);
 
     // The class associated with each address point could also potentially be
diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index 73921adcc0c15..51f185ca93164 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -110,13 +110,43 @@
 // ICTEXT: _ZTV8Derived1:250
 
 // Test indirect call promotion transformation using vtable profiles.
-// RUN: %clangxx -fprofile-use=test.profdata -fuse-ld=lld -flto=thin -fwhole-program-vtables -O2 -mllvm -enable-vtable-value-profiling -mllvm -icp-enable-vtable-cmp -Rpass=pgo-icall-prom %s 2>&1 | FileCheck %s --check-prefix=REMARK --implicit-check-not="!VP"
+// Build with `-g` to enable debug information.
+// RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -g -flto=thin -fwhole-program-vtables -O2 -mllvm -enable-vtable-value-profiling -mllvm -icp-enable-vtable-cmp -Rpass=pgo-icall-prom -mllvm -print-after=pgo-icall-prom -mllvm -filter-print-funcs=main %s 2>&1 | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
 
 // REMARK: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, compare 1 vtables and sink 1 instructions
 // REMARK: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, compare 1 vtables and sink 1 instructions
 // REMARK: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, compare 1 vtables and sink 2 instructions
 // REMARK: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, compare 1 vtables and sink 2 instructions
 
+// IR-LABEL: @main
+// IR:   [[OBJ:%.*]] = call {{.*}} @_Z10createTypei
+// IR:   [[VTABLE:%.*]] = load ptr, ptr [[OBJ]]
+// IR:   [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E, i32 16)
+// IR:   br i1 [[CMP1]], label %[[BB1:.*]], label %[[BB2:[a-zA-Z0-9_.]+]],
+//
+// IR: [[BB1]]:
+// IR:   [[RESBB1:%.*]] = call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii
+// IR:   br label %[[MERGE0:[a-zA-Z0-9_.]+]]
+//
+// IR: [[BB2]]:
+// IR:   [[CMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTV8Derived1, i32 16)
+// IR:   br i1 [[CMP2]], label %[[BB3:.*]], label %[[BB4:[a-zA-Z0-9_.]+]],
+//
+// IR: [[BB3]]:
+// IR:   [[RESBB3:%.*]] = call {{.*}} @_ZN8Derived14funcEii
+// IR:   br label %[[MERGE1:[a-zA-Z0-9_.]+]],
+//
+// IR: [[BB4]]:
+// IR:   [[FUNCPTR:%.*]] = load ptr, ptr [[VTABLE]]
+// IR:   [[RESBB4:%.*]] = call {{.*}} [[FUNCPTR]]
+// IR:   br label %[[MERGE1]]
+//
+// IR: [[MERGE1]]:
+// IR:    [[RES1:%.*]] = phi i32 [ [[RESBB4]], %[[BB4]] ], [ [[RESBB3]], %[[BB3]] ]
+// IR:    br label %[[MERGE0]]
+//
+// IR: [[MERGE0]]:
+// IR:    [[RES2:%.*]] = phi i32 [ [[RES1]], %[[MERGE1]] ], [ [[RESBB1]], %[[BB1]] ]
 #include <cstdio>
 #include <cstdlib>
 class Base {
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 5535a722a40fe..6937ec8dfd21c 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -316,15 +316,6 @@ void salvageDebugInfoForDbgValues(Instruction &I,
                                   ArrayRef<DbgVariableIntrinsic *> Insns,
                                   ArrayRef<DbgVariableRecord *> DPInsns);
 
-void tryToSinkInstructionDbgValues(
-    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-    BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers);
-
-void tryToSinkInstructionDPValues(
-    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-    BasicBlock *DestBlock,
-    SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords);
-
 /// Given an instruction \p I and DIExpression \p DIExpr operating on
 /// it, append the effects of \p I to the DIExpression operand list
 /// \p Ops, or return \p nullptr if it cannot be salvaged.
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 4de0aaef8d7ca..6c239a96828f4 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -233,7 +233,6 @@ static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
     return false;
 
   assert(DestBlock->getUniquePredecessor() == I->getParent());
-  BasicBlock *SrcBlock = I->getParent();
 
   // Do not move control-flow-involving, volatile loads, vaarg, etc.
   // Do not sink static or dynamic alloca instructions. Static allocas must
@@ -267,18 +266,10 @@ static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
   BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
   I->moveBefore(*DestBlock, InsertPos);
 
-  // Also sink all related debug uses from the source basic block. Otherwise we
-  // get debug use before the def. Attempt to salvage debug uses first, to
-  // maximise the range variables have location for. If we cannot salvage, then
-  // mark the location undef: we know it was supposed to receive a new location
-  // here, but that computation has been sunk.
-  SmallVector<DbgVariableIntrinsic *> DbgUsers;
-  SmallVector<DbgVariableRecord *> DPValues;
-  findDbgUsers(DbgUsers, I, &DPValues);
-  if (!DbgUsers.empty())
-    tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers);
-  if (!DPValues.empty())
-    tryToSinkInstructionDPValues(I, InsertPos, SrcBlock, DestBlock, DPValues);
+  // TODO: Sink debug intrinsic users of I to 'DestBlock'.
+  // 'InstCombinerImpl::tryToSinkInstructionDbgValues' and
+  // 'InstCombinerImpl::tryToSinkInstructionDbgVariableRecords' already have
+  // the core logic to do this.
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 305770d2b7c91..f3cd3104c3128 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2538,190 +2538,6 @@ Value *getSalvageOpsForIcmpOp(ICmpInst *Icmp, uint64_t CurrentLocOps,
   return Icmp->getOperand(0);
 }
 
-void llvm::tryToSinkInstructionDbgValues(
-    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-    BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers) {
-  // For all debug values in the destination block, the sunk instruction
-  // will still be available, so they do not need to be dropped.
-  SmallVector<DbgVariableIntrinsic *, 2> DbgUsersToSalvage;
-  for (auto &DbgUser : DbgUsers)
-    if (DbgUser->getParent() != DestBlock)
-      DbgUsersToSalvage.push_back(DbgUser);
-
-  // Process the sinking DbgUsersToSalvage in reverse order, as we only want
-  // to clone the last appearing debug intrinsic for each given variable.
-  SmallVector<DbgVariableIntrinsic *, 2> DbgUsersToSink;
-  for (DbgVariableIntrinsic *DVI : DbgUsersToSalvage)
-    if (DVI->getParent() == SrcBlock)
-      DbgUsersToSink.push_back(DVI);
-  llvm::sort(DbgUsersToSink,
-             [](auto *A, auto *B) { return B->comesBefore(A); });
-
-  SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
-  SmallSet<DebugVariable, 4> SunkVariables;
-  for (auto *User : DbgUsersToSink) {
-    // A dbg.declare instruction should not be cloned, since there can only be
-    // one per variable fragment. It should be left in the original place
-    // because the sunk instruction is not an alloca (otherwise we could not be
-    // here).
-    if (isa<DbgDeclareInst>(User))
-      continue;
-
-    DebugVariable DbgUserVariable =
-        DebugVariable(User->getVariable(), User->getExpression(),
-                      User->getDebugLoc()->getInlinedAt());
-
-    if (!SunkVariables.insert(DbgUserVariable).second)
-      continue;
-
-    // Leave dbg.assign intrinsics in their original positions and there should
-    // be no need to insert a clone.
-    if (isa<DbgAssignIntrinsic>(User))
-      continue;
-
-    DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
-    if (isa<DbgDeclareInst>(User) && isa<CastInst>(I))
-      DIIClones.back()->replaceVariableLocationOp(I, I->getOperand(0));
-    LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
-  }
-
-  // Perform salvaging without the clones, then sink the clones.
-  if (!DIIClones.empty()) {
-    salvageDebugInfoForDbgValues(*I, DbgUsersToSalvage, {});
-    // The clones are in reverse order of original appearance, reverse again to
-    // maintain the original order.
-    for (auto &DIIClone : llvm::reverse(DIIClones)) {
-      DIIClone->insertBefore(&*InsertPos);
-      LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
-    }
-  }
-}
-
-void llvm::tryToSinkInstructionDPValues(
-    Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-    BasicBlock *DestBlock,
-    SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
-  // Fetch all DbgVariableRecords not already in the destination.
-  SmallVector<DbgVariableRecord *, 2> DbgVariableRecordsToSalvage;
-  for (auto &DVR : DbgVariableRecords)
-    if (DVR->getParent() != DestBlock)
-      DbgVariableRecordsToSalvage.push_back(DVR);
-
-  // Fetch a second collection, of DbgVariableRecords in the source block that
-  // we're going to sink.
-  SmallVector<DbgVariableRecord *> DbgVariableRecordsToSink;
-  for (DbgVariableRecord *DVR : DbgVariableRecordsToSalvage)
-    if (DVR->getParent() == SrcBlock)
-      DbgVariableRecordsToSink.push_back(DVR);
-
-  // Sort DbgVariableRecords according to their position in the block. This is a
-  // partial order: DbgVariableRecords attached to different instructions will
-  // be ordered by the instruction order, but DbgVariableRecords attached to the
-  // same instruction won't have an order.
-  auto Order = [](DbgVariableRecord *A, DbgVariableRecord *B) -> bool {
-    return B->getInstruction()->comesBefore(A->getInstruction());
-  };
-  llvm::stable_sort(DbgVariableRecordsToSink, Order);
-
-  // If there are two assignments to the same variable attached to the same
-  // instruction, the ordering between the two assignments is important. Scan
-  // for this (rare) case and establish which is the last assignment.
-  using InstVarPair = std::pair<const Instruction *, DebugVariable>;
-  SmallDenseMap<InstVarPair, DbgVariableRecord *> FilterOutMap;
-  if (DbgVariableRecordsToSink.size() > 1) {
-    SmallDenseMap<InstVarPair, unsigned> CountMap;
-    // Count how many assignments to each variable there is per instruction.
-    for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
-      DebugVariable DbgUserVariable =
-          DebugVariable(DVR->getVariable(), DVR->getExpression(),
-                        DVR->getDebugLoc()->getInlinedAt());
-      CountMap[std::make_pair(DVR->getInstruction(), DbgUserVariable)] += 1;
-    }
-
-    // If there are any instructions with two assignments, add them to the
-    // FilterOutMap to record that they need extra filtering.
-    SmallPtrSet<const Instruction *, 4> DupSet;
-    for (auto It : CountMap) {
-      if (It.second > 1) {
-        FilterOutMap[It.first] = nullptr;
-        DupSet.insert(It.first.first);
-      }
-    }
-
-    // For all instruction/variable pairs needing extra filtering, find the
-    // latest assignment.
-    for (const Instruction *Inst : DupSet) {
-      for (DbgVariableRecord &DVR :
-           llvm::reverse(filterDbgVars(Inst->getDbgRecordRange()))) {
-        DebugVariable DbgUserVariable =
-            DebugVariable(DVR.getVariable(), DVR.getExpression(),
-                          DVR.getDebugLoc()->getInlinedAt());
-        auto FilterIt =
-            FilterOutMap.find(std::make_pair(Inst, DbgUserVariable));
-        if (FilterIt == FilterOutMap.end())
-          continue;
-        if (FilterIt->second != nullptr)
-          continue;
-        FilterIt->second = &DVR;
-      }
-    }
-  }
-
-  // Perform cloning of the DbgVariableRecords that we plan on sinking, filter
-  // out any duplicate assignments identified above.
-  SmallVector<DbgVariableRecord *, 2> DVRClones;
-  SmallSet<DebugVariable, 4> SunkVariables;
-  for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
-    if (DVR->Type == DbgVariableRecord::LocationType::Declare)
-      continue;
-
-    DebugVariable DbgUserVariable =
-        DebugVariable(DVR->getVariable(), DVR->getExpression(),
-                      DVR->getDebugLoc()->getInlinedAt());
-
-    // For any variable where there were multiple assignments in the same place,
-    // ignore all but the last assignment.
-    if (!FilterOutMap.empty()) {
-      InstVarPair IVP = std::make_pair(DVR->getInstruction(), DbgUserVariable);
-      auto It = FilterOutMap.find(IVP);
-
-      // Filter out.
-      if (It != FilterOutMap.end() && It->second != DVR)
-        continue;
-    }
-
-    if (!SunkVariables.insert(DbgUserVariable).second)
-      continue;
-
-    if (DVR->isDbgAssign())
-      continue;
-
-    DVRClones.emplace_back(DVR->clone());
-    LLVM_DEBUG(dbgs() << "CLONE: " << *DVRClones.back() << '\n');
-  }
-
-  // Perform salvaging without the clones, then sink the clones.
-  if (DVRClones.empty())
-    return;
-
-  salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage);
-
-  // The clones are in reverse order of original appearance. Assert that the
-  // head bit is set on the iterator as we _should_ have received it via
-  // getFirstInsertionPt. Inserting like this will reverse the clone order as
-  // we'll repeatedly insert at the head, such as:
-  //   DVR-3 (third insertion goes here)
-  //   DVR-2 (second insertion goes here)
-  //   DVR-1 (first insertion goes here)
-  //   Any-Prior-DVRs
-  //   InsertPtInst
-  assert(InsertPos.getHeadBit());
-  for (DbgVariableRecord *DVRClone : DVRClones) {
-    InsertPos->getParent()->insertDbgRecordBefore(DVRClone, InsertPos);
-    LLVM_DEBUG(dbgs() << "SINK: " << *DVRClone << '\n');
-  }
-}
-
 Value *llvm::salvageDebugInfoImpl(Instruction &I, uint64_t CurrentLocOps,
                                   SmallVectorImpl<uint64_t> &Ops,
                                   SmallVectorImpl<Value *> &AdditionalValues) {

>From 2c87c4785ebe763d4c8654bb542d0d26f4c8b39b Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Tue, 28 May 2024 14:33:04 -0700
Subject: [PATCH 08/16] undo changes to clang/lib/CodeGen/CGVTables.cpp

---
 clang/lib/CodeGen/CGVTables.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index 6cf1c93134c33..8d9c22546b420 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -1344,9 +1344,7 @@ void CodeGenModule::EmitVTableTypeMetadata(const CXXRecordDecl *RD,
 
   ArrayRef<VTableComponent> Comps = VTLayout.vtable_components();
   for (auto AP : AddressPoints) {
-    // llvm::errs() << VTable->getName() << "\n";
-    // llvm::errs() << AP.Offset << "\t" << AP.TypeName << "\n";
-    //  Create type metadata for the address point.
+    // Create type metadata for the address point.
     AddVTableTypeMetadata(VTable, ComponentWidth * AP.Offset, AP.Base);
 
     // The class associated with each address point could also potentially be

>From 142845ce3830a43ad80a5ad81d4c6518c2eebd8e Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Thu, 30 May 2024 11:13:31 -0700
Subject: [PATCH 09/16] follow up on review feedback

---
 .../profile/Linux/instrprof-vtable-value-prof.cpp    | 12 ++++++++----
 .../Instrumentation/IndirectCallPromotion.cpp        | 11 ++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index 51f185ca93164..19def403cca7a 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -113,11 +113,15 @@
 // Build with `-g` to enable debug information.
 // RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -g -flto=thin -fwhole-program-vtables -O2 -mllvm -enable-vtable-value-profiling -mllvm -icp-enable-vtable-cmp -Rpass=pgo-icall-prom -mllvm -print-after=pgo-icall-prom -mllvm -filter-print-funcs=main %s 2>&1 | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
 
-// REMARK: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, compare 1 vtables and sink 1 instructions
-// REMARK: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, compare 1 vtables and sink 1 instructions
-// REMARK: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, compare 1 vtables and sink 2 instructions
-// REMARK: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, compare 1 vtables and sink 2 instructions
+// For the indirect call site `ptr->func`
+// REMARK: instrprof-vtable-value-prof.cpp:191:19: remark: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, compare 1 vtables and sink 1 instructions
+// REMARK: instrprof-vtable-value-prof.cpp:191:19: remark: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, compare 1 vtables and sink 1 instructions
+//
+// For the indirect call site `delete ptr`
+// REMARK: instrprof-vtable-value-prof.cpp:193:5: remark: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, compare 1 vtables and sink 2 instructions
+// REMARK: instrprof-vtable-value-prof.cpp:193:5: remark: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, compare 1 vtables and sink 2 instructions
 
+// The IR matchers for indirect callsite `ptr->func`.
 // IR-LABEL: @main
 // IR:   [[OBJ:%.*]] = call {{.*}} @_Z10createTypei
 // IR:   [[VTABLE:%.*]] = load ptr, ptr [[OBJ]]
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 6c239a96828f4..5384fa4e37946 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -182,11 +182,9 @@ static Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
 }
 
 // Returns the basic block in which `Inst` by `Use`.
-static BasicBlock *getUserBasicBlock(Instruction *Inst, unsigned int OperandNo,
-                                     Instruction *UserInst) {
+static BasicBlock *getUserBasicBlock(Use &U, Instruction *UserInst) {
   if (PHINode *PN = dyn_cast<PHINode>(UserInst))
-    return PN->getIncomingBlock(
-        PHINode::getIncomingValueNumForOperand(OperandNo));
+    return PN->getIncomingBlock(U);
 
   return UserInst->getParent();
 }
@@ -216,7 +214,7 @@ static bool isDestBBSuitableForSink(Instruction *Inst, BasicBlock *DestBB) {
     // We can sink debug or pseudo instructions together with Inst.
     if (UserInst->isDebugOrPseudoInst())
       continue;
-    UserBB = getUserBasicBlock(Inst, Use.getOperandNo(), UserInst);
+    UserBB = getUserBasicBlock(Use, UserInst);
     // Do not sink if Inst is used in a basic block that is not DestBB.
     // TODO: Sink to the common dominator of all user blocks.
     if (UserBB != DestBB)
@@ -673,8 +671,6 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
     MutableArrayRef<InstrProfValueData> ICallProfDataRef,
     VTableGUIDCountsMap &VTableGUIDCounts) {
   SmallVector<uint64_t, 4> PromotedFuncCount;
-  // TODO: Explain the branch accuracy (-fstrict-vtable-pointer) with a
-  // compiler-rt test.
   for (const auto &Candidate : Candidates) {
     uint64_t IfCount = 0;
     for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts) {
@@ -682,6 +678,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
       VTableGUIDCounts[GUID] -= Count;
     }
 
+    // Use indirect call counters to compute branch weights.
     BasicBlock *OriginalBB = CB.getParent();
     promoteCallWithVTableCmp(
         CB, VPtr, Candidate.TargetFunction, Candidate.AddressPoints,

>From 4f6b7ab1e5640a36316cffc3f879b6c6f4408d54 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Thu, 6 Jun 2024 23:12:04 -0700
Subject: [PATCH 10/16] Changes 1. Resolve review comments. 2. Handle vtable's
 PGO name, like what we do for indirect-call    promotion.    -
 InstrProf.h/cpp and PGOInstrumentation.cpp are modified. 3. Make use of
 'MaxNumVTableAnnotations' in PGOInstrumentation.cpp

---
 .../Linux/instrprof-vtable-value-prof.cpp     |  34 +-
 llvm/include/llvm/ProfileData/InstrProf.h     |  10 +
 llvm/lib/ProfileData/InstrProf.cpp            |  40 ++-
 .../Instrumentation/IndirectCallPromotion.cpp | 301 ++++++++++--------
 .../Instrumentation/PGOInstrumentation.cpp    |  30 +-
 .../Transforms/PGOProfile/icp_vtable_cmp.ll   |  34 +-
 .../PGOProfile/icp_vtable_invoke.ll           |  10 +-
 .../PGOProfile/icp_vtable_tail_call.ll        |   7 +-
 8 files changed, 279 insertions(+), 187 deletions(-)

diff --git a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
index 19def403cca7a..0a32034f182a4 100644
--- a/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
+++ b/compiler-rt/test/profile/Linux/instrprof-vtable-value-prof.cpp
@@ -110,26 +110,40 @@
 // ICTEXT: _ZTV8Derived1:250
 
 // Test indirect call promotion transformation using vtable profiles.
-// Build with `-g` to enable debug information.
-// RUN: %clangxx -m64 -fprofile-use=test.profdata -fuse-ld=lld -g -flto=thin -fwhole-program-vtables -O2 -mllvm -enable-vtable-value-profiling -mllvm -icp-enable-vtable-cmp -Rpass=pgo-icall-prom -mllvm -print-after=pgo-icall-prom -mllvm -filter-print-funcs=main %s 2>&1 | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
+// - Build with `-g` to enable debug information.
+// - In real world settings, ICP pass is disabled in prelink pipeline. In
+//   the postlink pipeline, ICP is enabled after whole-program-devirtualization
+//   pass. Do the same thing in this test.
+// - Enable `-fwhole-program-vtables` to generate type metadata and intrinsics.
+// - Enable `-fno-split-lto-unit` and `-Wl,-lto-whole-program-visibility` to
+//   preserve type intrinsics for ICP pass.
+// RUN: %clangxx -m64  -fprofile-use=test.profdata -Wl,--lto-whole-program-visibility \
+// RUN:    -mllvm -disable-icp=true -Wl,-mllvm,-disable-icp=false -fuse-ld=lld \
+// RUN:    -g -flto=thin -fwhole-program-vtables -fno-split-lto-unit -O2 \
+// RUN:    -mllvm -enable-vtable-value-profiling -Wl,-mllvm,-enable-vtable-value-profiling \
+// RUN:    -mllvm -enable-vtable-profile-use \
+// RUN:    -Wl,-mllvm,-enable-vtable-profile-use -Rpass=pgo-icall-prom \
+// RUN:    -Wl,-mllvm,-print-after=pgo-icall-prom \
+// RUN:    -Wl,-mllvm,-filter-print-funcs=main %s 2>&1 \
+// RUN:    | FileCheck %s --check-prefixes=REMARK,IR --implicit-check-not="!VP"
 
 // For the indirect call site `ptr->func`
-// REMARK: instrprof-vtable-value-prof.cpp:191:19: remark: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, compare 1 vtables and sink 1 instructions
-// REMARK: instrprof-vtable-value-prof.cpp:191:19: remark: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, compare 1 vtables and sink 1 instructions
+// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN12_GLOBAL__N_18Derived24funcEii with count 150 out of 200, compare 1 vtables and sink 1 instructions
+// REMARK: instrprof-vtable-value-prof.cpp:205:19: Promote indirect call to _ZN8Derived14funcEii with count 50 out of 50, compare 1 vtables and sink 1 instructions
 //
 // For the indirect call site `delete ptr`
-// REMARK: instrprof-vtable-value-prof.cpp:193:5: remark: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, compare 1 vtables and sink 2 instructions
-// REMARK: instrprof-vtable-value-prof.cpp:193:5: remark: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, compare 1 vtables and sink 2 instructions
+// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN12_GLOBAL__N_18Derived2D0Ev with count 750 out of 1000, compare 1 vtables and sink 2 instructions
+// REMARK: instrprof-vtable-value-prof.cpp:207:5: Promote indirect call to _ZN8Derived1D0Ev with count 250 out of 250, compare 1 vtables and sink 2 instructions
 
 // The IR matchers for indirect callsite `ptr->func`.
 // IR-LABEL: @main
-// IR:   [[OBJ:%.*]] = call {{.*}} @_Z10createTypei
+// IR:   [[OBJ:%.*]] = {{.*}}call {{.*}} @_Z10createTypei
 // IR:   [[VTABLE:%.*]] = load ptr, ptr [[OBJ]]
 // IR:   [[CMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @_ZTVN12_GLOBAL__N_18Derived2E, i32 16)
 // IR:   br i1 [[CMP1]], label %[[BB1:.*]], label %[[BB2:[a-zA-Z0-9_.]+]],
 //
 // IR: [[BB1]]:
-// IR:   [[RESBB1:%.*]] = call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii
+// IR:   [[RESBB1:%.*]] = {{.*}}call {{.*}} @_ZN12_GLOBAL__N_18Derived24funcEii
 // IR:   br label %[[MERGE0:[a-zA-Z0-9_.]+]]
 //
 // IR: [[BB2]]:
@@ -137,12 +151,12 @@
 // IR:   br i1 [[CMP2]], label %[[BB3:.*]], label %[[BB4:[a-zA-Z0-9_.]+]],
 //
 // IR: [[BB3]]:
-// IR:   [[RESBB3:%.*]] = call {{.*}} @_ZN8Derived14funcEii
+// IR:   [[RESBB3:%.*]] = {{.*}}call {{.*}} @_ZN8Derived14funcEii
 // IR:   br label %[[MERGE1:[a-zA-Z0-9_.]+]],
 //
 // IR: [[BB4]]:
 // IR:   [[FUNCPTR:%.*]] = load ptr, ptr [[VTABLE]]
-// IR:   [[RESBB4:%.*]] = call {{.*}} [[FUNCPTR]]
+// IR:   [[RESBB4:%.*]] = {{.*}}call {{.*}} [[FUNCPTR]]
 // IR:   br label %[[MERGE1]]
 //
 // IR: [[MERGE1]]:
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 88c7fe425b5a5..817005bd28d88 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -304,8 +304,12 @@ getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind,
                          uint32_t MaxNumValueData, uint32_t &ActualNumValueData,
                          uint64_t &TotalC, bool GetNoICPValue = false);
 
+// TODO: Unify metadata name 'PGOFuncName' and 'PGOName', by supporting read
+// of this metadata for backward compatibility and generating 'PGOName' only.
 inline StringRef getPGOFuncNameMetadataName() { return "PGOFuncName"; }
 
+inline StringRef getPGONameMetadataName() { return "PGOName"; }
+
 /// Return the PGOFuncName meta data associated with a function.
 MDNode *getPGOFuncNameMetadata(const Function &F);
 
@@ -314,8 +318,14 @@ std::string getPGOName(const GlobalVariable &V, bool InLTO = false);
 /// Create the PGOFuncName meta data if PGOFuncName is different from
 /// function's raw name. This should only apply to internal linkage functions
 /// declared by users only.
+/// TODO: Update all callers to 'createPGONameMetadata' and deprecate this
+/// function.
 void createPGOFuncNameMetadata(Function &F, StringRef PGOFuncName);
 
+/// Create the PGOName metadata if a global object's PGO name is different from
+/// its mangled name. This should apply to local-linkage global objects only.
+void createPGONameMetadata(GlobalObject &GO, StringRef PGOName);
+
 /// Check if we can use Comdat for profile variables. This will eliminate
 /// the duplicated profile variables for Comdat functions.
 bool needsComdatForCounter(const GlobalObject &GV, const Module &M);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 806d01de1ada5..4649db2d92ec5 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -228,6 +228,12 @@ cl::opt<bool> EnableVTableValueProfiling(
              "the types of a C++ pointer. The information is used in indirect "
              "call promotion to do selective vtable-based comparison."));
 
+cl::opt<bool> EnableVTableProfileUse(
+    "enable-vtable-profile-use", cl::init(false),
+    cl::desc("If ThinLTO and WPD is enabled and this option is true, vtable "
+             "profiles will be used by ICP pass for more efficient indirect "
+             "call sequence. If false, type profiles won't be used."));
+
 std::string getInstrProfSectionName(InstrProfSectKind IPSK,
                                     Triple::ObjectFormatType OF,
                                     bool AddSegmentInfo) {
@@ -391,7 +397,7 @@ std::string getPGOName(const GlobalVariable &V, bool InLTO) {
   // PGONameMetadata should be set by compiler at profile use time
   // and read by symtab creation to look up symbols corresponding to
   // a MD5 hash.
-  return getIRPGOObjectName(V, InLTO, /*PGONameMetadata=*/nullptr);
+  return getIRPGOObjectName(V, InLTO, V.getMetadata(getPGONameMetadataName()));
 }
 
 // See getIRPGOObjectName() for a discription of the format.
@@ -480,8 +486,7 @@ Error InstrProfSymtab::create(Module &M, bool InLTO) {
   for (GlobalVariable &G : M.globals()) {
     if (!G.hasName() || !G.hasMetadata(LLVMContext::MD_type))
       continue;
-    if (Error E = addVTableWithName(
-            G, getIRPGOObjectName(G, InLTO, /* PGONameMetadata */ nullptr)))
+    if (Error E = addVTableWithName(G, getPGOName(G, InLTO)))
       return E;
   }
 
@@ -1393,16 +1398,27 @@ MDNode *getPGOFuncNameMetadata(const Function &F) {
   return F.getMetadata(getPGOFuncNameMetadataName());
 }
 
-void createPGOFuncNameMetadata(Function &F, StringRef PGOFuncName) {
-  // Only for internal linkage functions.
-  if (PGOFuncName == F.getName())
-      return;
-  // Don't create duplicated meta-data.
-  if (getPGOFuncNameMetadata(F))
+static void createPGONameMetadata(GlobalObject &GO, StringRef MetadataName,
+                                  StringRef PGOName) {
+  // For internal linkage objects, its name is not the same as its PGO name.
+  if (GO.getName() == PGOName)
     return;
-  LLVMContext &C = F.getContext();
-  MDNode *N = MDNode::get(C, MDString::get(C, PGOFuncName));
-  F.setMetadata(getPGOFuncNameMetadataName(), N);
+
+  // Don't create duplicated metadata.
+  if (GO.getMetadata(MetadataName))
+    return;
+
+  LLVMContext &C = GO.getContext();
+  MDNode *N = MDNode::get(C, MDString::get(C, PGOName));
+  GO.setMetadata(MetadataName, N);
+}
+
+void createPGOFuncNameMetadata(Function &F, StringRef PGOFuncName) {
+  return createPGONameMetadata(F, getPGOFuncNameMetadataName(), PGOFuncName);
+}
+
+void createPGONameMetadata(GlobalObject &GO, StringRef PGOName) {
+  return createPGONameMetadata(GO, getPGONameMetadataName(), PGOName);
 }
 
 bool needsComdatForCounter(const GlobalObject &GO, const Module &M) {
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 5384fa4e37946..c805cf3f22f36 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -58,6 +58,10 @@ STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
 
 extern cl::opt<unsigned> MaxNumVTableAnnotations;
 
+namespace llvm {
+extern cl::opt<bool> EnableVTableProfileUse;
+}
+
 // Command line option to disable indirect-call promotion with the default as
 // false. This is for debug purpose.
 static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
@@ -110,29 +114,31 @@ static cl::opt<bool>
     ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
                  cl::desc("Dump IR after transformation happens"));
 
-// This option is meant to be used by LLVM regression test and test the
-// transformation that compares vtables.
-static cl::opt<bool> ICPEnableVTableCmp(
-    "icp-enable-vtable-cmp", cl::init(false), cl::Hidden,
-    cl::desc("If ThinLTO and WPD is enabled and this option is true, "
-             "indirect-call promotion pass will compare vtables rather than "
-             "functions for speculative devirtualization of virtual calls."
-             " If set to false, indirect-call promotion pass will always "
-             "compare functions."));
-
-static cl::opt<float>
-    ICPVTableCountPercentage("icp-vtable-count-percentage", cl::init(0.99),
-                             cl::Hidden,
-                             cl::desc("Percentage of vtable count to compare"));
-
-static cl::opt<int> ICPNumAdditionalVTableLast(
-    "icp-num-additional-vtable-last", cl::init(0), cl::Hidden,
-    cl::desc("The number of additional instruction for the last candidate"));
+// Indirect call promotion pass will fall back to function-based comparison if
+// vtable-count / function-count is smaller than this threshold.
+static cl::opt<float> ICPVTablePercentageThreshold(
+    "icp-vtable-percentage-threshold", cl::init(0.99), cl::Hidden,
+    cl::desc("The percentage threshold of vtable-count / function-count for "
+             "cost-benefit analysis. "));
+
+// Although comparing vtables can save a vtable load, we may need to compare
+// vtable pointer with multiple vtable address points due to class inheritance.
+// Comparing with multiple vtables inserts additional instructions on hot code
+// path; and doing so for earlier candidate of one icall can affect later
+// function candidate in an undesired way. We allow multiple vtable comparison
+// for the last function candidate and use the option below to cap the number
+// of vtables.
+static cl::opt<int> ICPMaxNumVTableLastCandidate(
+    "icp-max-num-vtable-last-candidate", cl::init(1), cl::Hidden,
+    cl::desc("The maximum number of vtable for the last candidate."));
 
 namespace {
 
+// The key is a vtable global variable, and the value is a map.
+// In the inner map, the key represents address point offsets and the value is a
+// constant for this address point.
 using VTableAddressPointOffsetValMap =
-    SmallDenseMap<const GlobalVariable *, SmallDenseMap<int, Constant *, 4>, 8>;
+    SmallDenseMap<const GlobalVariable *, SmallDenseMap<int, Constant *>>;
 
 // A struct to collect type information for a virtual call site.
 struct VirtualCallSiteInfo {
@@ -146,19 +152,25 @@ struct VirtualCallSiteInfo {
 
 // The key is a virtual call, and value is its type information.
 using VirtualCallSiteTypeInfoMap =
-    SmallDenseMap<const CallBase *, VirtualCallSiteInfo, 8>;
+    SmallDenseMap<const CallBase *, VirtualCallSiteInfo>;
+
+// The key is vtable GUID, and value is its value profile count.
+using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t>;
 
-// Find the offset where type string is `CompatibleType`.
+// Returns the address point offset of the given compatible type.
+//
+// Type metadata of a vtable specifies the types that can contain a pointer to
+// this vtable, for example, `Base*` can be a pointer to an instantiated type
+// but not vice versa. See also https://llvm.org/docs/TypeMetadata.html
 static std::optional<uint64_t>
-getCompatibleTypeOffset(const GlobalVariable &VTableVar,
-                        StringRef CompatibleType) {
-  SmallVector<MDNode *, 2> Types; // type metadata associated with a vtable.
+getAddressPointOffset(const GlobalVariable &VTableVar,
+                      StringRef CompatibleType) {
+  SmallVector<MDNode *> Types;
   VTableVar.getMetadata(LLVMContext::MD_type, Types);
 
   for (MDNode *Type : Types)
     if (auto *TypeId = dyn_cast<MDString>(Type->getOperand(1).get());
         TypeId && TypeId->getString() == CompatibleType)
-
       return cast<ConstantInt>(
                  cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
           ->getZExtValue();
@@ -181,7 +193,7 @@ static Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
       llvm::ConstantInt::get(Type::getInt32Ty(Context), AddressPointOffset));
 }
 
-// Returns the basic block in which `Inst` by `Use`.
+// Returns the basic block in which `Inst` is used via its `UserInst`.
 static BasicBlock *getUserBasicBlock(Use &U, Instruction *UserInst) {
   if (PHINode *PN = dyn_cast<PHINode>(UserInst))
     return PN->getIncomingBlock(U);
@@ -199,7 +211,7 @@ static bool isDestBBSuitableForSink(Instruction *Inst, BasicBlock *DestBB) {
   BasicBlock *BB = Inst->getParent();
   assert(Inst->getParent() != DestBB &&
          BB->getTerminator()->getNumSuccessors() == 2 &&
-         "Caller should guarantee");
+         "Guaranteed by ICP transformation");
   // Do not sink across a critical edge for simplicity.
   if (DestBB->getUniquePredecessor() != BB)
     return false;
@@ -225,18 +237,14 @@ static bool isDestBBSuitableForSink(Instruction *Inst, BasicBlock *DestBB) {
 
 // For the virtual call dispatch sequence, try to sink vtable load instructions
 // to the cold indirect call fallback.
+// FIXME: Move the sink eligibility check below to a utility function in
+// Transforms/Utils/ directory.
 static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
-  assert(!I->isTerminator());
   if (!isDestBBSuitableForSink(I, DestBlock))
     return false;
 
-  assert(DestBlock->getUniquePredecessor() == I->getParent());
-
-  // Do not move control-flow-involving, volatile loads, vaarg, etc.
-  // Do not sink static or dynamic alloca instructions. Static allocas must
-  // remain in the entry block, and dynamic allocas must not be sunk in between
-  // a stacksave / stackrestore pair, which would incorrectly shorten its
-  // lifetime.
+  // Do not move control-flow-involving, volatile loads, vaarg, alloca
+  // instructions, etc.
   if (isa<PHINode>(I) || I->isEHPad() || I->mayThrow() || !I->willReturn() ||
       isa<AllocaInst>(I))
     return false;
@@ -253,12 +261,16 @@ static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
   // We can only sink load instructions if there is nothing between the load and
   // the end of block that could change the value.
   if (I->mayReadFromMemory()) {
-    // We know that SrcBlock is the unique predecessor of DestBlock.
+    // We already know that SrcBlock is the unique predecessor of DestBlock.
     for (BasicBlock::iterator Scan = std::next(I->getIterator()),
                               E = I->getParent()->end();
-         Scan != E; ++Scan)
+         Scan != E; ++Scan) {
+      // Note alias analysis can tell whether two pointers can point to the
+      // same object in memory or not, and thereby find further opportunities
+      // to sink.
       if (Scan->mayWriteToMemory())
         return false;
+    }
   }
 
   BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
@@ -273,12 +285,10 @@ static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
 
 // Try to sink instructions after VPtr to the indirect call fallback.
 // Returns the number of sunk IR instructions.
-static int tryToSinkInstructions(Instruction *VPtr,
+static int tryToSinkInstructions(BasicBlock *OriginalBB,
                                  BasicBlock *IndirectCallBB) {
-  BasicBlock *OriginalBB = VPtr->getParent();
-
   int SinkCount = 0;
-  // FIXME: Find a way to bail out of the loop.
+  // Sink all eligible instructions in OriginalBB in reverse order.
   for (Instruction &I :
        llvm::make_early_inc_range(llvm::drop_begin(llvm::reverse(*OriginalBB))))
     if (tryToSinkInstruction(&I, IndirectCallBB))
@@ -314,16 +324,19 @@ class IndirectCallPromoter {
     Function *const TargetFunction;
     const uint64_t Count;
 
-    // The byte offset of TargetFunction starting from the vtable address point.
-    uint64_t FunctionOffset;
-    SmallVector<std::pair<uint64_t, uint64_t>, 2> VTableGUIDAndCounts;
-    SmallVector<Constant *, 2> AddressPoints;
+    // The following fields only exist for promotion candidates with vtable
+    // information.
+    //
+    // Due to class inheritance, one virtual call candidate can come from
+    // multiple vtables. `VTableGUIDAndCounts` tracks the vtable GUIDs and
+    // counts for 'TargetFunction'. `AddressPoints` stores the vtable address
+    // points for comparison.
+    VTableGUIDCountsMap VTableGUIDAndCounts;
+    SmallVector<Constant *> AddressPoints;
 
     PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
   };
 
-  using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t, 4>;
-
   // Check if the indirect-call call site should be promoted. Return the number
   // of promotions. Inst is the candidate indirect call, ValueDataRef
   // contains the array of value profile data for profiled targets,
@@ -356,9 +369,13 @@ class IndirectCallPromoter {
   bool isProfitableToCompareVTables(
       const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount);
 
-  // Populate `VTableGUIDCounts` vtable GUIDs and their counts and each
-  // candidate with vtable information. Returns the vtable instruction if not
-  // null.
+  // Given an indirect callsite and the list of function candidates, compute
+  // the following vtable information in output parameters and return the
+  // vtable pointer if type profiles exist.
+  // - Populate `VTableGUIDCounts` with <vtable-guid, count> from the !prof
+  // metadata attached on the vtable pointer.
+  // - For each function candidate, find out the vtables from which it gets
+  // called and store the <vtable-guid, count> there.
   Instruction *computeVTableInfos(const CallBase *CB,
                                   VTableGUIDCountsMap &VTableGUIDCounts,
                                   std::vector<PromotionCandidate> &Candidates);
@@ -490,9 +507,32 @@ Constant *IndirectCallPromoter::getOrCreateVTableAddressPointVar(
 Instruction *IndirectCallPromoter::computeVTableInfos(
     const CallBase *CB, VTableGUIDCountsMap &GUIDCountsMap,
     std::vector<PromotionCandidate> &Candidates) {
-  if (!ICPEnableVTableCmp)
+  if (!EnableVTableProfileUse)
     return nullptr;
 
+  // Take the following code sequence as an example, here is how the code works
+  //   @vtable1 = {[n x ptr] [... ptr @func1]}
+  //   @vtable2 = {[m x ptr] [... ptr @func2]}
+  //
+  //   %vptr = load ptr, ptr %d, !prof !0
+  //   %0 = tail call i1 @llvm.type.test(ptr %vptr, metadata !"vtable1")
+  //   tail call void @llvm.assume(i1 %0)
+  //   %vfn = getelementptr inbounds ptr, ptr %vptr, i64 1
+  //   %1 = load ptr, ptr %vfn
+  //   call void %1(ptr %d), !prof !1
+  //
+  //   !0 = !{!"VP", i32 2, i64 100, i64 123, i64 50, i64 456, i64 50}
+  //   !1 = !{!"VP", i32 0, i64 100, i64 789, i64 50, i64 579, i64 50}
+  //
+  // Step 1. Find out the %vptr instruction for indirect call and use its !prof
+  // to populate `GUIDCountsMap`.
+  // Step 2. For each vtable-guid, look up its definition from symtab. LTO can
+  // make vtable definitions visible across modules.
+  // Step 3. Compute the byte offset of the virtual call, by adding vtable
+  // address point offset and function's offset relative to vtable address
+  // point. For each function candidate, this step tells us the vtable from
+  // which it comes, and the vtable address point to compare %vptr with.
+
   // Only virtual calls have virtual call site info.
   auto Iter = VirtualCSInfo.find(CB);
   if (Iter == VirtualCSInfo.end())
@@ -525,7 +565,7 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
     }
 
     std::optional<uint64_t> MaybeAddressPointOffset =
-        getCompatibleTypeOffset(*VTableVar, VirtualCallInfo.CompatibleTypeStr);
+        getAddressPointOffset(*VTableVar, VirtualCallInfo.CompatibleTypeStr);
     if (!MaybeAddressPointOffset)
       continue;
 
@@ -541,8 +581,9 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
       continue;
 
     auto &Candidate = Candidates[CalleeIndexIter->second];
-    Candidate.VTableGUIDAndCounts.push_back(
-        {VTableVal, VTableValueDataArray[j].Count});
+    // There shouldn't be duplicate GUIDs in one !prof metadata, so assigning
+    // counters directly won't cause overwrite or counter loss.
+    Candidate.VTableGUIDAndCounts[VTableVal] = VTableValueDataArray[j].Count;
     Candidate.AddressPoints.push_back(
         getOrCreateVTableAddressPointVar(VTableVar, AddressPointOffset));
   }
@@ -550,23 +591,23 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
   return VPtr;
 }
 
-static MDNode *getBranchWeights(LLVMContext &Context, uint64_t IfCount,
-                                uint64_t ElseCount) {
+// Creates 'branch_weights' prof metadata using TrueWeight and FalseWeight.
+// Scales uint64_t counters down to uint32_t if necessary to prevent overflow.
+static MDNode *createBranchWeights(LLVMContext &Context, uint64_t TrueWeight,
+                                   uint64_t FalseWeight) {
   MDBuilder MDB(Context);
-  uint64_t Scale = calculateCountScale(std::max(IfCount, ElseCount));
-  return MDB.createBranchWeights(scaleBranchCount(IfCount, Scale),
-                                 scaleBranchCount(ElseCount, Scale));
+  uint64_t Scale = calculateCountScale(std::max(TrueWeight, FalseWeight));
+  return MDB.createBranchWeights(scaleBranchCount(TrueWeight, Scale),
+                                 scaleBranchCount(FalseWeight, Scale));
 }
 
 CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
                                          uint64_t Count, uint64_t TotalCount,
                                          bool AttachProfToDirectCall,
                                          OptimizationRemarkEmitter *ORE) {
-  MDNode *BranchWeights =
-      getBranchWeights(CB.getContext(), Count, TotalCount - Count);
-
-  CallBase &NewInst =
-      promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
+  CallBase &NewInst = promoteCallWithIfThenElse(
+      CB, DirectCallee,
+      createBranchWeights(CB.getContext(), Count, TotalCount - Count));
 
   if (AttachProfToDirectCall)
     setBranchWeights(NewInst, {static_cast<uint32_t>(Count)});
@@ -600,10 +641,13 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
     NumOfPGOICallPromotion++;
     NumPromoted++;
 
-    if (!ICPEnableVTableCmp || C.VTableGUIDAndCounts.empty())
+    if (!EnableVTableProfileUse || C.VTableGUIDAndCounts.empty())
       continue;
 
-    // Update VTableGUIDCounts
+    // After a virtual call candidate gets promoted, update the vtable's counts
+    // proportionally. Each vtable-guid in `C.VTableGUIDAndCounts` represents
+    // a vtable from which the virtual call is loaded. Compute the sum and use
+    // 128-bit APInt to improve accuracy.
     uint64_t SumVTableCount = 0;
     for (const auto &[GUID, VTableCount] : C.VTableGUIDAndCounts)
       SumVTableCount += VTableCount;
@@ -671,22 +715,20 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
     MutableArrayRef<InstrProfValueData> ICallProfDataRef,
     VTableGUIDCountsMap &VTableGUIDCounts) {
   SmallVector<uint64_t, 4> PromotedFuncCount;
+
   for (const auto &Candidate : Candidates) {
-    uint64_t IfCount = 0;
-    for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts) {
-      IfCount += Count;
+    for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
       VTableGUIDCounts[GUID] -= Count;
-    }
 
-    // Use indirect call counters to compute branch weights.
+    // 'OriginalBB' is the basic block of indirect call before indirect call
+    // promotion.
     BasicBlock *OriginalBB = CB.getParent();
     promoteCallWithVTableCmp(
         CB, VPtr, Candidate.TargetFunction, Candidate.AddressPoints,
-        getBranchWeights(CB.getContext(), IfCount, TotalFuncCount - IfCount));
+        createBranchWeights(CB.getContext(), Candidate.Count,
+                            TotalFuncCount - Candidate.Count));
 
-    int SinkCount = tryToSinkInstructions(
-        PromotedFuncCount.empty() ? VPtr : OriginalBB->getFirstNonPHI(),
-        CB.getParent());
+    int SinkCount = tryToSinkInstructions(OriginalBB, CB.getParent());
 
     ORE.emit([&]() {
       return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
@@ -700,9 +742,9 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
              << " instructions";
     });
 
-    PromotedFuncCount.push_back(IfCount);
+    PromotedFuncCount.push_back(Candidate.Count);
 
-    TotalFuncCount -= IfCount;
+    TotalFuncCount -= Candidate.Count;
     NumOfPGOICallPromotion++;
   }
 
@@ -711,8 +753,10 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
 
   // Update value profiles for 'CB' and 'VPtr', assuming that each 'CB' has a
   // a distinct 'VPtr'.
-  // TODO: Handle profile update properly when Clang `-fstrict-vtable-pointers`
-  // is enabled and a vtable is used to load multiple virtual functions.
+  // FIXME: When Clang `-fstrict-vtable-pointers` is enabled, a vtable might be
+  // used to load multiple virtual functions. The vtable profiles need to be
+  // updated properly in that case (e.g., annotate type profiles per indirect
+  // call).
   for (size_t I = 0; I < PromotedFuncCount.size(); I++)
     ICallProfDataRef[I].Count -=
         std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count);
@@ -770,7 +814,7 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
 // cannot sink to indirect fallback.
 bool IndirectCallPromoter::isProfitableToCompareVTables(
     const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount) {
-  if (!ICPEnableVTableCmp || Candidates.empty())
+  if (!EnableVTableProfileUse || Candidates.empty())
     return false;
   uint64_t RemainingVTableCount = TotalCount;
   for (size_t I = 0; I < Candidates.size(); I++) {
@@ -779,17 +823,16 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
     for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
       VTableSumCount += Count;
 
-    if (VTableSumCount < Candidate.Count * ICPVTableCountPercentage)
+    if (VTableSumCount < Candidate.Count * ICPVTablePercentageThreshold)
       return false;
 
     RemainingVTableCount -= Candidate.Count;
 
-    int NumAdditionalVTable = 0;
+    int MaxNumVTable = 1;
     if (I == Candidates.size() - 1)
-      NumAdditionalVTable = ICPNumAdditionalVTableLast;
+      MaxNumVTable = ICPMaxNumVTableLastCandidate;
 
-    int ActualNumAdditionalInst = Candidate.AddressPoints.size() - 1;
-    if (ActualNumAdditionalInst > NumAdditionalVTable) {
+    if ((int)Candidate.AddressPoints.size() > MaxNumVTable) {
       return false;
     }
   }
@@ -810,45 +853,6 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
     return FAM.getResult<DominatorTreeAnalysis>(F);
   };
 
-  auto compute = [&](Function *Func) {
-    if (!Func || Func->use_empty())
-      return;
-    // Iterate all type.test calls and find all indirect calls.
-    // TODO: Add llvm.public.type.test
-    for (Use &U : llvm::make_early_inc_range(Func->uses())) {
-      auto *CI = dyn_cast<CallInst>(U.getUser());
-      if (!CI)
-        continue;
-      auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
-      if (!TypeMDVal)
-        continue;
-      auto *CompatibleTypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
-      if (!CompatibleTypeId)
-        continue;
-
-      // Find out all devirtualizable call sites given a llvm.type.test
-      // intrinsic call.
-      SmallVector<DevirtCallSite, 1> DevirtCalls;
-      SmallVector<CallInst *, 1> Assumes;
-      auto &DT = LookupDomTree(*CI->getFunction());
-      findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
-
-      // type-id, offset from the address point
-      // combined with type metadata to compute function offset
-      for (auto &DevirtCall : DevirtCalls) {
-        CallBase &CB = DevirtCall.CB;
-        // Given an indirect call, try find the instruction which loads a
-        // pointer to virtual table.
-        Instruction *VTablePtr =
-            PGOIndirectCallVisitor::tryGetVTableInstruction(&CB);
-        if (!VTablePtr)
-          continue;
-        VirtualCSInfo[&CB] = {DevirtCall.Offset, VTablePtr,
-                              CompatibleTypeId->getString()};
-      }
-    }
-  };
-
   // Right now only llvm.type.test is used to find out virtual call sites.
   // With ThinLTO and whole-program-devirtualization, llvm.type.test and
   // llvm.public.type.test are emitted, and llvm.public.type.test is either
@@ -859,12 +863,39 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
   // that case.
   Function *TypeTestFunc =
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+  if (!TypeTestFunc || TypeTestFunc->use_empty())
+    return;
+  // Iterate all type.test calls and find all indirect calls.
+  for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
+    auto *CI = dyn_cast<CallInst>(U.getUser());
+    if (!CI)
+      continue;
+    auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
+    if (!TypeMDVal)
+      continue;
+    auto *CompatibleTypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+    if (!CompatibleTypeId)
+      continue;
 
-  compute(TypeTestFunc);
-
-  Function *PublicTypeTestFunc =
-      M.getFunction(Intrinsic::getName(Intrinsic::public_type_test));
-  compute(PublicTypeTestFunc);
+    // Find out all devirtualizable call sites given a llvm.type.test
+    // intrinsic call.
+    SmallVector<DevirtCallSite, 1> DevirtCalls;
+    SmallVector<CallInst *, 1> Assumes;
+    auto &DT = LookupDomTree(*CI->getFunction());
+    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
+
+    for (auto &DevirtCall : DevirtCalls) {
+      CallBase &CB = DevirtCall.CB;
+      // Given an indirect call, try to find the instruction which loads a
+      // pointer to virtual table.
+      Instruction *VTablePtr =
+          PGOIndirectCallVisitor::tryGetVTableInstruction(&CB);
+      if (!VTablePtr)
+        continue;
+      VirtualCSInfo[&CB] = {DevirtCall.Offset, VTablePtr,
+                            CompatibleTypeId->getString()};
+    }
+  }
 }
 
 // A wrapper function that does the actual work.
@@ -883,11 +914,13 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
 
   computeVirtualCallSiteTypeInfoMap(M, MAM, VirtualCSInfo);
 
-  // This map records states across functions in an LLVM IR module.
-  // IndirectCallPromoter processes one
-  // function at a time and updates this map with new entries the first time
-  // the entry is needed in the module; the subsequent functions could re-use
-  // map entries inserted when processing prior functions.
+  // VTableAddressPointOffsetVal stores the vtable address points. The vtable
+  // address point of a given <vtable, address point offset> is static (doesn't
+  // change after being computed once).
+  // IndirectCallPromoter::getOrCreateVTableAddressPointVar creates the map
+  // entry the first time a <vtable, offset> pair is seen, as
+  // promoteIndirectCalls processes an IR module and calls IndirectCallPromoter
+  // repeatedly on each function.
   VTableAddressPointOffsetValMap VTableAddressPointOffsetVal;
 
   for (auto &F : M) {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 2269c2e0fffae..26444a5f3d138 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -319,6 +319,8 @@ static cl::opt<unsigned> PGOFunctionCriticalEdgeThreshold(
     cl::desc("Do not instrument functions with the number of critical edges "
              " greater than this threshold."));
 
+extern cl::opt<unsigned> MaxNumVTableAnnotations;
+
 namespace llvm {
 // Command line option to turn on CFG dot dump after profile annotation.
 // Defined in Analysis/BlockFrequencyInfo.cpp:  -pgo-view-counts
@@ -331,6 +333,7 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
 // Command line option to enable vtable value profiling. Defined in
 // ProfileData/InstrProf.cpp: -enable-vtable-value-profiling=
 extern cl::opt<bool> EnableVTableValueProfiling;
+extern cl::opt<bool> EnableVTableProfileUse;
 extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
 } // namespace llvm
 
@@ -1726,6 +1729,14 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
   llvm_unreachable("Unknown visiting mode");
 }
 
+static uint32_t getMaxNumAnnotations(InstrProfValueKind ValueProfKind) {
+  if (ValueProfKind == IPVK_MemOPSize)
+    return MaxNumMemOPAnnotations;
+  if (ValueProfKind == llvm::IPVK_VTableTarget)
+    return MaxNumVTableAnnotations;
+  return MaxNumAnnotations;
+}
+
 // Traverse all valuesites and annotate the instructions for all value kind.
 void PGOUseFunc::annotateValueSites() {
   if (isValueProfilingDisabled())
@@ -1760,10 +1771,10 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) {
     LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
                       << "): Index = " << ValueSiteIndex << " out of "
                       << NumValueSites << "\n");
-    annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord,
-                      static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
-                      Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
-                                             : MaxNumAnnotations);
+    annotateValueSite(
+        *M, *I.AnnotatedInst, ProfileRecord,
+        static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+        getMaxNumAnnotations(static_cast<InstrProfValueKind>(Kind)));
     ValueSiteIndex++;
   }
 }
@@ -2052,6 +2063,16 @@ static bool annotateAllFunctions(
     return false;
   }
 
+  if (EnableVTableProfileUse) {
+    for (GlobalVariable &G : M.globals()) {
+      if (!G.hasName() || !G.hasMetadata(LLVMContext::MD_type))
+        continue;
+
+      // Create the PGO name metadata for the global variable.
+      createPGONameMetadata(G, getPGOName(G, false /* InLTO*/));
+    }
+  }
+
   // Add the profile summary (read from the header of the indexed summary) here
   // so that we can use it below when reading counters (which checks if the
   // function should be marked with a cold or inlinehint attribute).
@@ -2227,7 +2248,6 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
   };
 
   auto *PSI = &MAM.getResult<ProfileSummaryAnalysis>(M);
-
   if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, *FS,
                             LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
     return PreservedAnalyses::all();
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
index 96a2b2360787b..7b7f6d17d59f0 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
@@ -1,18 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 
-; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -icp-num-additional-vtable-last=2 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,VTABLE-CMP
-; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -icp-num-additional-vtable-last=0 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,FUNC-CMP
+; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -enable-vtable-profile-use -icp-max-num-vtable-last-candidate=2 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,VTABLE-CMP
+; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -enable-vtable-profile-use -icp-max-num-vtable-last-candidate=1 -S 2>&1 | FileCheck %s --check-prefixes=VTABLE-COMMON,FUNC-CMP
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
- at Base1 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0
- at Base2 = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base2_foo] }, !type !2
- at Base3 = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo] }, !type !6
+ at Base1 = constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0
+ at Base2 = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base2_foo] }, !type !2
+ at Base3 = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo] }, !type !6
 
- at Derived1 = dso_local constant { [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived1_bar] }, !type !1, !type !2, !type !3
- at Derived2 = dso_local constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo], [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived2_bar] }, !type !4, !type !5, !type !6, !type !7
- at Derived3 = dso_local constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0, !type !8
+ at Derived1 = constant { [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived1_bar] }, !type !1, !type !2, !type !3
+ at Derived2 = constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo], [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived2_bar] }, !type !4, !type !5, !type !6, !type !7
+ at Derived3 = constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0, !type !8
 
 ; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived1_bar with count 600 out of 1600, compare 1 vtables and sink 2 instructions
 ; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived2_bar with count 500 out of 1000, compare 1 vtables and sink 2 instructions
@@ -28,13 +28,13 @@ define void @test(ptr %d) {
 ; VTABLE-CMP-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Derived1, i32 40)
 ; VTABLE-CMP-NEXT:    br i1 [[TMP1]], label %[[IF_TRUE_DIRECT_TARG:.*]], label %[[IF_FALSE_ORIG_INDIRECT:.*]], !prof [[PROF10:![0-9]+]]
 ; VTABLE-CMP:       [[IF_TRUE_DIRECT_TARG]]:
-; VTABLE-CMP-NEXT:    tail call void @Derived1_bar(ptr [[D]])
+; VTABLE-CMP-NEXT:    call void @Derived1_bar(ptr [[D]])
 ; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP:.*]]
 ; VTABLE-CMP:       [[IF_FALSE_ORIG_INDIRECT]]:
 ; VTABLE-CMP-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Derived2, i32 64)
 ; VTABLE-CMP-NEXT:    br i1 [[TMP2]], label %[[IF_TRUE_DIRECT_TARG1:.*]], label %[[IF_FALSE_ORIG_INDIRECT2:.*]], !prof [[PROF11:![0-9]+]]
 ; VTABLE-CMP:       [[IF_TRUE_DIRECT_TARG1]]:
-; VTABLE-CMP-NEXT:    tail call void @Derived2_bar(ptr [[D]])
+; VTABLE-CMP-NEXT:    call void @Derived2_bar(ptr [[D]])
 ; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP3:.*]]
 ; VTABLE-CMP:       [[IF_FALSE_ORIG_INDIRECT2]]:
 ; VTABLE-CMP-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds (i8, ptr @Base1, i32 16)
@@ -42,12 +42,12 @@ define void @test(ptr %d) {
 ; VTABLE-CMP-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
 ; VTABLE-CMP-NEXT:    br i1 [[TMP5]], label %[[IF_TRUE_DIRECT_TARG4:.*]], label %[[IF_FALSE_ORIG_INDIRECT5:.*]], !prof [[PROF12:![0-9]+]]
 ; VTABLE-CMP:       [[IF_TRUE_DIRECT_TARG4]]:
-; VTABLE-CMP-NEXT:    tail call void @Base1_bar(ptr [[D]])
+; VTABLE-CMP-NEXT:    call void @Base1_bar(ptr [[D]])
 ; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP6:.*]]
 ; VTABLE-CMP:       [[IF_FALSE_ORIG_INDIRECT5]]:
 ; VTABLE-CMP-NEXT:    [[VFN:%.*]] = getelementptr inbounds ptr, ptr [[VTABLE]], i64 1
 ; VTABLE-CMP-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[VFN]], align 8
-; VTABLE-CMP-NEXT:    tail call void [[TMP6]](ptr [[D]])
+; VTABLE-CMP-NEXT:    call void [[TMP6]](ptr [[D]])
 ; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP6]]
 ; VTABLE-CMP:       [[IF_END_ICP6]]:
 ; VTABLE-CMP-NEXT:    br label %[[IF_END_ICP3]]
@@ -67,22 +67,22 @@ define void @test(ptr %d) {
 ; FUNC-CMP-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], @Derived1_bar
 ; FUNC-CMP-NEXT:    br i1 [[TMP2]], label %[[IF_TRUE_DIRECT_TARG:.*]], label %[[IF_FALSE_ORIG_INDIRECT:.*]], !prof [[PROF10:![0-9]+]]
 ; FUNC-CMP:       [[IF_TRUE_DIRECT_TARG]]:
-; FUNC-CMP-NEXT:    tail call void @Derived1_bar(ptr [[D]])
+; FUNC-CMP-NEXT:    call void @Derived1_bar(ptr [[D]])
 ; FUNC-CMP-NEXT:    br label %[[IF_END_ICP:.*]]
 ; FUNC-CMP:       [[IF_FALSE_ORIG_INDIRECT]]:
 ; FUNC-CMP-NEXT:    [[TMP3:%.*]] = icmp eq ptr [[TMP1]], @Derived2_bar
 ; FUNC-CMP-NEXT:    br i1 [[TMP3]], label %[[IF_TRUE_DIRECT_TARG1:.*]], label %[[IF_FALSE_ORIG_INDIRECT2:.*]], !prof [[PROF11:![0-9]+]]
 ; FUNC-CMP:       [[IF_TRUE_DIRECT_TARG1]]:
-; FUNC-CMP-NEXT:    tail call void @Derived2_bar(ptr [[D]])
+; FUNC-CMP-NEXT:    call void @Derived2_bar(ptr [[D]])
 ; FUNC-CMP-NEXT:    br label %[[IF_END_ICP3:.*]]
 ; FUNC-CMP:       [[IF_FALSE_ORIG_INDIRECT2]]:
 ; FUNC-CMP-NEXT:    [[TMP4:%.*]] = icmp eq ptr [[TMP1]], @Base1_bar
 ; FUNC-CMP-NEXT:    br i1 [[TMP4]], label %[[IF_TRUE_DIRECT_TARG4:.*]], label %[[IF_FALSE_ORIG_INDIRECT5:.*]], !prof [[PROF12:![0-9]+]]
 ; FUNC-CMP:       [[IF_TRUE_DIRECT_TARG4]]:
-; FUNC-CMP-NEXT:    tail call void @Base1_bar(ptr [[D]])
+; FUNC-CMP-NEXT:    call void @Base1_bar(ptr [[D]])
 ; FUNC-CMP-NEXT:    br label %[[IF_END_ICP6:.*]]
 ; FUNC-CMP:       [[IF_FALSE_ORIG_INDIRECT5]]:
-; FUNC-CMP-NEXT:    tail call void [[TMP1]](ptr [[D]])
+; FUNC-CMP-NEXT:    call void [[TMP1]](ptr [[D]])
 ; FUNC-CMP-NEXT:    br label %[[IF_END_ICP6]]
 ; FUNC-CMP:       [[IF_END_ICP6]]:
 ; FUNC-CMP-NEXT:    br label %[[IF_END_ICP3]]
@@ -97,7 +97,7 @@ entry:
   tail call void @llvm.assume(i1 %0)
   %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
   %1 = load ptr, ptr %vfn
-  tail call void %1(ptr %d), !prof !10
+  call void %1(ptr %d), !prof !10
   ret void
 }
 
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
index e82aa9f14788c..6d3a6972f6885 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_invoke.ll
@@ -1,10 +1,10 @@
-; RUN: opt < %s -passes='pgo-icall-prom' -icp-enable-vtable-cmp -S | FileCheck %s --check-prefix=VTABLE
+; RUN: opt < %s -passes='pgo-icall-prom' -enable-vtable-profile-use -S | FileCheck %s --check-prefix=VTABLE
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
- at _ZTV4Base = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base10get_ticketEv] }, !type !0, !type !1
- at _ZTV7Derived = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived10get_ticketEv] }, !type !0, !type !1, !type !2, !type !3
+ at _ZTV4Base = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base10get_ticketEv] }, !type !0, !type !1
+ at _ZTV7Derived = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived10get_ticketEv] }, !type !0, !type !1, !type !2, !type !3
 
 @.str = private constant [15 x i8] c"out of tickets\00"
 
@@ -109,12 +109,10 @@ lpad:
   resume { ptr, i32 } %0
 }
 
-declare i1 @llvm.type.test(ptr, metadata) #2
+declare i1 @llvm.type.test(ptr, metadata)
 declare void @llvm.assume(i1)
 declare i32 @__gxx_personality_v0(...)
 
-attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-
 !0 = !{i64 16, !"_ZTS4Base"}
 !1 = !{i64 16, !"_ZTSM4BaseFivE.virtual"}
 !2 = !{i64 16, !"_ZTS7Derived"}
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
index 1dc208c30952e..d9126aec3d94e 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -icp-enable-vtable-cmp -S 2>&1 | FileCheck %s --check-prefixes=VTABLE,REMARK
+; RUN: opt < %s -passes='pgo-icall-prom' -pass-remarks=pgo-icall-prom -enable-vtable-profile-use -S 2>&1 | FileCheck %s --check-prefixes=VTABLE,REMARK
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN7Derived5func1Eii with count 900 out of 1600, compare 1 vtables and sink 1 instruction
 ; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN4Base5func1Eii with count 700 out of 700, compare 1 vtables and sink 1 instructions
 
- at _ZTV7Derived = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived5func1Eii] }, align 8, !type !0, !type !1, !type !2, !type !3
- at _ZTV4Base = dso_local constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Eii] }, align 8, !type !0, !type !1
+ at _ZTV7Derived = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived5func1Eii] }, !type !0, !type !1, !type !2, !type !3
+ at _ZTV4Base = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Eii] }, !type !0, !type !1
 
 define i32 @test_tail_call(ptr %ptr, i32 %a, i32 %b) {
 ; VTABLE-LABEL: define i32 @test_tail_call(
@@ -43,6 +43,7 @@ entry:
 
 declare i1 @llvm.type.test(ptr, metadata)
 declare void @llvm.assume(i1)
+
 define i32 @_ZN7Derived5func1Eii(ptr %this, i32 %a, i32 %b) {
 entry:
   %sub = sub nsw i32 %a, %b

>From de2b9a3b728c948621fc6464e582feaa132b1faa Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Tue, 11 Jun 2024 17:37:41 -0700
Subject: [PATCH 11/16] Resolve review feedback

---
 .../Instrumentation/IndirectCallPromotion.cpp | 42 +++++++++++++------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index c805cf3f22f36..58c074493d3f2 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -720,8 +720,9 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
     for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
       VTableGUIDCounts[GUID] -= Count;
 
-    // 'OriginalBB' is the basic block of indirect call before indirect call
-    // promotion.
+    // 'OriginalBB' is the basic block of indirect call. After each candidate
+    // is promoted, a new basic block is created for the indirect fallback basic
+    // block and indirect call `CB` is moved into this new BB.
     BasicBlock *OriginalBB = CB.getParent();
     promoteCallWithVTableCmp(
         CB, VPtr, Candidate.TargetFunction, Candidate.AddressPoints,
@@ -744,7 +745,13 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
 
     PromotedFuncCount.push_back(Candidate.Count);
 
-    TotalFuncCount -= Candidate.Count;
+    assert(TotalFuncCount >= Candidate.Count &&
+           "Within one prof metadata, total count is the sum of counts from "
+           "individual <target, count> pairs");
+    // Use std::min since 'TotalFuncCount' is the saturating sum of individual
+    // counts, see
+    // https://github.com/llvm/llvm-project/blob/abedb3b8356d5d56f1c575c4f7682fba2cb19787/llvm/lib/ProfileData/InstrProf.cpp#L1281-L1288
+    TotalFuncCount -= std::min(TotalFuncCount, Candidate.Count);
     NumOfPGOICallPromotion++;
   }
 
@@ -817,7 +824,8 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
   if (!EnableVTableProfileUse || Candidates.empty())
     return false;
   uint64_t RemainingVTableCount = TotalCount;
-  for (size_t I = 0; I < Candidates.size(); I++) {
+  const size_t CandidateSize = Candidates.size();
+  for (size_t I = 0; I < CandidateSize; I++) {
     auto &Candidate = Candidates[I];
     uint64_t VTableSumCount = 0;
     for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
@@ -828,8 +836,11 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
 
     RemainingVTableCount -= Candidate.Count;
 
+    // Allowing more than one vtables for non last candidates may or may not
+    // elongates dependency chain for the subsequent candidates, so do this for
+    // the last candidate conservatively.
     int MaxNumVTable = 1;
-    if (I == Candidates.size() - 1)
+    if (I == CandidateSize - 1)
       MaxNumVTable = ICPMaxNumVTableLastCandidate;
 
     if ((int)Candidate.AddressPoints.size() > MaxNumVTable) {
@@ -845,14 +856,15 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
   return true;
 }
 
+// For virtual calls in the module, collect per-callsite information which will
+// be used to associate an ICP candidate with a vtable and a specific function
+// in the vtable. With type intrinsics (llvm.type.test), we can find virtual
+// calls in a compile-time efficient manner (by iterating its users) and more
+// importantly use the compatible type later to figure out the function byte
+// offset relative to the start of vtables.
 static void
 computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
                                   VirtualCallSiteTypeInfoMap &VirtualCSInfo) {
-  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
-    return FAM.getResult<DominatorTreeAnalysis>(F);
-  };
-
   // Right now only llvm.type.test is used to find out virtual call sites.
   // With ThinLTO and whole-program-devirtualization, llvm.type.test and
   // llvm.public.type.test are emitted, and llvm.public.type.test is either
@@ -865,7 +877,12 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM,
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
   if (!TypeTestFunc || TypeTestFunc->use_empty())
     return;
-  // Iterate all type.test calls and find all indirect calls.
+
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
+    return FAM.getResult<DominatorTreeAnalysis>(F);
+  };
+  // Iterate all type.test calls to find all indirect calls.
   for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
     auto *CI = dyn_cast<CallInst>(U.getUser());
     if (!CI)
@@ -912,7 +929,8 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, bool InLTO,
   bool Changed = false;
   VirtualCallSiteTypeInfoMap VirtualCSInfo;
 
-  computeVirtualCallSiteTypeInfoMap(M, MAM, VirtualCSInfo);
+  if (EnableVTableProfileUse)
+    computeVirtualCallSiteTypeInfoMap(M, MAM, VirtualCSInfo);
 
   // VTableAddressPointOffsetVal stores the vtable address points. The vtable
   // address point of a given <vtable, address point offset> is static (doesn't

>From b607ac36cac2d2cd276df8ec5a9879ae5ef426fe Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Wed, 12 Jun 2024 23:22:26 -0700
Subject: [PATCH 12/16] Changes: 1. Resolve review comments 2. Use
 unordered_map rather than SmallDenseMap for a couple of maps.    -
 unordered_map calls allocator for each element, and (Small)DenseMap     
 allocate elements in batch. But DenseMap size grows aggressively      under
 size 64 [1] so not memory efficient. 3. Use stable_sort when sorting <target,
 count> pairs by count. 4. Only update VPtr value profiles if
 'EnableVTableProfileUse' is true and 'VPtr' has profiles.

[1] DenseMap https://github.com/llvm/llvm-project/blob/092dbfaad257885692fa64559e9eb43a5c466798/llvm/include/llvm/ADT/DenseMap.h#L849
    SmallDenseMap https://github.com/llvm/llvm-project/blob/092dbfaad257885692fa64559e9eb43a5c466798/llvm/include/llvm/ADT/DenseMap.h#L1088
---
 .../llvm/Analysis/IndirectCallVisitor.h       |  1 -
 llvm/lib/ProfileData/InstrProf.cpp            |  2 +-
 .../Instrumentation/IndirectCallPromotion.cpp | 67 +++++++++++--------
 llvm/lib/Transforms/Utils/InlineFunction.cpp  | 32 ++++-----
 4 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IndirectCallVisitor.h b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
index f070e83c41689..6c424038070dc 100644
--- a/llvm/include/llvm/Analysis/IndirectCallVisitor.h
+++ b/llvm/include/llvm/Analysis/IndirectCallVisitor.h
@@ -41,7 +41,6 @@ struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
       return nullptr;
 
     LoadInst *LI = dyn_cast<LoadInst>(CB->getCalledOperand());
-
     if (LI != nullptr) {
       Value *FuncPtr = LI->getPointerOperand(); // GEP (or bitcast)
       Value *VTablePtr = FuncPtr->stripInBoundsConstantOffsets();
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 4649db2d92ec5..e5064297dad4c 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1404,7 +1404,7 @@ static void createPGONameMetadata(GlobalObject &GO, StringRef MetadataName,
   if (GO.getName() == PGOName)
     return;
 
-  // Don't created duplictaed metadata.
+  // Don't create duplicated metadata.
   if (GO.getMetadata(MetadataName))
     return;
 
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 58c074493d3f2..f99c4e3e967a4 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -46,6 +46,7 @@
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -138,7 +139,8 @@ namespace {
 // In the inner map, the key represents address point offsets and the value is a
 // constant for this address point.
 using VTableAddressPointOffsetValMap =
-    SmallDenseMap<const GlobalVariable *, SmallDenseMap<int, Constant *>>;
+    std::unordered_map<const GlobalVariable *,
+                       std::unordered_map<int, Constant *>>;
 
 // A struct to collect type information for a virtual call site.
 struct VirtualCallSiteInfo {
@@ -152,10 +154,10 @@ struct VirtualCallSiteInfo {
 
 // The key is a virtual call, and value is its type information.
 using VirtualCallSiteTypeInfoMap =
-    SmallDenseMap<const CallBase *, VirtualCallSiteInfo>;
+    std::unordered_map<const CallBase *, VirtualCallSiteInfo>;
 
 // The key is vtable GUID, and value is its value profile count.
-using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t>;
+using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t, 16>;
 
 // Returns the address point offset of the given compatible type.
 //
@@ -365,15 +367,16 @@ class IndirectCallPromoter {
       MutableArrayRef<InstrProfValueData> ICallProfDataRef,
       VTableGUIDCountsMap &VTableGUIDCounts);
 
-  // Returns true if it's profitable to compare vtables.
+  // Returns true if it's profitable to compare vtables for the callsite.
   bool isProfitableToCompareVTables(
-      const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount);
+      const CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+      uint64_t TotalCount);
 
   // Given an indirect callsite and the list of function candidates, compute
   // the following vtable information in output parameters and returns vtable
   // pointer if type profiles exist.
-  // - Populate `VTableGUIDCounts` with <vtable-guid, count> with !prof metadata
-  // attached on the vtable pointer.
+  // - Populate `VTableGUIDCounts` with <vtable-guid, count> using !prof
+  // metadata attached on the vtable pointer.
   // - For each function candidate, finds out the vtables from which it get
   // called and stores the <vtable-guid, count> there.
   Instruction *computeVTableInfos(const CallBase *CB,
@@ -559,7 +562,7 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
     GUIDCountsMap[VTableVal] = VTableValueDataArray[j].Count;
     GlobalVariable *VTableVar = Symtab->getGlobalVariable(VTableVal);
     if (!VTableVar) {
-      LLVM_DEBUG(dbgs() << "\tCannot find vtable definition for " << VTableVal
+      LLVM_DEBUG(dbgs() << "Cannot find vtable definition for " << VTableVal
                         << "; maybe the vtable isn't imported\n");
       continue;
     }
@@ -666,13 +669,9 @@ bool IndirectCallPromoter::tryToPromoteWithFuncCmp(
          "of values in profile metadata");
 
   // Update value profiles on the indirect call.
-  // TODO: Handle profile update properly when Clang `-fstrict-vtable-pointers`
-  // is enabled and a vtable is used to load multiple virtual functions.
   updateFuncValueProfiles(CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
                           NumCandidates);
-  // Update value profiles on the vtable pointer if it exists.
-  if (VPtr)
-    updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
+  updateVPtrValueProfiles(VPtr, VTableGUIDCounts);
   return true;
 }
 
@@ -689,6 +688,9 @@ void IndirectCallPromoter::updateFuncValueProfiles(
 
 void IndirectCallPromoter::updateVPtrValueProfiles(
     Instruction *VPtr, VTableGUIDCountsMap &VTableGUIDCounts) {
+  if (!EnableVTableProfileUse || VPtr == nullptr ||
+      !VPtr->getMetadata(LLVMContext::MD_prof))
+    return;
   VPtr->setMetadata(LLVMContext::MD_prof, nullptr);
   std::vector<InstrProfValueData> VTableValueProfiles;
   uint64_t TotalVTableCount = 0;
@@ -762,17 +764,17 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
   // a distinct 'VPtr'.
   // FIXME: When Clang `-fstrict-vtable-pointers` is enabled, a vtable might be
   // used to load multiple virtual functions. The vtable profiles needs to be
-  // updated properly in that case (e.g, annotate type profiles per indirect
-  // call).
+  // updated properly in that case (e.g, for each indirect call annotate both
+  // type profiles and function profiles in one !prof).
   for (size_t I = 0; I < PromotedFuncCount.size(); I++)
     ICallProfDataRef[I].Count -=
         std::max(PromotedFuncCount[I], ICallProfDataRef[I].Count);
   // Sort value profiles by count in descending order.
-  llvm::sort(ICallProfDataRef.begin(), ICallProfDataRef.end(),
-             [](const InstrProfValueData &LHS, const InstrProfValueData &RHS) {
-               return LHS.Count > RHS.Count;
-             });
-  // Drop the <target-value, count> pair if count is not greater than zero.
+  llvm::stable_sort(ICallProfDataRef, [](const InstrProfValueData &LHS,
+                                         const InstrProfValueData &RHS) {
+    return LHS.Count > RHS.Count;
+  });
+  // Drop the <target-value, count> pair if count is zero.
   ArrayRef<InstrProfValueData> VDs(
       ICallProfDataRef.begin(),
       llvm::upper_bound(ICallProfDataRef, 0U,
@@ -805,7 +807,7 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
     Instruction *VPtr =
         computeVTableInfos(CB, VTableGUIDCounts, PromotionCandidates);
 
-    if (isProfitableToCompareVTables(PromotionCandidates, TotalCount))
+    if (isProfitableToCompareVTables(*CB, PromotionCandidates, TotalCount))
       Changed |= tryToPromoteWithVTableCmp(*CB, VPtr, PromotionCandidates,
                                            TotalCount, NumCandidates,
                                            ICallProfDataRef, VTableGUIDCounts);
@@ -820,25 +822,34 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
 // TODO: Returns false if the function addressing and vtable load instructions
 // cannot sink to indirect fallback.
 bool IndirectCallPromoter::isProfitableToCompareVTables(
-    const std::vector<PromotionCandidate> &Candidates, uint64_t TotalCount) {
+    const CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+    uint64_t TotalCount) {
   if (!EnableVTableProfileUse || Candidates.empty())
     return false;
   uint64_t RemainingVTableCount = TotalCount;
   const size_t CandidateSize = Candidates.size();
   for (size_t I = 0; I < CandidateSize; I++) {
     auto &Candidate = Candidates[I];
-    uint64_t VTableSumCount = 0;
+    uint64_t CandidateVTableCount = 0;
     for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
-      VTableSumCount += Count;
+      CandidateVTableCount += Count;
 
-    if (VTableSumCount < Candidate.Count * ICPVTablePercentageThreshold)
+    if (CandidateVTableCount < Candidate.Count * ICPVTablePercentageThreshold) {
+      LLVM_DEBUG(dbgs() << "For callsite #" << NumOfPGOICallsites << CB << I
+                        << "-th candidate, function count " << Candidate.Count
+                        << " and its vtable count " << CandidateVTableCount
+                        << " have discrepancies\n");
       return false;
+    }
 
     RemainingVTableCount -= Candidate.Count;
 
-    // Allowing more than one vtables for non last candidates may or may not
-    // elongates dependency chain for the subsequent candidates, so do this for
-    // the last candidate conservatively.
+    // 'MaxNumVTable' limits the number of vtables to make vtable comparison
+    // profitable. Comparing multiple vtables for one function candidate will
+    // insert additional instructions on the hot path, and allowing more than
+    // one vtable for non-last candidates may or may not elongate the
+    // dependency chain for the subsequent candidates. Set its value to 1 for
+    // non-last candidates and allow an option to override it for the last one.
     int MaxNumVTable = 1;
     if (I == CandidateSize - 1)
       MaxNumVTable = ICPMaxNumVTableLastCandidate;
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 308a07ddf8d2e..bb5bc76869ccf 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1964,6 +1964,13 @@ void llvm::updateProfileCallee(
           ? 0
           : PriorEntryCount + EntryDelta;
 
+  auto updateVTableProfWeight = [](CallBase *CB, const uint64_t NewEntryCount,
+                                   const uint64_t PriorEntryCount) {
+    Instruction *VPtr = PGOIndirectCallVisitor::tryGetVTableInstruction(CB);
+    if (VPtr)
+      scaleProfData(*VPtr, NewEntryCount, PriorEntryCount);
+  };
+
   // During inlining ?
   if (VMap) {
     uint64_t CloneEntryCount = PriorEntryCount - NewEntryCount;
@@ -1971,20 +1978,13 @@ void llvm::updateProfileCallee(
       if (isa<CallInst>(Entry.first))
         if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second)) {
           CI->updateProfWeight(CloneEntryCount, PriorEntryCount);
-
-          Instruction *VPtr =
-              PGOIndirectCallVisitor::tryGetVTableInstruction(CI);
-          if (VPtr)
-            scaleProfData(*VPtr, CloneEntryCount, PriorEntryCount);
+          updateVTableProfWeight(CI, CloneEntryCount, PriorEntryCount);
         }
+
       if (isa<InvokeInst>(Entry.first))
         if (auto *II = dyn_cast_or_null<InvokeInst>(Entry.second)) {
           II->updateProfWeight(CloneEntryCount, PriorEntryCount);
-
-          Instruction *VPtr =
-              PGOIndirectCallVisitor::tryGetVTableInstruction(II);
-          if (VPtr)
-            scaleProfData(*VPtr, CloneEntryCount, PriorEntryCount);
+          updateVTableProfWeight(II, CloneEntryCount, PriorEntryCount);
         }
     }
   }
@@ -1998,19 +1998,11 @@ void llvm::updateProfileCallee(
         for (Instruction &I : BB) {
           if (CallInst *CI = dyn_cast<CallInst>(&I)) {
             CI->updateProfWeight(NewEntryCount, PriorEntryCount);
-
-            Instruction *VPtr =
-                PGOIndirectCallVisitor::tryGetVTableInstruction(CI);
-            if (VPtr)
-              scaleProfData(*VPtr, NewEntryCount, PriorEntryCount);
+            updateVTableProfWeight(CI, NewEntryCount, PriorEntryCount);
           }
           if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
             II->updateProfWeight(NewEntryCount, PriorEntryCount);
-
-            Instruction *VPtr =
-                PGOIndirectCallVisitor::tryGetVTableInstruction(II);
-            if (VPtr)
-              scaleProfData(*VPtr, NewEntryCount, PriorEntryCount);
+            updateVTableProfWeight(II, NewEntryCount, PriorEntryCount);
           }
         }
   }

>From 360e5e682921ff418b85885fe5e13eb2d57f0b17 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Thu, 13 Jun 2024 09:56:15 -0700
Subject: [PATCH 13/16] Changes 1. remove unused headers 2. use SmallDenseMap
 for outer map and unordered_map for inner map (the latter is more
 memory-efficient)

---
 .../Transforms/Instrumentation/IndirectCallPromotion.cpp  | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index f99c4e3e967a4..757543ff7c439 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -21,7 +21,6 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -41,12 +40,10 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <memory>
 #include <string>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -139,8 +136,7 @@ namespace {
 // In the inner map, the key represents address point offsets and the value is a
 // constant for this address point.
 using VTableAddressPointOffsetValMap =
-    std::unordered_map<const GlobalVariable *,
-                       std::unordered_map<int, Constant *>>;
+    SmallDenseMap<const GlobalVariable *, std::unordered_map<int, Constant *>>;
 
 // A struct to collect type information for a virtual call site.
 struct VirtualCallSiteInfo {
@@ -154,7 +150,7 @@ struct VirtualCallSiteInfo {
 
 // The key is a virtual call, and value is its type information.
 using VirtualCallSiteTypeInfoMap =
-    std::unordered_map<const CallBase *, VirtualCallSiteInfo>;
+    SmallDenseMap<const CallBase *, VirtualCallSiteInfo>;
 
 // The key is vtable GUID, and value is its value profile count.
 using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t, 16>;

>From 53be1a63733a41c9aeb51db383d715909df344d3 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Sun, 23 Jun 2024 21:55:44 -0700
Subject: [PATCH 14/16] update after 'git merge main'

---
 llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 757543ff7c439..2b0d6f14e8585 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -609,7 +609,8 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
       createBranchWeights(CB.getContext(), Count, TotalCount - Count));
 
   if (AttachProfToDirectCall)
-    setBranchWeights(NewInst, {static_cast<uint32_t>(Count)});
+    setBranchWeights(NewInst, {static_cast<uint32_t>(Count)},
+                     /*IsExpected=*/false);
 
   using namespace ore;
 

>From 7cd8630980fd42e6d9594487c5d71d6db1f4fad6 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Wed, 26 Jun 2024 15:10:21 -0700
Subject: [PATCH 15/16] resolve review feedback

---
 llvm/lib/ProfileData/InstrProf.cpp            |  5 +-
 .../Instrumentation/IndirectCallPromotion.cpp | 92 +++++++++++++------
 .../Transforms/PGOProfile/icp_vtable_cmp.ll   |  6 +-
 .../PGOProfile/icp_vtable_tail_call.ll        |  4 +-
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  7 --
 5 files changed, 71 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index fe6018f170cb5..9dbaa2ca0f020 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1432,11 +1432,12 @@ MDNode *getPGOFuncNameMetadata(const Function &F) {
 
 static void createPGONameMetadata(GlobalObject &GO, StringRef MetadataName,
                                   StringRef PGOName) {
-  // For internal linkage objects, its name is not the same as its PGO name.
+  // Only for internal linkage functions or global variables. The name is not
+  // the same as PGO name for these global objects.
   if (GO.getName() == PGOName)
     return;
 
-  // Don't created duplicated metadata.
+  // Don't create duplicated metadata.
   if (GO.getMetadata(MetadataName))
     return;
 
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 529db260d7eb9..86f817f949cdb 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -122,10 +122,10 @@ static cl::opt<float> ICPVTablePercentageThreshold(
 // Although comparing vtables can save a vtable load, we may need to compare
 // vtable pointer with multiple vtable address points due to class inheritance.
 // Comparing with multiple vtables inserts additional instructions on hot code
-// path; and doing so for earlier candidate of one icall can affect later
-// function candidate in an undesired way. We allow multiple vtable comparison
-// for the last function candidate and use the option below to cap the number
-// of vtables.
+// path, and doing so for an earlier candidate delays the comparisons for later
+// candidates. For the last candidate, only the fallback path is affected.
+// We allow multiple vtable comparison for the last function candidate and use
+// the option below to cap the number of vtables.
 static cl::opt<int> ICPMaxNumVTableLastCandidate(
     "icp-max-num-vtable-last-candidate", cl::init(1), cl::Hidden,
     cl::desc("The maximum number of vtable for the last candidate."));
@@ -157,8 +157,8 @@ using VTableGUIDCountsMap = SmallDenseMap<uint64_t, uint64_t, 16>;
 
 // Returns the address point offset of the given compatible type.
 //
-// Type metadata of a vtable specifies the types that can container a pointer to
-// this vtable, for example, `Base*` can be a pointer to an instantiated type
+// Type metadata of a vtable specifies the types that can contain a pointer to
+// this vtable, for example, `Base*` can be a pointer to a derived type
 // but not vice versa. See also https://llvm.org/docs/TypeMetadata.html
 static std::optional<uint64_t>
 getAddressPointOffset(const GlobalVariable &VTableVar,
@@ -191,7 +191,7 @@ static Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
       llvm::ConstantInt::get(Type::getInt32Ty(Context), AddressPointOffset));
 }
 
-// Returns the basic block in which `Inst` is used via its `UserInst`.
+// Returns the basic block in which Use `U` is used via its `UserInst`.
 static BasicBlock *getUserBasicBlock(Use &U, Instruction *UserInst) {
   if (PHINode *PN = dyn_cast<PHINode>(UserInst))
     return PN->getIncomingBlock(U);
@@ -209,10 +209,8 @@ static bool isDestBBSuitableForSink(Instruction *Inst, BasicBlock *DestBB) {
   BasicBlock *BB = Inst->getParent();
   assert(Inst->getParent() != DestBB &&
          BB->getTerminator()->getNumSuccessors() == 2 &&
+         DestBB->getUniquePredecessor() == BB &&
          "Guaranteed by ICP transformation");
-  // Do not sink across a critical edge for simplicity.
-  if (DestBB->getUniquePredecessor() != BB)
-    return false;
 
   // Now we know BB dominates DestBB.
   BasicBlock *UserBB = nullptr;
@@ -286,6 +284,9 @@ static bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
 static int tryToSinkInstructions(BasicBlock *OriginalBB,
                                  BasicBlock *IndirectCallBB) {
   int SinkCount = 0;
+  // Do not sink across a critical edge for simplicity.
+  if (IndirectCallBB->getUniquePredecessor() != OriginalBB)
+    return SinkCount;
   // Sink all eligible instructions in OriginalBB in reverse order.
   for (Instruction &I :
        llvm::make_early_inc_range(llvm::drop_begin(llvm::reverse(*OriginalBB))))
@@ -373,8 +374,8 @@ class IndirectCallPromoter {
   // pointer if type profiles exist.
   // - Populate `VTableGUIDCounts` with <vtable-guid, count> using !prof
   // metadata attached on the vtable pointer.
-  // - For each function candidate, finds out the vtables from which it get
-  // called and stores the <vtable-guid, count> there.
+  // - For each function candidate, finds out the vtables from which it gets
+  // called and stores the <vtable-guid, count> in promotion candidate.
   Instruction *computeVTableInfos(const CallBase *CB,
                                   VTableGUIDCountsMap &VTableGUIDCounts,
                                   std::vector<PromotionCandidate> &Candidates);
@@ -537,6 +538,9 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
   if (Iter == VirtualCSInfo.end())
     return nullptr;
 
+  LLVM_DEBUG(dbgs() << "\nComputing vtable infos for callsite #"
+                    << NumOfPGOICallsites << "\n");
+
   const auto &VirtualCallInfo = Iter->second;
   Instruction *VPtr = VirtualCallInfo.VPtr;
 
@@ -558,7 +562,7 @@ Instruction *IndirectCallPromoter::computeVTableInfos(
     GUIDCountsMap[VTableVal] = VTableValueDataArray[j].Count;
     GlobalVariable *VTableVar = Symtab->getGlobalVariable(VTableVal);
     if (!VTableVar) {
-      LLVM_DEBUG(dbgs() << "Cannot find vtable definition for " << VTableVal
+      LLVM_DEBUG(dbgs() << "  Cannot find vtable definition for " << VTableVal
                         << "; maybe the vtable isn't imported\n");
       continue;
     }
@@ -731,15 +735,28 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
     int SinkCount = tryToSinkInstructions(OriginalBB, CB.getParent());
 
     ORE.emit([&]() {
-      return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
-             << "Promote indirect call to "
+      OptimizationRemark Remark(DEBUG_TYPE, "Promoted", &CB);
+
+      const auto &VTableGUIDAndCounts = Candidate.VTableGUIDAndCounts;
+      Remark << "Promote indirect call to "
              << ore::NV("DirectCallee", Candidate.TargetFunction)
              << " with count " << ore::NV("Count", Candidate.Count)
-             << " out of " << ore::NV("TotalCount", TotalFuncCount)
-             << ", compare "
-             << ore::NV("VTable", Candidate.VTableGUIDAndCounts.size())
-             << " vtables and sink " << ore::NV("SinkCount", SinkCount)
-             << " instructions";
+             << " out of " << ore::NV("TotalCount", TotalFuncCount) << ", sink "
+             << ore::NV("SinkCount", SinkCount)
+             << " instruction(s) and compare "
+             << ore::NV("VTable", VTableGUIDAndCounts.size())
+             << " vtable(s): {";
+
+      for (auto Iter = VTableGUIDAndCounts.begin();
+           Iter != VTableGUIDAndCounts.end(); Iter++) {
+        if (Iter != VTableGUIDAndCounts.begin())
+          Remark << ", ";
+        Remark << ore::NV("VTable", Symtab->getGlobalVariable(Iter->first));
+      }
+
+      Remark << "}";
+
+      return Remark;
     });
 
     PromotedFuncCount.push_back(Candidate.Count);
@@ -747,7 +764,7 @@ bool IndirectCallPromoter::tryToPromoteWithVTableCmp(
     assert(TotalFuncCount >= Candidate.Count &&
            "Within one prof metadata, total count is the sum of counts from "
            "individual <target, count> pairs");
-    // Use std::min since 'TotalFuncCount' is the saturating sum of individual
+    // Use std::min since 'TotalFuncCount' is the saturated sum of individual
     // counts, see
     // https://github.com/llvm/llvm-project/blob/abedb3b8356d5d56f1c575c4f7682fba2cb19787/llvm/lib/ProfileData/InstrProf.cpp#L1281-L1288
     TotalFuncCount -= std::min(TotalFuncCount, Candidate.Count);
@@ -816,26 +833,37 @@ bool IndirectCallPromoter::processFunction(ProfileSummaryInfo *PSI) {
   return Changed;
 }
 
-// TODO: Returns false if the function addressing and vtable load instructions
+// TODO: Return false if the function addressing and vtable load instructions
 // cannot sink to indirect fallback.
 bool IndirectCallPromoter::isProfitableToCompareVTables(
     const CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
     uint64_t TotalCount) {
   if (!EnableVTableProfileUse || Candidates.empty())
     return false;
+  LLVM_DEBUG(dbgs() << "\nEvaluating vtable profitability for callsite #"
+                    << NumOfPGOICallsites << CB << "\n");
   uint64_t RemainingVTableCount = TotalCount;
   const size_t CandidateSize = Candidates.size();
   for (size_t I = 0; I < CandidateSize; I++) {
     auto &Candidate = Candidates[I];
+    auto &VTableGUIDAndCounts = Candidate.VTableGUIDAndCounts;
+
+    LLVM_DEBUG(dbgs() << "  Candidate " << I << " FunctionCount: "
+                      << Candidate.Count << ", VTableCounts:");
+    for (auto &[GUID, Count] : VTableGUIDAndCounts)
+      LLVM_DEBUG(dbgs() << " {" << Symtab->getGlobalVariable(GUID)->getName()
+                        << ", " << Count << "}");
+    LLVM_DEBUG(dbgs() << "\n");
+
     uint64_t CandidateVTableCount = 0;
-    for (auto &[GUID, Count] : Candidate.VTableGUIDAndCounts)
+    for (auto &[GUID, Count] : VTableGUIDAndCounts)
       CandidateVTableCount += Count;
 
     if (CandidateVTableCount < Candidate.Count * ICPVTablePercentageThreshold) {
-      LLVM_DEBUG(dbgs() << "For callsite #" << NumOfPGOICallsites << CB << I
-                        << "-th candidate, function count " << Candidate.Count
-                        << " and its vtable count " << CandidateVTableCount
-                        << " have discrepancies\n");
+      LLVM_DEBUG(
+          dbgs() << "    function count " << Candidate.Count
+                 << " and its vtable sum count " << CandidateVTableCount
+                 << " have discrepancies. Bail out vtable comparison.\n");
       return false;
     }
 
@@ -844,7 +872,7 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
     // 'MaxNumVTable' limits the number of vtables to make vtable comparison
     // profitable. Comparing multiple vtables for one function candidate will
     // insert additional instructions on the hot path, and allowing more than
-    // one vtable for non last candidates may or may not elongates dependency
+    // one vtable for non last candidates may or may not elongate the dependency
     // chain for the subsequent candidates. Set its value to 1 for non-last
     // candidate and allow option to override it for the last candidate.
     int MaxNumVTable = 1;
@@ -852,14 +880,20 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
       MaxNumVTable = ICPMaxNumVTableLastCandidate;
 
     if ((int)Candidate.AddressPoints.size() > MaxNumVTable) {
+      LLVM_DEBUG(dbgs() << "    allow at most " << MaxNumVTable << " and got "
+                        << Candidate.AddressPoints.size()
+                        << " vtables. Bail out for vtable comparison.\n");
       return false;
     }
   }
 
   // If the indirect fallback is not cold, don't compare vtables.
   if (PSI && PSI->hasProfileSummary() &&
-      !PSI->isColdCount(RemainingVTableCount))
+      !PSI->isColdCount(RemainingVTableCount)) {
+    LLVM_DEBUG(dbgs() << "    Indirect fallback basic block is not cold. Bail "
+                         "out for vtable comparison.\n");
     return false;
+  }
 
   return true;
 }
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
index 7b7f6d17d59f0..a3db6eaa62163 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_cmp.ll
@@ -14,9 +14,9 @@ target triple = "x86_64-unknown-linux-gnu"
 @Derived2 = constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @Base3_foo], [3 x ptr] [ptr null, ptr null, ptr @Base2_foo], [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Derived2_bar] }, !type !4, !type !5, !type !6, !type !7
 @Derived3 = constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @Base1_foo, ptr @Base1_bar] }, !type !0, !type !8
 
-; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived1_bar with count 600 out of 1600, compare 1 vtables and sink 2 instructions
-; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived2_bar with count 500 out of 1000, compare 1 vtables and sink 2 instructions
-; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Base1_bar with count 400 out of 500, compare 2 vtables and sink 2 instructions
+; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived1_bar with count 600 out of 1600, sink 2 instruction(s) and compare 1 vtable(s): {Derived1}
+; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Derived2_bar with count 500 out of 1000, sink 2 instruction(s) and compare 1 vtable(s): {Derived2}
+; VTABLE-CMP: remark: <unknown>:0:0: Promote indirect call to Base1_bar with count 400 out of 500, sink 2 instruction(s) and compare 2 vtable(s): {Base1, Derived3}
 
 define void @test(ptr %d) {
 ; VTABLE-CMP-LABEL: define void @test(
diff --git a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
index d9126aec3d94e..fb9ec0d0c85ff 100644
--- a/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
+++ b/llvm/test/Transforms/PGOProfile/icp_vtable_tail_call.ll
@@ -3,8 +3,8 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN7Derived5func1Eii with count 900 out of 1600, compare 1 vtables and sink 1 instruction
-; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN4Base5func1Eii with count 700 out of 700, compare 1 vtables and sink 1 instructions
+; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN7Derived5func1Eii with count 900 out of 1600, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV7Derived}
+; REMARK: remark: <unknown>:0:0: Promote indirect call to _ZN4Base5func1Eii with count 700 out of 700, sink 1 instruction(s) and compare 1 vtable(s): {_ZTV4Base}
 
 @_ZTV7Derived = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN7Derived5func1Eii] }, !type !0, !type !1, !type !2, !type !3
 @_ZTV4Base = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN4Base5func1Eii] }, !type !0, !type !1
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 6c8ab14e7c245..0c53d91799aa4 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -767,13 +767,6 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper,
     });
   }
 
-  const InstrProfSymtab &symtab = Reader->getSymtab();
-  const auto &VTableNames = symtab.getVTableNames();
-
-  for (const auto &kv : VTableNames) {
-    WC->Writer.addVTableName(kv.getKey());
-  }
-
   if (Reader->hasTemporalProfile()) {
     auto &Traces = Reader->getTemporalProfTraces(Input.Weight);
     if (!Traces.empty())

>From 534332027662c22bdaf0d8dbc26c0b2e53f4b153 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Fri, 28 Jun 2024 09:05:20 -0700
Subject: [PATCH 16/16] fix unused variable

---
 llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 86f817f949cdb..9de27c2291fb0 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -206,7 +206,6 @@ static BasicBlock *getUserBasicBlock(Use &U, Instruction *UserInst) {
 //    critical edge.
 // 2) `Inst` have users and all users are in `DestBB`.
 static bool isDestBBSuitableForSink(Instruction *Inst, BasicBlock *DestBB) {
-  BasicBlock *BB = Inst->getParent();
   assert(Inst->getParent() != DestBB &&
          BB->getTerminator()->getNumSuccessors() == 2 &&
          DestBB->getUniquePredecessor() == BB &&
@@ -850,7 +849,8 @@ bool IndirectCallPromoter::isProfitableToCompareVTables(
 
     LLVM_DEBUG(dbgs() << "  Candidate " << I << " FunctionCount: "
                       << Candidate.Count << ", VTableCounts:");
-    for (auto &[GUID, Count] : VTableGUIDAndCounts)
+    // Add [[maybe_unused]] since <GUID, Count> are only used by LLVM_DEBUG.
+    for ([[maybe_unused]] auto &[GUID, Count] : VTableGUIDAndCounts)
       LLVM_DEBUG(dbgs() << " {" << Symtab->getGlobalVariable(GUID)->getName()
                         << ", " << Count << "}");
     LLVM_DEBUG(dbgs() << "\n");



More information about the llvm-commits mailing list