[clang] 7ed8124 - [HeapProf] Clang and LLVM support for heap profiling instrumentation

Teresa Johnson via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 27 08:51:08 PDT 2020


Author: Teresa Johnson
Date: 2020-08-27T08:50:35-07:00
New Revision: 7ed8124d46f94601d5f1364becee9cee8538265e

URL: https://github.com/llvm/llvm-project/commit/7ed8124d46f94601d5f1364becee9cee8538265e
DIFF: https://github.com/llvm/llvm-project/commit/7ed8124d46f94601d5f1364becee9cee8538265e.diff

LOG: [HeapProf] Clang and LLVM support for heap profiling instrumentation

See RFC for background:
http://lists.llvm.org/pipermail/llvm-dev/2020-June/142744.html

Note that the runtime changes will be sent separately (hopefully this
week; some tests still need to be added).

This patch includes the LLVM pass to instrument memory accesses, either
with inline sequences that increment the access count in the shadow
location or with calls into the runtime. It also changes calls to
memset/memcpy/memmove to the equivalent runtime versions.
The pass is modeled on the address sanitizer pass.
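
For intuition, the emitted inline sequence is equivalent to the following
rough C++ sketch (not the pass's actual IR-building code); the 64-byte
granularity, scale of 3, and the __heapprof_shadow_memory_dynamic_address
symbol are the defaults introduced by this patch:

  #include <cstdint>

  // Provided by the heap profiling runtime and initialized at startup.
  extern uint64_t __heapprof_shadow_memory_dynamic_address;

  // Increment the 8-byte shadow counter covering Addr:
  //   shadow = ((Addr & ~(granularity - 1)) >> scale) + dynamic offset
  static inline void RecordAccess(uint64_t Addr) {
    uint64_t Shadow = ((Addr & ~63ULL) >> 3) +
                      __heapprof_shadow_memory_dynamic_address;
    ++*reinterpret_cast<uint64_t *>(Shadow);
  }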

The clang changes add the driver option to invoke the new pass and to
link with the upcoming heap profiling runtime libraries.

Currently there is no attempt to optimize the instrumentation, e.g. to
aggregate updates to the same memory allocation. That will be
implemented as follow-on work.

Differential Revision: https://reviews.llvm.org/D85948

Added: 
    clang/test/Driver/fmemprof.cpp
    llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h
    llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp
    llvm/test/Instrumentation/HeapProfiler/basic.ll
    llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll
    llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll
    llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll
    llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll

Modified: 
    clang/include/clang/Basic/CodeGenOptions.def
    clang/include/clang/Driver/Options.td
    clang/include/clang/Driver/SanitizerArgs.h
    clang/lib/CodeGen/BackendUtil.cpp
    clang/lib/Driver/SanitizerArgs.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Driver/ToolChains/CommonArgs.cpp
    clang/lib/Frontend/CompilerInvocation.cpp
    llvm/include/llvm/InitializePasses.h
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Passes/PassRegistry.def
    llvm/lib/Transforms/Instrumentation/CMakeLists.txt
    llvm/lib/Transforms/Instrumentation/Instrumentation.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 0f03373c4e25..8b89aac8d6d5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -145,6 +145,7 @@ CODEGENOPT(IncrementalLinkerCompatible, 1, 0) ///< Emit an object file which can
                                               ///< linker.
 CODEGENOPT(MergeAllConstants , 1, 1) ///< Merge identical constants.
 CODEGENOPT(MergeFunctions    , 1, 0) ///< Set when -fmerge-functions is enabled.
+CODEGENOPT(HeapProf          , 1, 0) ///< Set when -fmemprof is enabled.
 CODEGENOPT(MSVolatile        , 1, 0) ///< Set when /volatile:ms is enabled.
 CODEGENOPT(NoCommon          , 1, 0) ///< Set when -fno-common or C++ is enabled.
 CODEGENOPT(NoDwarfDirectoryAsm , 1, 0) ///< Set when -fno-dwarf-directory-asm is

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0bb5df726712..ff7b4aa9320c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -995,6 +995,8 @@ defm cxx_static_destructors : OptOutFFlag<"c++-static-destructors", "",
 def fsymbol_partition_EQ : Joined<["-"], "fsymbol-partition=">, Group<f_Group>,
   Flags<[CC1Option]>;
 
+defm memprof : OptInFFlag<"memprof", "Enable", "Disable", " heap memory profiling">;
+
 // Begin sanitizer flags. These should all be core options exposed in all driver
 // modes.
 let Flags = [CC1Option, CoreOption] in {

diff  --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 563d6c3ff9de..95d6bcf35c78 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -55,13 +55,15 @@ class SanitizerArgs {
   bool MinimalRuntime = false;
   // True if cross-dso CFI support is provided by the system (i.e. Android).
   bool ImplicitCfiRuntime = false;
+  bool NeedsHeapProfRt = false;
 
- public:
+public:
   /// Parses the sanitizer arguments from an argument list.
   SanitizerArgs(const ToolChain &TC, const llvm::opt::ArgList &Args);
 
   bool needsSharedRt() const { return SharedRuntime; }
 
+  bool needsHeapProfRt() const { return NeedsHeapProfRt; }
   bool needsAsanRt() const { return Sanitizers.has(SanitizerKind::Address); }
   bool needsHwasanRt() const {
     return Sanitizers.has(SanitizerKind::HWAddress);

diff  --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 093650ac0066..1d4763fff80e 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -67,6 +67,7 @@
 #include "llvm/Transforms/Instrumentation/BoundsChecking.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
@@ -267,6 +268,12 @@ static bool asanUseGlobalsGC(const Triple &T, const CodeGenOptions &CGOpts) {
   return false;
 }
 
+static void addHeapProfilerPasses(const PassManagerBuilder &Builder,
+                                  legacy::PassManagerBase &PM) {
+  PM.add(createHeapProfilerFunctionPass());
+  PM.add(createModuleHeapProfilerLegacyPassPass());
+}
+
 static void addAddressSanitizerPasses(const PassManagerBuilder &Builder,
                                       legacy::PassManagerBase &PM) {
   const PassManagerBuilderWrapper &BuilderWrapper =
@@ -662,6 +669,13 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM,
   if (LangOpts.Coroutines)
     addCoroutinePassesToExtensionPoints(PMBuilder);
 
+  if (CodeGenOpts.HeapProf) {
+    PMBuilder.addExtension(PassManagerBuilder::EP_OptimizerLast,
+                           addHeapProfilerPasses);
+    PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0,
+                           addHeapProfilerPasses);
+  }
+
   if (LangOpts.Sanitize.has(SanitizerKind::LocalBounds)) {
     PMBuilder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate,
                            addBoundsCheckingPass);
@@ -1367,6 +1381,11 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager(
       }
     }
 
+    if (CodeGenOpts.HeapProf) {
+      MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass()));
+      MPM.addPass(ModuleHeapProfilerPass());
+    }
+
     if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) {
       bool Recover = CodeGenOpts.SanitizeRecover.has(SanitizerKind::HWAddress);
       MPM.addPass(HWAddressSanitizerPass(

diff  --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 8c49e92b2c0f..cce0eb557a9c 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -866,6 +866,9 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
                                 LinkCXXRuntimes) ||
                     D.CCCIsCXX();
 
+  NeedsHeapProfRt =
+      Args.hasFlag(options::OPT_fmemprof, options::OPT_fno_memprof, false);
+
   // Finally, initialize the set of available and recoverable sanitizers.
   Sanitizers.Mask |= Kinds;
   RecoverableSanitizers.Mask |= RecoverableKinds;

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c7f8f0fea5a9..976502c3ca73 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4223,6 +4223,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (Args.getLastArg(options::OPT_save_temps_EQ))
     Args.AddLastArg(CmdArgs, options::OPT_save_temps_EQ);
 
+  if (Args.hasFlag(options::OPT_fmemprof, options::OPT_fno_memprof, false))
+    Args.AddLastArg(CmdArgs, options::OPT_fmemprof);
+
   // Embed-bitcode option.
   // Only white-listed flags below are allowed to be embedded.
   if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() &&

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 912fb2e22abf..39308f8ea5b7 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -686,6 +686,11 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
       if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid())
         HelperStaticRuntimes.push_back("asan-preinit");
     }
+    if (SanArgs.needsHeapProfRt() && SanArgs.linkRuntimes()) {
+      SharedRuntimes.push_back("heapprof");
+      if (!Args.hasArg(options::OPT_shared) && !TC.getTriple().isAndroid())
+        HelperStaticRuntimes.push_back("heapprof-preinit");
+    }
     if (SanArgs.needsUbsanRt() && SanArgs.linkRuntimes()) {
       if (SanArgs.requiresMinimalRuntime())
         SharedRuntimes.push_back("ubsan_minimal");
@@ -721,6 +726,13 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
       StaticRuntimes.push_back("asan_cxx");
   }
 
+  if (!SanArgs.needsSharedRt() && SanArgs.needsHeapProfRt() &&
+      SanArgs.linkRuntimes()) {
+    StaticRuntimes.push_back("heapprof");
+    if (SanArgs.linkCXXRuntimes())
+      StaticRuntimes.push_back("heapprof_cxx");
+  }
+
   if (!SanArgs.needsSharedRt() && SanArgs.needsHwasanRt() && SanArgs.linkRuntimes()) {
     StaticRuntimes.push_back("hwasan");
     if (SanArgs.linkCXXRuntimes())

diff  --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 89dce7515dd1..1cd392f65009 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1033,6 +1033,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.ThinLinkBitcodeFile =
       std::string(Args.getLastArgValue(OPT_fthin_link_bitcode_EQ));
 
+  Opts.HeapProf = Args.hasArg(OPT_fmemprof);
+
   Opts.MSVolatile = Args.hasArg(OPT_fms_volatile);
 
   Opts.VectorizeLoop = Args.hasArg(OPT_vectorize_loops);

diff  --git a/clang/test/Driver/fmemprof.cpp b/clang/test/Driver/fmemprof.cpp
new file mode 100644
index 000000000000..049067803e2b
--- /dev/null
+++ b/clang/test/Driver/fmemprof.cpp
@@ -0,0 +1,6 @@
+// RUN: %clangxx -target x86_64-linux-gnu -fmemprof %s -### 2>&1 | FileCheck %s
+// RUN: %clangxx -target x86_64-linux-gnu -fmemprof -fno-memprof %s -### 2>&1 | FileCheck %s --check-prefix=OFF
+// CHECK: "-cc1" {{.*}} "-fmemprof"
+// CHECK: ld{{.*}}libclang_rt.heapprof{{.*}}libclang_rt.heapprof_cxx
+// OFF-NOT: "-fmemprof"
+// OFF-NOT: libclang_rt.heapprof

diff  --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index f1b4d2f71bde..e8637b5cd454 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -176,6 +176,7 @@ void initializeGlobalSplitPass(PassRegistry&);
 void initializeGlobalsAAWrapperPassPass(PassRegistry&);
 void initializeGuardWideningLegacyPassPass(PassRegistry&);
 void initializeHardwareLoopsPass(PassRegistry&);
+void initializeHeapProfilerLegacyPassPass(PassRegistry &);
 void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
 void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
 void initializeIPSCCPLegacyPassPass(PassRegistry&);
@@ -303,6 +304,7 @@ void initializeMergeICmpsLegacyPassPass(PassRegistry &);
 void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
 void initializeMetaRenamerPass(PassRegistry&);
 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
+void initializeModuleHeapProfilerLegacyPassPass(PassRegistry &);
 void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&);
 void initializeModuloScheduleTestPass(PassRegistry&);
 void initializeMustExecutePrinterPass(PassRegistry&);

diff  --git a/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h
new file mode 100644
index 000000000000..af905bbecad8
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/HeapProfiler.h
@@ -0,0 +1,51 @@
+//===--------- Definition of the HeapProfiler class ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the HeapProfiler class.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// Public interface to the heap profiler pass for instrumenting code to
+/// profile heap memory accesses.
+///
+/// The profiler itself is a function pass that works by inserting various
+/// calls to the HeapProfiler runtime library functions. The runtime library
+/// essentially replaces malloc() and free() with custom implementations that
+/// record data about the allocations.
+class HeapProfilerPass : public PassInfoMixin<HeapProfilerPass> {
+public:
+  explicit HeapProfilerPass();
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// Public interface to the heap profiler module pass for instrumenting code
+/// to profile heap memory allocations and accesses.
+class ModuleHeapProfilerPass : public PassInfoMixin<ModuleHeapProfilerPass> {
+public:
+  explicit ModuleHeapProfilerPass();
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+// Insert HeapProfiler instrumentation
+FunctionPass *createHeapProfilerFunctionPass();
+ModulePass *createModuleHeapProfilerLegacyPassPass();
+
+} // namespace llvm
+
+#endif

diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 23288bb1ac07..515aaea44dea 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -110,6 +110,7 @@
 #include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
 #include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
@@ -258,6 +259,10 @@ static cl::opt<bool>
                             cl::Hidden,
                             cl::desc("Enable inline deferral during PGO"));
 
+static cl::opt<bool> EnableHeapProfiler("enable-heap-prof", cl::init(false),
+                                        cl::Hidden, cl::ZeroOrMore,
+                                        cl::desc("Enable heap profiler"));
+
 PipelineTuningOptions::PipelineTuningOptions() {
   LoopInterleaving = true;
   LoopVectorization = true;
@@ -1034,6 +1039,12 @@ ModulePassManager PassBuilder::buildModuleSimplificationPipeline(
     MPM.addPass(SyntheticCountsPropagation());
 
   MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging));
+
+  if (EnableHeapProfiler && Phase != ThinLTOPhase::PreLink) {
+    MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass()));
+    MPM.addPass(ModuleHeapProfilerPass());
+  }
+
   return MPM;
 }
 

diff  --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 4bbecfeb82a9..406a41967e4a 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -97,6 +97,7 @@ MODULE_PASS("msan-module", MemorySanitizerPass({}))
 MODULE_PASS("tsan-module", ThreadSanitizerPass())
 MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false))
 MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
+MODULE_PASS("heapprof-module", ModuleHeapProfilerPass())
 MODULE_PASS("poison-checking", PoisonCheckingPass())
 #undef MODULE_PASS
 
@@ -276,6 +277,7 @@ FUNCTION_PASS("kasan", AddressSanitizerPass(true, false, false))
 FUNCTION_PASS("msan", MemorySanitizerPass({}))
 FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true}))
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
+FUNCTION_PASS("heapprof", HeapProfilerPass())
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_PASS_WITH_PARAMS

diff  --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 9316de4eb32c..1fc0b140be03 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMInstrumentation
   ControlHeightReduction.cpp
   DataFlowSanitizer.cpp
   GCOVProfiling.cpp
+  HeapProfiler.cpp
   MemorySanitizer.cpp
   IndirectCallPromotion.cpp
   Instrumentation.cpp

diff  --git a/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp b/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp
new file mode 100644
index 000000000000..93d9f0c6362d
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/HeapProfiler.cpp
@@ -0,0 +1,613 @@
+//===- HeapProfiler.cpp - heap allocation and access profiler ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of HeapProfiler. Memory accesses are instrumented
+// to increment the access count held in a shadow memory location, or
+// alternatively to call into the runtime. Memory intrinsic calls (memmove,
+// memcpy, memset) are changed to call the heap profiling runtime version
+// instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "heapprof"
+
+constexpr int LLVM_HEAP_PROFILER_VERSION = 1;
+
+// Size of memory mapped to a single shadow location.
+constexpr uint64_t DefaultShadowGranularity = 64;
+
+// Scale from granularity down to shadow size.
+constexpr uint64_t DefaultShadowScale = 3;
+
+constexpr char HeapProfModuleCtorName[] = "heapprof.module_ctor";
+constexpr uint64_t HeapProfCtorAndDtorPriority = 1;
+// On Emscripten, the system needs more than one priorities for constructors.
+constexpr uint64_t HeapProfEmscriptenCtorAndDtorPriority = 50;
+constexpr char HeapProfInitName[] = "__heapprof_init";
+constexpr char HeapProfVersionCheckNamePrefix[] =
+    "__heapprof_version_mismatch_check_v";
+
+constexpr char HeapProfShadowMemoryDynamicAddress[] =
+    "__heapprof_shadow_memory_dynamic_address";
+
+// Command-line flags.
+
+static cl::opt<bool> ClInsertVersionCheck(
+    "heapprof-guard-against-version-mismatch",
+    cl::desc("Guard against compiler/runtime version mismatch."), cl::Hidden,
+    cl::init(true));
+
+// This flag may need to be replaced with -f[no-]memprof-reads.
+static cl::opt<bool> ClInstrumentReads("heapprof-instrument-reads",
+                                       cl::desc("instrument read instructions"),
+                                       cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+    ClInstrumentWrites("heapprof-instrument-writes",
+                       cl::desc("instrument write instructions"), cl::Hidden,
+                       cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+    "heapprof-instrument-atomics",
+    cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> ClUseCalls(
+    "heapprof-use-callbacks",
+    cl::desc("Use callbacks instead of inline instrumentation sequences."),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<std::string>
+    ClMemoryAccessCallbackPrefix("heapprof-memory-access-callback-prefix",
+                                 cl::desc("Prefix for memory access callbacks"),
+                                 cl::Hidden, cl::init("__heapprof_"));
+
+// These flags allow changing the shadow mapping.
+// The shadow mapping looks like
+//    Shadow = ((Mem & mask) >> scale) + offset
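+// For example, with the default granularity of 64 (mask = ~63) and scale of
+// 3, an access to address 0x1042 is masked to 0x1040, shifted right to
+// 0x208, and the 8-byte counter at 0x208 + offset is incremented.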
+
+static cl::opt<int> ClMappingScale("heapprof-mapping-scale",
+                                   cl::desc("scale of heapprof shadow mapping"),
+                                   cl::Hidden, cl::init(DefaultShadowScale));
+
+static cl::opt<int>
+    ClMappingGranularity("heapprof-mapping-granularity",
+                         cl::desc("granularity of heapprof shadow mapping"),
+                         cl::Hidden, cl::init(DefaultShadowGranularity));
+
+// Debug flags.
+
+static cl::opt<int> ClDebug("heapprof-debug", cl::desc("debug"), cl::Hidden,
+                            cl::init(0));
+
+static cl::opt<std::string> ClDebugFunc("heapprof-debug-func", cl::Hidden,
+                                        cl::desc("Debug func"));
+
+static cl::opt<int> ClDebugMin("heapprof-debug-min", cl::desc("Debug min inst"),
+                               cl::Hidden, cl::init(-1));
+
+static cl::opt<int> ClDebugMax("heapprof-debug-max", cl::desc("Debug max inst"),
+                               cl::Hidden, cl::init(-1));
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+
+namespace {
+
+/// This struct defines the shadow mapping using the rule:
+///   shadow = ((mem & mask) >> Scale) + DynamicShadowOffset.
+struct ShadowMapping {
+  ShadowMapping() {
+    Scale = ClMappingScale;
+    Granularity = ClMappingGranularity;
+    Mask = ~(Granularity - 1);
+  }
+
+  int Scale;
+  int Granularity;
+  uint64_t Mask; // Computed as ~(Granularity-1)
+};
+
+static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) {
+  return TargetTriple.isOSEmscripten() ? HeapProfEmscriptenCtorAndDtorPriority
+                                       : HeapProfCtorAndDtorPriority;
+}
+
+struct InterestingMemoryAccess {
+  Value *Addr = nullptr;
+  bool IsWrite;
+  unsigned Alignment;
+  uint64_t TypeSize;
+  Value *MaybeMask = nullptr;
+};
+
+/// Instrument the code in a module to profile heap accesses.
+class HeapProfiler {
+public:
+  HeapProfiler(Module &M) {
+    C = &(M.getContext());
+    LongSize = M.getDataLayout().getPointerSizeInBits();
+    IntptrTy = Type::getIntNTy(*C, LongSize);
+  }
+
+  /// If it is an interesting memory access, populate information
+  /// about the access and return an InterestingMemoryAccess struct.
+  /// Otherwise return None.
+  Optional<InterestingMemoryAccess> isInterestingMemoryAccess(Instruction *I);
+
+  void instrumentMop(Instruction *I, const DataLayout &DL,
+                     InterestingMemoryAccess &Access);
+  void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+                         Value *Addr, uint32_t TypeSize, bool IsWrite);
+  void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
+                                   Instruction *I, Value *Addr,
+                                   unsigned Alignment, uint32_t TypeSize,
+                                   bool IsWrite);
+  void instrumentMemIntrinsic(MemIntrinsic *MI);
+  Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+  bool instrumentFunction(Function &F);
+  bool maybeInsertHeapProfInitAtFunctionEntry(Function &F);
+  void insertDynamicShadowAtFunctionEntry(Function &F);
+
+private:
+  void initializeCallbacks(Module &M);
+
+  LLVMContext *C;
+  int LongSize;
+  Type *IntptrTy;
+  ShadowMapping Mapping;
+
+  // These arrays are indexed by AccessIsWrite.
+  FunctionCallee HeapProfMemoryAccessCallback[2];
+  FunctionCallee HeapProfMemoryAccessCallbackSized[2];
+
+  FunctionCallee HeapProfMemmove, HeapProfMemcpy, HeapProfMemset;
+  Value *DynamicShadowOffset = nullptr;
+};
+
+class HeapProfilerLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  explicit HeapProfilerLegacyPass() : FunctionPass(ID) {
+    initializeHeapProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "HeapProfilerFunctionPass"; }
+
+  bool runOnFunction(Function &F) override {
+    HeapProfiler Profiler(*F.getParent());
+    return Profiler.instrumentFunction(F);
+  }
+};
+
+class ModuleHeapProfiler {
+public:
+  ModuleHeapProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); }
+
+  bool instrumentModule(Module &);
+
+private:
+  Triple TargetTriple;
+  ShadowMapping Mapping;
+  Function *HeapProfCtorFunction = nullptr;
+};
+
+class ModuleHeapProfilerLegacyPass : public ModulePass {
+public:
+  static char ID;
+
+  explicit ModuleHeapProfilerLegacyPass() : ModulePass(ID) {
+    initializeModuleHeapProfilerLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "ModuleHeapProfiler"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+  bool runOnModule(Module &M) override {
+    ModuleHeapProfiler HeapProfiler(M);
+    return HeapProfiler.instrumentModule(M);
+  }
+};
+
+} // end anonymous namespace
+
+HeapProfilerPass::HeapProfilerPass() {}
+
+PreservedAnalyses HeapProfilerPass::run(Function &F,
+                                        AnalysisManager<Function> &AM) {
+  Module &M = *F.getParent();
+  HeapProfiler Profiler(M);
+  if (Profiler.instrumentFunction(F))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+ModuleHeapProfilerPass::ModuleHeapProfilerPass() {}
+
+PreservedAnalyses ModuleHeapProfilerPass::run(Module &M,
+                                              AnalysisManager<Module> &AM) {
+  ModuleHeapProfiler Profiler(M);
+  if (Profiler.instrumentModule(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+char HeapProfilerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HeapProfilerLegacyPass, "heapprof",
+                      "HeapProfiler: profile heap allocations and accesses.",
+                      false, false)
+INITIALIZE_PASS_END(HeapProfilerLegacyPass, "heapprof",
+                    "HeapProfiler: profile heap allocations and accesses.",
+                    false, false)
+
+FunctionPass *llvm::createHeapProfilerFunctionPass() {
+  return new HeapProfilerLegacyPass();
+}
+
+char ModuleHeapProfilerLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ModuleHeapProfilerLegacyPass, "heapprof-module",
+                "HeapProfiler: profile heap allocations and accesses."
+                "ModulePass",
+                false, false)
+
+ModulePass *llvm::createModuleHeapProfilerLegacyPassPass() {
+  return new ModuleHeapProfilerLegacyPass();
+}
+
+Value *HeapProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+  // (Shadow & mask) >> scale
+  Shadow = IRB.CreateAnd(Shadow, Mapping.Mask);
+  Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+  // (Shadow >> scale) + offset
+  assert(DynamicShadowOffset);
+  return IRB.CreateAdd(Shadow, DynamicShadowOffset);
+}
+
+// Instrument memset/memmove/memcpy
+void HeapProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) {
+  IRBuilder<> IRB(MI);
+  if (isa<MemTransferInst>(MI)) {
+    IRB.CreateCall(
+        isa<MemMoveInst>(MI) ? HeapProfMemmove : HeapProfMemcpy,
+        {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+  } else if (isa<MemSetInst>(MI)) {
+    IRB.CreateCall(
+        HeapProfMemset,
+        {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+  }
+  MI->eraseFromParent();
+}
+
+Optional<InterestingMemoryAccess>
+HeapProfiler::isInterestingMemoryAccess(Instruction *I) {
+  // Do not instrument the load fetching the dynamic shadow address.
+  if (DynamicShadowOffset == I)
+    return None;
+
+  InterestingMemoryAccess Access;
+
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (!ClInstrumentReads)
+      return None;
+    Access.IsWrite = false;
+    Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
+    Access.Alignment = LI->getAlignment();
+    Access.Addr = LI->getPointerOperand();
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (!ClInstrumentWrites)
+      return None;
+    Access.IsWrite = true;
+    Access.TypeSize =
+        DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
+    Access.Alignment = SI->getAlignment();
+    Access.Addr = SI->getPointerOperand();
+  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+    if (!ClInstrumentAtomics)
+      return None;
+    Access.IsWrite = true;
+    Access.TypeSize =
+        DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
+    Access.Alignment = 0;
+    Access.Addr = RMW->getPointerOperand();
+  } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+    if (!ClInstrumentAtomics)
+      return None;
+    Access.IsWrite = true;
+    Access.TypeSize =
+        DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
+    Access.Alignment = 0;
+    Access.Addr = XCHG->getPointerOperand();
+  } else if (auto *CI = dyn_cast<CallInst>(I)) {
+    auto *F = CI->getCalledFunction();
+    if (F && (F->getIntrinsicID() == Intrinsic::masked_load ||
+              F->getIntrinsicID() == Intrinsic::masked_store)) {
+      unsigned OpOffset = 0;
+      if (F->getIntrinsicID() == Intrinsic::masked_store) {
+        if (!ClInstrumentWrites)
+          return None;
+        // Masked store has an initial operand for the value.
+        OpOffset = 1;
+        Access.IsWrite = true;
+      } else {
+        if (!ClInstrumentReads)
+          return None;
+        Access.IsWrite = false;
+      }
+
+      auto *BasePtr = CI->getOperand(0 + OpOffset);
+      auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+      Access.TypeSize = DL.getTypeStoreSizeInBits(Ty);
+      if (auto *AlignmentConstant =
+              dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+        Access.Alignment = (unsigned)AlignmentConstant->getZExtValue();
+      else
+        Access.Alignment = 1; // No alignment guarantees. We probably got Undef.
+      Access.MaybeMask = CI->getOperand(2 + OpOffset);
+      Access.Addr = BasePtr;
+    }
+  }
+
+  if (!Access.Addr)
+    return None;
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+  // with them.
+  Type *PtrTy = cast<PointerType>(Access.Addr->getType()->getScalarType());
+  if (PtrTy->getPointerAddressSpace() != 0)
+    return None;
+
+  // Ignore swifterror addresses.
+  // swifterror memory addresses are mem2reg promoted by instruction
+  // selection. As such they cannot have regular uses like an instrumentation
+  // function and it makes no sense to track them as memory.
+  if (Access.Addr->isSwiftError())
+    return None;
+
+  return Access;
+}
+
+void HeapProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL,
+                                               Value *Mask, Instruction *I,
+                                               Value *Addr, unsigned Alignment,
+                                               uint32_t TypeSize,
+                                               bool IsWrite) {
+  auto *VTy =
+      cast<VectorType>(cast<PointerType>(Addr->getType())->getElementType());
+  uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
+  unsigned Num = VTy->getNumElements();
+  auto *Zero = ConstantInt::get(IntptrTy, 0);
+  for (unsigned Idx = 0; Idx < Num; ++Idx) {
+    Value *InstrumentedAddress = nullptr;
+    Instruction *InsertBefore = I;
+    if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
+      // dyn_cast as we might get UndefValue
+      if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
+        if (Masked->isZero())
+          // Mask is constant false, so no instrumentation needed.
+          continue;
+        // If we have a true or undef value, fall through to instrumentAddress
+        // with InsertBefore == I.
+      }
+    } else {
+      IRBuilder<> IRB(I);
+      Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
+      Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+      InsertBefore = ThenTerm;
+    }
+
+    IRBuilder<> IRB(InsertBefore);
+    InstrumentedAddress =
+        IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
+    instrumentAddress(I, InsertBefore, InstrumentedAddress, ElemTypeSize,
+                      IsWrite);
+  }
+}
+
+void HeapProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
+                                 InterestingMemoryAccess &Access) {
+  if (Access.IsWrite)
+    NumInstrumentedWrites++;
+  else
+    NumInstrumentedReads++;
+
+  if (Access.MaybeMask) {
+    instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr,
+                                Access.Alignment, Access.TypeSize,
+                                Access.IsWrite);
+  } else {
+    // Since the access counts will be accumulated across the entire allocation,
+    // we only update the shadow access count for the first location and thus
+    // don't need to worry about alignment and type size.
+    instrumentAddress(I, I, Access.Addr, Access.TypeSize, Access.IsWrite);
+  }
+}
+
+void HeapProfiler::instrumentAddress(Instruction *OrigIns,
+                                     Instruction *InsertBefore, Value *Addr,
+                                     uint32_t TypeSize, bool IsWrite) {
+  IRBuilder<> IRB(InsertBefore);
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+
+  if (ClUseCalls) {
+    IRB.CreateCall(HeapProfMemoryAccessCallback[IsWrite], AddrLong);
+    return;
+  }
+
+  // Create an inline sequence to compute the shadow location, and increment
+  // the value by one.
+  Type *ShadowTy = Type::getInt64Ty(*C);
+  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+  Value *ShadowPtr = memToShadow(AddrLong, IRB);
+  Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy);
+  Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr);
+  Value *Inc = ConstantInt::get(Type::getInt64Ty(*C), 1);
+  ShadowValue = IRB.CreateAdd(ShadowValue, Inc);
+  IRB.CreateStore(ShadowValue, ShadowAddr);
+}
+
+bool ModuleHeapProfiler::instrumentModule(Module &M) {
+  // Create a module constructor.
+  std::string HeapProfVersion = std::to_string(LLVM_HEAP_PROFILER_VERSION);
+  std::string VersionCheckName =
+      ClInsertVersionCheck ? (HeapProfVersionCheckNamePrefix + HeapProfVersion)
+                           : "";
+  std::tie(HeapProfCtorFunction, std::ignore) =
+      createSanitizerCtorAndInitFunctions(M, HeapProfModuleCtorName,
+                                          HeapProfInitName, /*InitArgTypes=*/{},
+                                          /*InitArgs=*/{}, VersionCheckName);
+
+  const uint64_t Priority = getCtorAndDtorPriority(TargetTriple);
+  appendToGlobalCtors(M, HeapProfCtorFunction, Priority);
+
+  return true;
+}
+
+void HeapProfiler::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+
+  for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+    const std::string TypeStr = AccessIsWrite ? "store" : "load";
+
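+    // Args2 is the (addr, size) signature used by the sized "N" callbacks;
+    // Args1 uses SmallVector's count+value constructor, i.e. a single
+    // IntptrTy (addr) parameter for the plain load/store callbacks.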
+    SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+    SmallVector<Type *, 2> Args1{1, IntptrTy};
+    HeapProfMemoryAccessCallbackSized[AccessIsWrite] =
+        M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr + "N",
+                              FunctionType::get(IRB.getVoidTy(), Args2, false));
+
+    HeapProfMemoryAccessCallback[AccessIsWrite] =
+        M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr,
+                              FunctionType::get(IRB.getVoidTy(), Args1, false));
+  }
+  HeapProfMemmove = M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+  HeapProfMemcpy = M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+  HeapProfMemset = M.getOrInsertFunction(
+      ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
+}
+
+bool HeapProfiler::maybeInsertHeapProfInitAtFunctionEntry(Function &F) {
+  // For each NSObject descendant having a +load method, this method is invoked
+  // by the ObjC runtime before any of the static constructors is called.
+  // Therefore we need to instrument such methods with a call to __heapprof_init
+  // at the beginning in order to initialize our runtime before any access to
+  // the shadow memory.
+  // We cannot just ignore these methods, because they may call other
+  // instrumented functions.
+  if (F.getName().find(" load]") != std::string::npos) {
+    FunctionCallee HeapProfInitFunction =
+        declareSanitizerInitFunction(*F.getParent(), HeapProfInitName, {});
+    IRBuilder<> IRB(&F.front(), F.front().begin());
+    IRB.CreateCall(HeapProfInitFunction, {});
+    return true;
+  }
+  return false;
+}
+
+void HeapProfiler::insertDynamicShadowAtFunctionEntry(Function &F) {
+  IRBuilder<> IRB(&F.front().front());
+  Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+      HeapProfShadowMemoryDynamicAddress, IntptrTy);
+  DynamicShadowOffset = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
+}
+
+bool HeapProfiler::instrumentFunction(Function &F) {
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+    return false;
+  if (ClDebugFunc == F.getName())
+    return false;
+  if (F.getName().startswith("__heapprof_"))
+    return false;
+
+  bool FunctionModified = false;
+
+  // If needed, insert __heapprof_init.
+  // This function needs to be called even if the function body is not
+  // instrumented.
+  if (maybeInsertHeapProfInitAtFunctionEntry(F))
+    FunctionModified = true;
+
+  LLVM_DEBUG(dbgs() << "HEAPPROF instrumenting:\n" << F << "\n");
+
+  initializeCallbacks(*F.getParent());
+
+  insertDynamicShadowAtFunctionEntry(F);
+
+  SmallVector<Instruction *, 16> ToInstrument;
+
+  // Fill the set of memory operations to instrument.
+  for (auto &BB : F) {
+    for (auto &Inst : BB) {
+      if (isInterestingMemoryAccess(&Inst) || isa<MemIntrinsic>(Inst))
+        ToInstrument.push_back(&Inst);
+    }
+  }
+
+  int NumInstrumented = 0;
+  for (auto *Inst : ToInstrument) {
+    if (ClDebugMin < 0 || ClDebugMax < 0 ||
+        (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
+      Optional<InterestingMemoryAccess> Access =
+          isInterestingMemoryAccess(Inst);
+      if (Access)
+        instrumentMop(Inst, F.getParent()->getDataLayout(), *Access);
+      else
+        instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+    }
+    NumInstrumented++;
+  }
+
+  if (NumInstrumented > 0)
+    FunctionModified = true;
+
+  LLVM_DEBUG(dbgs() << "HEAPPROF done instrumenting: " << FunctionModified
+                    << " " << F << "\n");
+
+  return FunctionModified;
+}

diff  --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index eda38e7da4f4..5cf3c2e3e11b 100644
--- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -105,6 +105,8 @@ Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerLegacyPassPass(Registry);
   initializeModuleAddressSanitizerLegacyPassPass(Registry);
+  initializeHeapProfilerLegacyPassPass(Registry);
+  initializeModuleHeapProfilerLegacyPassPass(Registry);
   initializeBoundsCheckingLegacyPassPass(Registry);
   initializeControlHeightReductionLegacyPassPass(Registry);
   initializeGCOVProfilerLegacyPassPass(Registry);

diff  --git a/llvm/test/Instrumentation/HeapProfiler/basic.ll b/llvm/test/Instrumentation/HeapProfiler/basic.ll
new file mode 100644
index 000000000000..a26dae15f509
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/basic.ll
@@ -0,0 +1,179 @@
+; Test basic heap profiler instrumentation.
+;
+; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+
+; Also check the new pass manager versions of the passes.
+; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+; CHECK: @llvm.global_ctors = {{.*}}@heapprof.module_ctor
+
+define i32 @test_load(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a, align 4
+  ret i32 %tmp1
+}
+; CHECK-LABEL: @test_load
+; CHECK:         %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address
+; CHECK-NEXT:    %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK-NEXT:    %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -64
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5
+; CHECK-NEXT:    add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT:    %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT:    %[[LOAD_SHADOW:[^ ]*]] = load i64, i64* %[[LOAD_SHADOW_PTR]]
+; CHECK-NEXT:    %[[NEW_SHADOW:[^ ]*]] = add i64 %[[LOAD_SHADOW]], 1
+; CHECK-NEXT:    store i64 %[[NEW_SHADOW]], i64* %[[LOAD_SHADOW_PTR]]
+; The actual load.
+; CHECK-NEXT:    %tmp1 = load i32, i32* %a
+; CHECK-NEXT:    ret i32 %tmp1
+
+define void @test_store(i32* %a) {
+entry:
+  store i32 42, i32* %a, align 4
+  ret void
+}
+; CHECK-LABEL: @test_store
+; CHECK:         %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address
+; CHECK-NEXT:    %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK-NEXT:    %[[MASKED_ADDR:[^ ]*]] = and i64 %[[STORE_ADDR]], -64
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5
+; CHECK-NEXT:    add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT:    %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT:    %[[STORE_SHADOW:[^ ]*]] = load i64, i64* %[[STORE_SHADOW_PTR]]
+; CHECK-NEXT:    %[[NEW_SHADOW:[^ ]*]] = add i64 %[[STORE_SHADOW]], 1
+; CHECK-NEXT:    store i64 %[[NEW_SHADOW]], i64* %[[STORE_SHADOW_PTR]]
+; The actual store.
+; CHECK-NEXT:    store i32 42, i32* %a
+; CHECK-NEXT:    ret void
+
+define void @FP80Test(x86_fp80* nocapture %a) nounwind uwtable {
+entry:
+    store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a, align 16
+    ret void
+}
+; CHECK-LABEL: @FP80Test
+; Exactly one shadow update for store access.
+; CHECK-NOT:  store i64
+; CHECK:      %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT:  store i64
+; The actual store.
+; CHECK:      store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a
+; CHECK:      ret void
+
+define void @i40test(i40* %a, i40* %b) nounwind uwtable {
+entry:
+  %t = load i40, i40* %a
+  store i40 %t, i40* %b, align 8
+  ret void
+}
+; CHECK-LABEL: @i40test
+; Exactly one shadow update for load access.
+; CHECK-NOT:  store i64
+; CHECK:      %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]]
+; CHECK-NOT:  store i64
+; The actual load.
+; CHECK:      %t = load i40, i40* %a
+; Exactly one shadow update for store access.
+; CHECK-NOT:  store i64
+; CHECK:      %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT:  store i64
+; The actual store.
+; CHECK:      store i40 %t, i40* %b
+; CHECK:      ret void
+
+define void @i64test_align1(i64* %b) nounwind uwtable {
+  entry:
+  store i64 0, i64* %b, align 1
+  ret void
+}
+; CHECK-LABEL: @i64test_align1
+; Exactly one shadow update for store access.
+; CHECK-NOT:  store i64
+; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT:  store i64
+; The actual store.
+; CHECK:      store i64 0, i64* %b
+; CHECK:      ret void
+
+define void @i80test(i80* %a, i80* %b) nounwind uwtable {
+  entry:
+  %t = load i80, i80* %a
+  store i80 %t, i80* %b, align 8
+  ret void
+}
+; CHECK-LABEL: @i80test
+; Exactly one shadow update for load access.
+; CHECK-NOT:  store i64
+; CHECK:      %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]]
+; CHECK-NOT:  store i64
+; The actual load.
+; CHECK:      %t = load i80, i80* %a
+; Exactly one shadow update for store access.
+; CHECK-NOT:  store i64
+; CHECK:      %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
+; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
+; CHECK-NOT:  store i64
+; The actual store.
+; CHECK:      store i80 %t, i80* %b
+; CHECK:      ret void
+
+; heapprof should not instrument functions with available_externally linkage.
+define available_externally i32 @f_available_externally(i32* %a)  {
+entry:
+  %tmp1 = load i32, i32* %a
+  ret i32 %tmp1
+}
+; CHECK-LABEL: @f_available_externally
+; CHECK-NOT: __heapprof_shadow_memory_dynamic_address
+; CHECK: ret i32
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind
+
+define void @memintr_test(i8* %a, i8* %b) nounwind uwtable {
+  entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 100, i1 false)
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memintr_test
+; CHECK: __heapprof_memset
+; CHECK: __heapprof_memmove
+; CHECK: __heapprof_memcpy
+; CHECK: ret void
+
+declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind
+
+define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable {
+  ; This is a canary test to make sure that these don't get lowered into calls that don't
+  ; have the element-atomic property. Eventually, heapprof will have to be enhanced to lower
+  ; these properly.
+  ; CHECK-LABEL: @memintr_element_atomic_test
+  ; CHECK: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1)
+  ; CHECK: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  ; CHECK: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  ; CHECK: ret void
+  tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1)
+  tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
+  ret void
+}
+
+
+; CHECK: define internal void @heapprof.module_ctor()
+; CHECK: call void @__heapprof_init()

diff  --git a/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll
new file mode 100644
index 000000000000..9df3df47d3d0
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/instrumentation-use-callbacks.ll
@@ -0,0 +1,36 @@
+; Test heapprof internal compiler flags:
+;   -heapprof-use-callbacks
+;   -heapprof-memory-access-callback-prefix
+
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-DEFAULT
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -heapprof-memory-access-callback-prefix=__foo_ -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-CUSTOM
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks=false -S | FileCheck %s --check-prefix=CHECK-INLINE
+; RUN: opt < %s -heapprof -heapprof-module  -S | FileCheck %s --check-prefix=CHECK-INLINE
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test_load(i32* %a, i64* %b, i512* %c, i80* %d) {
+entry:
+; CHECK-CALL:             %[[LOAD_ADDR1:[^ ]*]] = ptrtoint i32* %a to i64
+; CHECK-CALL-DEFAULT:     call void @__heapprof_load(i64 %[[LOAD_ADDR1]])
+; CHECK-CALL-CUSTOM:      call void @__foo_load(i64 %[[LOAD_ADDR1]])
+; CHECK-CALL:             %[[LOAD_ADDR2:[^ ]*]] = ptrtoint i64* %b to i64
+; CHECK-CALL-DEFAULT:     call void @__heapprof_load(i64 %[[LOAD_ADDR2]])
+; CHECK-CALL-CUSTOM:      call void @__foo_load(i64 %[[LOAD_ADDR2]])
+; CHECK-CALL:             %[[LOAD_ADDR3:[^ ]*]] = ptrtoint i512* %c to i64
+; CHECK-CALL-DEFAULT:     call void @__heapprof_load(i64 %[[LOAD_ADDR3]])
+; CHECK-CALL-CUSTOM:      call void @__foo_load(i64 %[[LOAD_ADDR3]])
+; CHECK-CALL:             %[[LOAD_ADDR4:[^ ]*]] = ptrtoint i80* %d to i64
+; CHECK-CALL-DEFAULT:     call void @__heapprof_load(i64 %[[LOAD_ADDR4]])
+; CHECK-CALL-CUSTOM:      call void @__foo_load(i64 %[[LOAD_ADDR4]])
+; CHECK-CALL-DEFAULT-NOT: call void @__heapprof_load
+; CHECK-CALL-CUSTOM-NOT:  call void @__foo_load
+; CHECK-INLINE-NOT:       call void @__heapprof_load
+  %tmp1 = load i32, i32* %a, align 4
+  %tmp2 = load i64, i64* %b, align 8
+  %tmp3 = load i512, i512* %c, align 32
+  %tmp4 = load i80, i80* %d, align 8
+  ret void
+}
+
+

diff  --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll
new file mode 100644
index 000000000000..fa493a454ef1
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll
@@ -0,0 +1,246 @@
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -S \
+; RUN:     | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -S \
+; RUN:     | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-writes=0 -S \
+; RUN:     | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL
+; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -heapprof-instrument-writes=0 -S \
+; RUN:     | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL
+; Test heap profiling instrumentation of constant-mask llvm.masked.{load,store}.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+ at v4f32 = global <4 x float>* zeroinitializer, align 8
+ at v8i32 = global <8 x i32>* zeroinitializer, align 8
+ at v4i64 = global <4 x i32*>* zeroinitializer, align 8
+
+;;;;;;;;;;;;;;;; STORE
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) argmemonly nounwind
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) argmemonly nounwind
+declare void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*>, <4 x i32*>*, i32, <4 x i1>) argmemonly nounwind
+
+define void @store.v4f32.1110(<4 x float> %arg) {
+; ALL-LABEL: @store.v4f32.1110
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; NOSTORE-NOT: call void @__heapprof_store
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP1]])
+; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP2]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+  ret void
+}
+
+define void @store.v8i32.10010110(<8 x i32> %arg) {
+; ALL-LABEL: @store.v8i32.10010110
+  %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
+; NOSTORE-NOT: call void @__heapprof_store
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP3]])
+; STORE: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5
+; STORE: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP5]])
+; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6
+; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP6]])
+; STORE: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
+  tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
+  ret void
+}
+
+define void @store.v4i64.0001(<4 x i32*> %arg) {
+; ALL-LABEL: @store.v4i64.0001
+  %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
+; NOSTORE-NOT: call void @__heapprof_store
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP3]])
+; STORE: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+  tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+  ret void
+}
+
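+;; With a non-constant mask the active lanes are unknown at compile time, so
+;; each lane's profiling call is guarded by a branch on the corresponding
+;; mask element (extractelement + br), as checked below.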
+define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) {
+; ALL-LABEL: @store.v4f32.variable
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0
+; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]]
+; STORE: [[THEN0]]:
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: br label %[[AFTER0]]
+; STORE: [[AFTER0]]:
+
+; STORE: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1
+; STORE: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]]
+; STORE: [[THEN1]]:
+; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP1]])
+; STORE: br label %[[AFTER1]]
+; STORE: [[AFTER1]]:
+
+; STORE: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2
+; STORE: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]]
+; STORE: [[THEN2]]:
+; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP2]])
+; STORE: br label %[[AFTER2]]
+; STORE: [[AFTER2]]:
+
+; STORE: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3
+; STORE: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]]
+; STORE: [[THEN3]]:
+; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP3]])
+; STORE: br label %[[AFTER3]]
+; STORE: [[AFTER3]]:
+
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+;; Store through two masked.stores; the pass should instrument both.
+define void @store.v4f32.1010.split(<4 x float> %arg) {
+; BOTH-LABEL: @store.v4f32.1010.split
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP0]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; STORE: call void @__heapprof_store(i64 [[PGEP2]])
+; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
+  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
+  ret void
+}
+
+;;;;;;;;;;;;;;;; LOAD
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) argmemonly nounwind
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) argmemonly nounwind
+declare <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>*, i32, <4 x i1>, <4 x i32*>) argmemonly nounwind
+
+define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) {
+; ALL-LABEL: @load.v8i32.11100001
+  %p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
+; NOLOAD-NOT: call void @__heapprof_load
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1
+; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP1]])
+; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2
+; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP2]])
+; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7
+; LOAD: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP7]])
+; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
+  %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
+  ret <8 x i32> %res
+}
+
+define <4 x float> @load.v4f32.1001(<4 x float> %arg) {
+; ALL-LABEL: @load.v4f32.1001
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; NOLOAD-NOT: call void @__heapprof_load
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
+  %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
+  ret <4 x float> %res
+}
+
+define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) {
+; ALL-LABEL: @load.v4i64.0001
+  %p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
+; NOLOAD-NOT: call void @__heapprof_load
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
+  %res = tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
+  ret <4 x i32*> %res
+}
+
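+;; As with the variable-mask store above, each lane's load instrumentation is
+;; guarded by a branch on its mask element.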
+define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) {
+; ALL-LABEL: @load.v4f32.variable
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; LOAD: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0
+; LOAD: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]]
+; LOAD: [[THEN0]]:
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: br label %[[AFTER0]]
+; LOAD: [[AFTER0]]:
+
+; LOAD: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1
+; LOAD: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]]
+; LOAD: [[THEN1]]:
+; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
+; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP1]])
+; LOAD: br label %[[AFTER1]]
+; LOAD: [[AFTER1]]:
+
+; LOAD: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2
+; LOAD: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]]
+; LOAD: [[THEN2]]:
+; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
+; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP2]])
+; LOAD: br label %[[AFTER2]]
+; LOAD: [[AFTER2]]:
+
+; LOAD: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3
+; LOAD: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]]
+; LOAD: [[THEN3]]:
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: br label %[[AFTER3]]
+; LOAD: [[AFTER3]]:
+
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
+  %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
+  ret <4 x float> %res
+}
+
+;; Load through two masked.loads; the pass should instrument both.
+define <4 x float> @load.v4f32.1001.split(<4 x float> %arg) {
+; BOTH-LABEL: @load.v4f32.1001.split
+  %p = load <4 x float>*, <4 x float>** @v4f32, align 8
+; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
+; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
+; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg)
+  %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg)
+; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
+; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
+; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
+; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res)
+  %res2 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res)
+  ret <4 x float> %res2
+}

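Distilled from the CHECK patterns above: in the callback mode this test
exercises, every active lane of a masked load or store gets the same
three-instruction sequence. A minimal sketch for lane 0 of a <4 x float>
store (illustrative IR assembled from the checks, not verbatim pass output):

  %gep = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
  %addr = ptrtoint float* %gep to i64
  call void @__heapprof_store(i64 %addr)

Loads are instrumented identically, except the callback is @__heapprof_load.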
diff --git a/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll
new file mode 100644
index 000000000000..c8c3a6d605db
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/scale-granularity.ll
@@ -0,0 +1,29 @@
+; Test that the scale (-heapprof-mapping-scale) and granularity (-heapprof-mapping-granularity) command-line options work as expected.
+;
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 32 -S | FileCheck --check-prefix=CHECK-GRAN %s
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale 1 -S | FileCheck --check-prefix=CHECK-SCALE %s
+; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 16 -heapprof-mapping-scale 0 -S | FileCheck --check-prefix=CHECK-BOTH %s
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @read(i32* %a) {
+entry:
+  %tmp1 = load i32, i32* %a, align 4
+  ret i32 %tmp1
+}
+; CHECK-GRAN-LABEL: @read
+; CHECK-GRAN-NOT:     ret
+; CHECK-GRAN:         and {{.*}} -32
+; CHECK-GRAN-NEXT:    lshr {{.*}} 3
+; CHECK-GRAN:         ret
+
+; CHECK-SCALE-LABEL: @read
+; CHECK-SCALE-NOT:     ret
+; CHECK-SCALE:         and {{.*}} -64
+; CHECK-SCALE-NEXT:    lshr {{.*}} 1
+; CHECK-SCALE:         ret
+
+; CHECK-BOTH-LABEL: @read
+; CHECK-BOTH-NOT:     ret
+; CHECK-BOTH:         and {{.*}} -16
+; CHECK-BOTH-NEXT:    lshr {{.*}} 0
+; CHECK-BOTH:         ret

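The and/lshr pairs these prefixes match are the inline shadow-address
computation: the accessed address is aligned down to the mapping granularity,
then shifted right by the mapping scale. A sketch with the defaults implied by
the checks (granularity 64, scale 3); the final add of the shadow base is an
assumption, since the checks only pin down the mask and shift:

  %aligned = and i64 %addr, -64          ; clear low log2(granularity) bits
  %shadowoff = lshr i64 %aligned, 3      ; shift by the mapping scale
  %shadow = add i64 %shadowoff, %base    ; %base: assumed shadow offset

Hence -heapprof-mapping-granularity 32 produces 'and {{.*}} -32' and
-heapprof-mapping-scale 1 produces 'lshr {{.*}} 1', as checked above.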
diff --git a/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll
new file mode 100644
index 000000000000..84e039551d70
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/version-mismatch-check.ll
@@ -0,0 +1,12 @@
+; Check that the HeapProf module constructor guards against compiler/runtime version
+; mismatch.
+
+; RUN: opt < %s -heapprof-module -S | FileCheck %s
+; RUN: opt < %s -heapprof-module -heapprof-guard-against-version-mismatch=0 -S | FileCheck %s --check-prefix=NOGUARD
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: define internal void @heapprof.module_ctor()
+; CHECK:         call void @__heapprof_version_mismatch_check_v1
+; NOGUARD-NOT:   call void @__heapprof_version_mismatch_check_

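Reconstructed from the CHECK lines, the guarded module constructor looks
roughly like this (a sketch; the ctor emitted by -heapprof-module may perform
additional runtime initialization beyond what these checks cover):

  define internal void @heapprof.module_ctor() {
    call void @__heapprof_version_mismatch_check_v1()
    ret void
  }

With -heapprof-guard-against-version-mismatch=0 the call is omitted, which the
NOGUARD prefix verifies.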

More information about the cfe-commits mailing list