[llvm-branch-commits] [llvm] SelectionDAG: Avoid using MachineFunction::getMMI (PR #99696)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jul 19 23:14:06 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/99696
>From a8b90c835291df9a6c152fd32de666192aa0da77 Mon Sep 17 00:00:00 2001
From: aaryanshukla <53713108+aaryanshukla at users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:18:25 -0700
Subject: [PATCH 01/39] [libc][newheadergen]: adding entry_point testing
(#99587)
---
libc/newhdrgen/tests/input/test_small.yaml | 9 +++++++
libc/newhdrgen/tests/test_integration.py | 29 +++++++++++++---------
2 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/libc/newhdrgen/tests/input/test_small.yaml b/libc/newhdrgen/tests/input/test_small.yaml
index f1f9f020ce6a4..a2e96939b4be0 100644
--- a/libc/newhdrgen/tests/input/test_small.yaml
+++ b/libc/newhdrgen/tests/input/test_small.yaml
@@ -54,4 +54,13 @@ functions:
standards:
- stdc
guard: LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128
+ - name: func_f
+ return_type: _Float16
+ arguments:
+ - type: int
+ - type: double
+ - type: float
+ standards:
+ - stdc
+
diff --git a/libc/newhdrgen/tests/test_integration.py b/libc/newhdrgen/tests/test_integration.py
index 628a37b11c309..33353785af233 100644
--- a/libc/newhdrgen/tests/test_integration.py
+++ b/libc/newhdrgen/tests/test_integration.py
@@ -8,7 +8,6 @@
class TestHeaderGenIntegration(unittest.TestCase):
def setUp(self):
-
self.output_dir = Path(
args.output_dir if args.output_dir else "libc/newhdrgen/tests/output"
)
@@ -17,19 +16,24 @@ def setUp(self):
self.source_dir = Path(__file__).resolve().parent.parent.parent.parent
- def run_script(self, yaml_file, h_def_file, output_dir):
+ def run_script(self, yaml_file, h_def_file, output_dir, entry_points):
yaml_file = self.source_dir / yaml_file
h_def_file = self.source_dir / h_def_file
+ command = [
+ "python3",
+ str(self.source_dir / "libc/newhdrgen/yaml_to_classes.py"),
+ str(yaml_file),
+ "--h_def_file",
+ str(h_def_file),
+ "--output_dir",
+ str(output_dir),
+ ]
+
+ for entry_point in entry_points:
+ command.extend(["--e", entry_point])
+
result = subprocess.run(
- [
- "python3",
- str(self.source_dir / "libc/newhdrgen/yaml_to_classes.py"),
- str(yaml_file),
- "--h_def_file",
- str(h_def_file),
- "--output_dir",
- str(output_dir),
- ],
+ command,
capture_output=True,
text=True,
)
@@ -53,11 +57,12 @@ def test_generate_header(self):
self.source_dir / "libc/newhdrgen/tests/expected_output/test_header.h"
)
output_file = self.output_dir / "test_small.h"
+ entry_points = {"func_b", "func_a", "func_c", "func_d", "func_e"}
if not self.output_dir.exists():
self.output_dir.mkdir(parents=True)
- self.run_script(yaml_file, h_def_file, self.output_dir)
+ self.run_script(yaml_file, h_def_file, self.output_dir, entry_points)
self.compare_files(output_file, expected_output_file)
>From a96c906102e8d0284c7a402eac4fa1ad9ab3e871 Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Fri, 19 Jul 2024 11:39:56 -0700
Subject: [PATCH 02/39] [lldb/Target] Add GetStartSymbol method to
DynamicLoader plugins (#99673)
This patch introduces a new method to the dynamic loader plugin, to
fetch its `start` symbol.
This can be useful, for instance, to resolve the `start` symbol address.
Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
---
lldb/include/lldb/Target/DynamicLoader.h | 9 ++++++++-
.../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 20 +++++++++++++++++++
.../MacOSX-DYLD/DynamicLoaderDarwin.h | 2 ++
3 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h
index e508d192d2759..15384245194b0 100644
--- a/lldb/include/lldb/Target/DynamicLoader.h
+++ b/lldb/include/lldb/Target/DynamicLoader.h
@@ -10,6 +10,7 @@
#define LLDB_TARGET_DYNAMICLOADER_H
#include "lldb/Core/PluginInterface.h"
+#include "lldb/Symbol/Symbol.h"
#include "lldb/Utility/FileSpec.h"
#include "lldb/Utility/Status.h"
#include "lldb/Utility/UUID.h"
@@ -24,7 +25,6 @@ namespace lldb_private {
class ModuleList;
class Process;
class SectionList;
-class Symbol;
class SymbolContext;
class SymbolContextList;
class Thread;
@@ -329,6 +329,13 @@ class DynamicLoader : public PluginInterface {
/// safe to call certain APIs or SPIs.
virtual bool IsFullyInitialized() { return true; }
+ /// Return the `start` function \b symbol in the dynamic loader module.
+ /// This is the symbol the process will begin executing with
+ /// `process launch --stop-at-entry`.
+ virtual std::optional<lldb_private::Symbol> GetStartSymbol() {
+ return std::nullopt;
+ }
+
protected:
// Utility methods for derived classes
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
index 0e17d57fa9c4f..5c6331735bde8 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
@@ -609,6 +609,26 @@ void DynamicLoaderDarwin::UpdateDYLDImageInfoFromNewImageInfo(
}
}
+std::optional<lldb_private::Symbol> DynamicLoaderDarwin::GetStartSymbol() {
+ Log *log = GetLog(LLDBLog::DynamicLoader);
+
+ auto log_err = [log](llvm::StringLiteral err_msg) -> std::nullopt_t {
+ LLDB_LOGV(log, "{}", err_msg);
+ return std::nullopt;
+ };
+
+ ModuleSP dyld_sp = GetDYLDModule();
+ if (!dyld_sp)
+ return log_err("Couldn't retrieve DYLD module. Cannot get `start` symbol.");
+
+ const Symbol *symbol =
+ dyld_sp->FindFirstSymbolWithNameAndType(ConstString("_dyld_start"));
+ if (!symbol)
+ return log_err("Cannot find `start` symbol in DYLD module.");
+
+ return *symbol;
+}
+
void DynamicLoaderDarwin::SetDYLDModule(lldb::ModuleSP &dyld_module_sp) {
m_dyld_module_wp = dyld_module_sp;
}
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
index 8f9a29c94173f..4ac55fdf6f3af 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h
@@ -67,6 +67,8 @@ class DynamicLoaderDarwin : public lldb_private::DynamicLoader {
// Clear method for classes derived from this one
virtual void DoClear() = 0;
+ std::optional<lldb_private::Symbol> GetStartSymbol() override;
+
void SetDYLDModule(lldb::ModuleSP &dyld_module_sp);
lldb::ModuleSP GetDYLDModule();
>From 9b02b75c4d035192d9f24038f70107d7ffbb0f77 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha" <mikhail at igalia.com>
Date: Fri, 19 Jul 2024 20:44:35 +0200
Subject: [PATCH 03/39] [libc] Increase test timeout (#99678)
This patch increases the timeout of the libc test from 1s to 10s, mostly because rv32 runs in a qemu image, and sometimes 1s is just not enough for the test to finish when there are other tests running in parallel.
---
libc/test/UnitTest/LibcDeathTestExecutors.cpp | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/libc/test/UnitTest/LibcDeathTestExecutors.cpp b/libc/test/UnitTest/LibcDeathTestExecutors.cpp
index e43dbb19ad152..77b0559c7acea 100644
--- a/libc/test/UnitTest/LibcDeathTestExecutors.cpp
+++ b/libc/test/UnitTest/LibcDeathTestExecutors.cpp
@@ -14,13 +14,18 @@
#include <cassert>
+namespace {
+constexpr unsigned TIMEOUT_MS = 10000;
+} // Anonymous namespace
+
namespace LIBC_NAMESPACE_DECL {
namespace testing {
bool Test::testProcessKilled(testutils::FunctionCaller *Func, int Signal,
const char *LHSStr, const char *RHSStr,
internal::Location Loc) {
- testutils::ProcessStatus Result = testutils::invoke_in_subprocess(Func, 1000);
+ testutils::ProcessStatus Result =
+ testutils::invoke_in_subprocess(Func, TIMEOUT_MS);
if (const char *error = Result.get_error()) {
Ctx->markFail();
@@ -32,7 +37,7 @@ bool Test::testProcessKilled(testutils::FunctionCaller *Func, int Signal,
if (Result.timed_out()) {
Ctx->markFail();
tlog << Loc;
- tlog << "Process timed out after " << 1000 << " milliseconds.\n";
+ tlog << "Process timed out after " << TIMEOUT_MS << " milliseconds.\n";
return false;
}
@@ -63,7 +68,8 @@ bool Test::testProcessKilled(testutils::FunctionCaller *Func, int Signal,
bool Test::testProcessExits(testutils::FunctionCaller *Func, int ExitCode,
const char *LHSStr, const char *RHSStr,
internal::Location Loc) {
- testutils::ProcessStatus Result = testutils::invoke_in_subprocess(Func, 1000);
+ testutils::ProcessStatus Result =
+ testutils::invoke_in_subprocess(Func, TIMEOUT_MS);
if (const char *error = Result.get_error()) {
Ctx->markFail();
@@ -75,7 +81,7 @@ bool Test::testProcessExits(testutils::FunctionCaller *Func, int ExitCode,
if (Result.timed_out()) {
Ctx->markFail();
tlog << Loc;
- tlog << "Process timed out after " << 1000 << " milliseconds.\n";
+ tlog << "Process timed out after " << TIMEOUT_MS << " milliseconds.\n";
return false;
}
>From 0ca98af7f96fcc5a1e16c314b3da8eb76c510207 Mon Sep 17 00:00:00 2001
From: David CARLIER <devnexen at gmail.com>
Date: Fri, 19 Jul 2024 21:22:33 +0100
Subject: [PATCH 04/39] Dump regs fbsd fix (#99676)
---
.../lib/safestack/safestack_platform.h | 2 +
.../lib/sanitizer_common/sanitizer_linux.cpp | 45 ++++++++++++++++---
.../FreeBSD/dump_registers_x86_64.cpp | 19 ++++++++
3 files changed, 59 insertions(+), 7 deletions(-)
create mode 100644 compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp
diff --git a/compiler-rt/lib/safestack/safestack_platform.h b/compiler-rt/lib/safestack/safestack_platform.h
index 68d26813bef4a..41c7c25fdaf4d 100644
--- a/compiler-rt/lib/safestack/safestack_platform.h
+++ b/compiler-rt/lib/safestack/safestack_platform.h
@@ -143,6 +143,8 @@ inline void *Mmap(void *addr, size_t length, int prot, int flags, int fd,
return __mmap(addr, length, prot, flags, fd, 0, offset);
#elif SANITIZER_FREEBSD && (defined(__aarch64__) || defined(__x86_64__))
return (void *)__syscall(SYS_mmap, addr, length, prot, flags, fd, offset);
+#elif SANITIZER_FREEBSD && (defined(__i386__))
+ return (void *)syscall(SYS_mmap, addr, length, prot, flags, fd, offset);
#elif SANITIZER_SOLARIS
return _REAL64(mmap)(addr, length, prot, flags, fd, offset);
#elif SANITIZER_LINUX_USES_64BIT_SYSCALLS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 47f9f0c4590fb..40c46a8949636 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -2121,7 +2121,8 @@ bool SignalContext::IsTrueFaultingAddress() const {
UNUSED
static const char *RegNumToRegName(int reg) {
switch (reg) {
-# if defined(__x86_64__)
+# if SANITIZER_LINUX
+# if defined(__x86_64__)
case REG_RAX:
return "rax";
case REG_RBX:
@@ -2154,7 +2155,7 @@ static const char *RegNumToRegName(int reg) {
return "r14";
case REG_R15:
return "r15";
-# elif defined(__i386__)
+# elif defined(__i386__)
case REG_EAX:
return "eax";
case REG_EBX:
@@ -2171,6 +2172,7 @@ static const char *RegNumToRegName(int reg) {
return "ebp";
case REG_ESP:
return "esp";
+# endif
# endif
default:
return NULL;
@@ -2178,22 +2180,24 @@ static const char *RegNumToRegName(int reg) {
return NULL;
}
+# if SANITIZER_LINUX
UNUSED
static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
const char *RegName = RegNumToRegName(RegNum);
-# if defined(__x86_64__)
+# if defined(__x86_64__)
Printf("%s%s = 0x%016llx ", internal_strlen(RegName) == 2 ? " " : "",
RegName, ctx->uc_mcontext.gregs[RegNum]);
-# elif defined(__i386__)
+# elif defined(__i386__)
Printf("%s = 0x%08x ", RegName, ctx->uc_mcontext.gregs[RegNum]);
-# else
+# else
(void)RegName;
-# endif
+# endif
}
+# endif
void SignalContext::DumpAllRegisters(void *context) {
-# if SANITIZER_LINUX
ucontext_t *ucontext = (ucontext_t *)context;
+# if SANITIZER_LINUX
# if defined(__x86_64__)
Report("Register values:\n");
DumpSingleReg(ucontext, REG_RAX);
@@ -2232,8 +2236,35 @@ void SignalContext::DumpAllRegisters(void *context) {
DumpSingleReg(ucontext, REG_EBP);
DumpSingleReg(ucontext, REG_ESP);
Printf("\n");
+# else
+ (void)ucontext;
# endif
+# elif SANITIZER_FREEBSD
+# if defined(__x86_64__)
+ Report("Register values:\n");
+ Printf("rax = 0x%016llx ", ucontext->uc_mcontext.mc_rax);
+ Printf("rbx = 0x%016llx ", ucontext->uc_mcontext.mc_rbx);
+ Printf("rcx = 0x%016llx ", ucontext->uc_mcontext.mc_rcx);
+ Printf("rdx = 0x%016llx ", ucontext->uc_mcontext.mc_rdx);
+ Printf("\n");
+ Printf("rdi = 0x%016llx ", ucontext->uc_mcontext.mc_rdi);
+ Printf("rsi = 0x%016llx ", ucontext->uc_mcontext.mc_rsi);
+ Printf("rbp = 0x%016llx ", ucontext->uc_mcontext.mc_rbp);
+ Printf("rsp = 0x%016llx ", ucontext->uc_mcontext.mc_rsp);
+ Printf("\n");
+ Printf(" r8 = 0x%016llx ", ucontext->uc_mcontext.mc_r8);
+ Printf(" r9 = 0x%016llx ", ucontext->uc_mcontext.mc_r9);
+ Printf("r10 = 0x%016llx ", ucontext->uc_mcontext.mc_r10);
+ Printf("r11 = 0x%016llx ", ucontext->uc_mcontext.mc_r11);
+ Printf("\n");
+ Printf("r12 = 0x%016llx ", ucontext->uc_mcontext.mc_r12);
+ Printf("r13 = 0x%016llx ", ucontext->uc_mcontext.mc_r13);
+ Printf("r14 = 0x%016llx ", ucontext->uc_mcontext.mc_r14);
+ Printf("r15 = 0x%016llx ", ucontext->uc_mcontext.mc_r15);
+ Printf("\n");
+# else
(void)ucontext;
+# endif
# endif
// FIXME: Implement this for other OSes and architectures.
}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp
new file mode 100644
index 0000000000000..3d11ef0e098f1
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_x86_64.cpp
@@ -0,0 +1,19 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: x86_64-target-arch
+
+#include <signal.h>
+
+int main() {
+ raise(SIGSEGV);
+ // CHECK-DUMP: Register values
+ // CHECK-DUMP-NEXT: rax = {{0x[0-9a-f]+}} rbx = {{0x[0-9a-f]+}} rcx = {{0x[0-9a-f]+}} rdx = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: rdi = {{0x[0-9a-f]+}} rsi = {{0x[0-9a-f]+}} rbp = {{0x[0-9a-f]+}} rsp = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: r8 = {{0x[0-9a-f]+}} r9 = {{0x[0-9a-f]+}} r10 = {{0x[0-9a-f]+}} r11 = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: r12 = {{0x[0-9a-f]+}} r13 = {{0x[0-9a-f]+}} r14 = {{0x[0-9a-f]+}} r15 = {{0x[0-9a-f]+}}
+ // CHECK-NODUMP-NOT: Register values
+ return 0;
+}
>From 49ef0a8dc2d3b8e9d7c515f5eb350d1cc0336ef0 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe at google.com>
Date: Fri, 19 Jul 2024 13:27:40 -0700
Subject: [PATCH 05/39] Revert "[NVPTX] Add Volta Load/Store Atomics (.relaxed,
.acquire, .release) and Volatile (.mmio/.volatile) support" (#99695)
Reverts llvm/llvm-project#98022
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 28 +-
llvm/lib/Target/NVPTX/NVPTX.h | 8 -
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 314 ++----
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +--
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 7 +-
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 951 ------------------
llvm/test/CodeGen/NVPTX/load-store.ll | 553 +---------
7 files changed, 157 insertions(+), 1848 deletions(-)
delete mode 100644 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index a004d64c21cc6..380d878c1f532 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -227,33 +227,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
if (Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
- if (!strcmp(Modifier, "sem")) {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::NotAtomic:
- break;
- case NVPTX::PTXLdStInstCode::Volatile:
+ if (!strcmp(Modifier, "volatile")) {
+ if (Imm)
O << ".volatile";
- break;
- case NVPTX::PTXLdStInstCode::Relaxed:
- O << ".relaxed.sys";
- break;
- case NVPTX::PTXLdStInstCode::Acquire:
- O << ".acquire.sys";
- break;
- case NVPTX::PTXLdStInstCode::Release:
- O << ".release.sys";
- break;
- case NVPTX::PTXLdStInstCode::RelaxedMMIO:
- O << ".mmio.relaxed.sys";
- break;
- default:
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << Imm
- << "\" sem modifier.";
- report_fatal_error(OS.str());
- break;
- }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3c7167b157025..b0cb24c63c3ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -107,14 +107,6 @@ enum LoadStore {
};
namespace PTXLdStInstCode {
-enum MemorySemantic {
- NotAtomic = 0, // PTX calls these: "Weak"
- Volatile = 1,
- Relaxed = 2,
- Acquire = 3,
- Release = 4,
- RelaxedMMIO = 5
-};
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 371ec8596ef63..11193c11ede3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,170 +714,6 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-static unsigned int getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
- AtomicOrdering Ordering = N->getSuccessOrdering();
- auto CodeAddrSpace = getCodeAddrSpace(N);
-
- bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
- bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
-
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
- // TODO: lowering for AcquireRelease Operations: for now, we error.
- //
-
- // clang-format off
-
- // Lowering for non-SequentiallyConsistent Operations
- //
- // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
- // |---------|----------|--------------------|------------|------------------------------|
- // | No | No | All | plain | .weak |
- // | No | Yes | Generic,Shared, | .volatile | .volatile |
- // | | | Global [0] | | |
- // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
- // | Relaxed | No | Generic,Shared, | | |
- // | | | Global [0] | .volatile | <atomic sem> |
- // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
- // | | | Global [0] | | |
- // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
- // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
- // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
- // | | | | | or .volatile (PTX 8.1-) |
- // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
- // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
- // | | | / Global [0] | | |
-
- // clang-format on
-
- // [0]: volatile and atomics are only supported on global or shared
- // memory locations, accessed via generic/shared/global pointers.
- // MMIO is only supported on global memory locations,
- // accessed via generic/global pointers.
- // TODO: Implement MMIO access via generic pointer to global.
- // Currently implemented for global pointers only.
-
- // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
- // PTX instructions fails to preserve their C++ side-effects.
- //
- // Example (https://github.com/llvm/llvm-project/issues/62057):
- //
- // void example() {
- // std::atomic<bool> True = true;
- // while (True.load(std::memory_order_relaxed));
- // }
- //
- // A C++ program that calls "example" is well-defined: the infinite loop
- // performs an atomic operation. By lowering volatile/atomics to
- // "weak" memory operations, we are transforming the above into:
- //
- // void undefined_behavior() {
- // bool True = true;
- // while (True);
- // }
- //
- // which exhibits undefined behavior in both C++ and PTX.
- //
- // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
- // behavior due to lack of Independent Forward Progress. Lowering these
- // to weak memory operations in sm_60- is therefore fine.
- //
- // TODO: lower atomic and volatile operations to memory locations
- // in local, const, and param to two PTX instructions in sm_70+:
- // - the "weak" memory instruction we are currently lowering to, and
- // - some other instruction that preserves the side-effect, e.g.,
- // a dead dummy volatile load.
-
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
- return NVPTX::PTXLdStInstCode::NotAtomic;
- }
-
- // [2]: Atomics with Ordering different than Relaxed are not supported on
- // sm_60 and older; this includes volatile atomics.
- if (!(Ordering == AtomicOrdering::NotAtomic ||
- Ordering == AtomicOrdering::Monotonic) &&
- !HasMemoryOrdering) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX does not support \"atomic\" for orderings different than"
- "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
- << toIRString(Ordering) << "\".";
- report_fatal_error(OS.str());
- }
-
- // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
- // the volatile semantics and preserve the atomic ones.
-
- // PTX volatile and PTX atomics are not available for statespace that differ
- // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
- // atomics is undefined if the generic address does not refer to a .global or
- // .shared memory location.
- bool AddrGenericOrGlobalOrShared =
- (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
- bool UseRelaxedMMIO =
- HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
-
- switch (Ordering) {
- case AtomicOrdering::NotAtomic:
- return N->isVolatile() && AddrGenericOrGlobalOrShared
- ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::Monotonic:
- if (N->isVolatile())
- return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
- else
- return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::Acquire:
- if (!N->readMem()) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports Acquire Ordering on reads: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::Release:
- if (!N->writeMem()) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports Release Ordering on writes: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::AcquireRelease: {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- case AtomicOrdering::SequentiallyConsistent:
- case AtomicOrdering::Unordered:
- // TODO: support AcquireRelease and SequentiallyConsistent
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX backend does not support AtomicOrdering \""
- << toIRString(Ordering) << "\" yet.";
- report_fatal_error(OS.str());
- }
-
- llvm_unreachable("unexpected unhandled case");
-}
-
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -1080,18 +916,32 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (!LoadedVT.isSimple())
return false;
+ AtomicOrdering Ordering = LD->getSuccessOrdering();
+ // In order to lower atomic loads with stronger guarantees we would need to
+ // use load.acquire or insert fences. However these features were only added
+ // with PTX ISA 6.0 / sm_70.
+ // TODO: Check if we can actually use the new instructions and implement them.
+ if (isStrongerThanMonotonic(Ordering))
+ return false;
+
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
-
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ // - .volatile has the same memory synchronization semantics as .relaxed.sys
+ bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
@@ -1132,13 +982,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Addr,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), Addr, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -1147,14 +993,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -1169,14 +1010,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else {
if (PointerSize == 64)
@@ -1190,13 +1026,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- N1,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), N1, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
}
@@ -1233,8 +1065,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ // Volatile Setting
+ // - .volatile is only availalble for .global and .shared
+ bool IsVolatile = MemSD->isVolatile();
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ IsVolatile = false;
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1301,13 +1138,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Addr,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Addr, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1330,14 +1163,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
@@ -1380,14 +1208,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else {
@@ -1430,13 +1253,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Op1,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Op1, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
}
@@ -1879,13 +1698,27 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!StoreVT.isSimple())
return false;
+ AtomicOrdering Ordering = ST->getSuccessOrdering();
+ // In order to lower atomic loads with stronger guarantees we would need to
+ // use store.release or insert fences. However these features were only added
+ // with PTX ISA 6.0 / sm_70.
+ // TODO: Check if we can actually use the new instructions and implement them.
+ if (isStrongerThanMonotonic(Ordering))
+ return false;
+
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ // - .volatile has the same memory synchronization semantics as .relaxed.sys
+ bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1922,7 +1755,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(isVolatile, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1939,7 +1772,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(isVolatile, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1964,7 +1797,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(isVolatile, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1986,7 +1819,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(isVolatile, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -2025,8 +1858,13 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ bool IsVolatile = MemSD->isVolatile();
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ IsVolatile = false;
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2068,7 +1906,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- StOps.push_back(getI32Imm(CodeMemorySem, DL));
+ StOps.push_back(getI32Imm(IsVolatile, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
StOps.push_back(getI32Imm(ToType, DL));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 7f1ac8688007e..cd17a9de541ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2958,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -3006,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3057,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3140,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 8df41913ff12e..3ca4c1a24c79a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -78,18 +78,13 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasLDG() const { return SmVersion >= 32; }
- bool hasHWROT32() const { return SmVersion >= 32; }
+ inline bool hasHWROT32() const { return SmVersion >= 32; }
bool hasImageHandles() const;
bool hasFP16Math() const { return SmVersion >= 53; }
bool hasBF16Math() const { return SmVersion >= 80; }
bool allowFP16Math() const;
bool hasMaskOperator() const { return PTXVersion >= 71; }
bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
- // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
- // release, acq_rel, sc) ?
- bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
- // Does SM & PTX support atomic relaxed MMIO operations ?
- bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
deleted file mode 100644
index 7cdced1778a53..0000000000000
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ /dev/null
@@ -1,951 +0,0 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
-; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
-
-; CHECK-LABEL: generic_plain
-define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
- ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr %a
-
- ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr %b
-
- ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr %c
-
- ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr %d
-
- ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr %c
-
- ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr %c
-
- ret void
-}
-
-; CHECK-LABEL: generic_volatile
-define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr %a
-
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr %b
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr %c
-
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr %d
-
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr %c
-
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr %c
-
- ret void
-}
-
-; CHECK-LABEL: generic_monotonic
-define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a monotonic, align 1
-
- ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b monotonic, align 2
-
- ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c monotonic, align 4
-
- ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d monotonic, align 8
-
- ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e monotonic, align 4
-
- ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: generic_acq_rel
-define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a release, align 1
-
- ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b release, align 2
-
- ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c release, align 4
-
- ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d release, align 8
-
- ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e release, align 4
-
- ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e release, align 8
-
- ret void
-}
-
-; CHECK-LABEL: generic_monotonic_volatile
-define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
-
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
-
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
-
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
-
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
-
- ret void
-}
-
-;; global statespace
-
-; CHECK-LABEL: global_plain
-define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(1) %a
-
- ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(1) %c
-
- ret void
-}
-
-; CHECK-LABEL: global_volatile
-define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(1) %a
-
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(1) %c
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic
-define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
-
- ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
-
- ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic_volatile
-define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
-
- ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
-
- ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_acq_rel
-define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
-
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
-
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
-
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
-
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e release, align 4
-
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e release, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_acq_rel_volatile
-define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
-
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
-
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
-
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
-
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
-
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
-
- ret void
-}
-
-;; shared statespace
-
-; CHECK-LABEL: shared_plain
-define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(3) %a
-
- ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(3) %c
-
- ret void
-}
-
-; CHECK-LABEL: shared_volatile
-define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(3) %a
-
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(3) %c
-
- ret void
-}
-
-; CHECK-LABEL: shared_monotonic
-define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
-
- ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
-
- ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
-
- ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
-
- ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
-
- ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
-
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: shared_acq_rel
-define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
-
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
-
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
-
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
-
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e release, align 4
-
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e release, align 8
-
- ret void
-}
-
-; CHECK-LABEL: shared_acq_rel_volatile
-define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
-
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
-
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
-
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
-
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
-
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
-
- ret void
-}
-
-;; local statespace
-
-; CHECK-LABEL: local_plain
-define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(5) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(5) %a
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(5) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(5) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(5) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(5) %c
-
- ret void
-}
-
-; CHECK-LABEL: local_volatile
-define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(5) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(5) %a
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(5) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(5) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(5) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(5) %c
-
- ret void
-}
-
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: local_acq_rel
-define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e release, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e release, align 8
-
- ret void
-}
-
-; CHECK-LABEL: local_acq_rel_volatile
-define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
-
- ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index 27065f5eca9f4..c477bd9e744cd 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,10 +1,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; generic statespace
-
-; CHECK-LABEL: generic_plain
-define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: plain
+define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -44,8 +42,8 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
ret void
}
-; CHECK-LABEL: generic_volatile
-define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: volatile
+define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load volatile i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -85,8 +83,8 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
ret void
}
-; CHECK-LABEL: generic_monotonic
-define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: monotonic
+define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
@@ -125,542 +123,3 @@ define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unn
ret void
}
-
-; CHECK-LABEL: generic_monotonic_volatile
-define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
-
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
-
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
-
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
-
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
-
- ret void
-}
-
-;; global statespace
-
-; CHECK-LABEL: global_plain
-define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(1) %a
-
- ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(1) %c
-
- ret void
-}
-
-; CHECK-LABEL: global_volatile
-define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(1) %a
-
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(1) %c
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic
-define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
-
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
-
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic_volatile
-define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
-
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
-
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
-
- ret void
-}
-
-;; shared statespace
-
-; CHECK-LABEL: shared_plain
-define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(3) %a
-
- ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(3) %c
-
- ret void
-}
-
-; CHECK-LABEL: shared_volatile
-define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(3) %a
-
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(3) %c
-
- ret void
-}
-
-; CHECK-LABEL: shared_monotonic
-define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
-
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
-
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
-
- ret void
-}
-
-;; local statespace
-
-; CHECK-LABEL: local_plain
-define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(5) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(5) %a
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(5) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(5) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(5) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(5) %c
-
- ret void
-}
-
-; CHECK-LABEL: local_volatile
-define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(5) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(5) %a
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(5) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(5) %b
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(5) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(5) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(5) %d
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(5) %c
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(5) %c
-
- ret void
-}
-
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
-
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
-
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
-
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
-
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
-
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
-
- ret void
-}
>From 296a9563690f2fea78e4d746a6d1896262b69da4 Mon Sep 17 00:00:00 2001
From: Shaw Young <58664393+shawbyoung at users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:00:28 -0400
Subject: [PATCH 06/39] [BOLT] Match functions with call graph (#98125)
Implemented call graph function matching. First, two call graphs are
constructed for both profiled and binary functions. Then functions are
hashed based on the names of their callee/caller functions. Finally,
functions are matched based on these neighbor hashes and the
longest common prefix of their names. The `match-with-call-graph`
flag turns this matching on.
Test Plan: Added match-with-call-graph.test. Matched 164 functions
in a large binary with 10171 profiled functions.
---
bolt/docs/CommandLineArgumentReference.md | 4 +
bolt/include/bolt/Profile/YAMLProfileReader.h | 56 +++++++
bolt/lib/Profile/YAMLProfileReader.cpp | 157 +++++++++++++++++-
.../X86/match-functions-with-call-graph.test | 103 ++++++++++++
4 files changed, 313 insertions(+), 7 deletions(-)
create mode 100644 bolt/test/X86/match-functions-with-call-graph.test
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index bd6e2ec01b53e..d2cbf0ca86449 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -686,6 +686,10 @@
threshold means fewer functions to process. E.g threshold of 90 means only top
10 percent of functions with profile will be processed.
+- `--match-with-call-graph`
+
+ Match functions with call graph
+
- `--memcpy1-spec=<func1,func2:cs1:cs2,func3:cs1,...>`
List of functions with call sites for which to specialize memcpy() for size 1
diff --git a/bolt/include/bolt/Profile/YAMLProfileReader.h b/bolt/include/bolt/Profile/YAMLProfileReader.h
index 502a1ad612bb0..bd5a86fd676a5 100644
--- a/bolt/include/bolt/Profile/YAMLProfileReader.h
+++ b/bolt/include/bolt/Profile/YAMLProfileReader.h
@@ -43,6 +43,59 @@ class YAMLProfileReader : public ProfileReaderBase {
using ProfileLookupMap =
DenseMap<uint32_t, yaml::bolt::BinaryFunctionProfile *>;
+ /// A class for matching binary functions to functions in the YAML profile.
+ /// First, a call graph is constructed for both profiled and binary functions.
+ /// Then functions are hashed based on the names of their callee/caller
+ /// functions. Finally, functions are matched based on these neighbor hashes.
+ class CallGraphMatcher {
+ public:
+ /// Constructs the call graphs for binary and profiled functions and
+ /// computes neighbor hashes for binary functions.
+ CallGraphMatcher(BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP,
+ ProfileLookupMap &IdToYAMLBF);
+
+ /// Returns the YamlBFs adjacent to the parameter YamlBF in the call graph.
+ std::optional<std::set<yaml::bolt::BinaryFunctionProfile *>>
+ getAdjacentYamlBFs(yaml::bolt::BinaryFunctionProfile &YamlBF) {
+ auto It = YamlBFAdjacencyMap.find(&YamlBF);
+ return It == YamlBFAdjacencyMap.end() ? std::nullopt
+ : std::make_optional(It->second);
+ }
+
+ /// Returns the binary functions with the parameter neighbor hash.
+ std::optional<std::vector<BinaryFunction *>>
+ getBFsWithNeighborHash(uint64_t NeighborHash) {
+ auto It = NeighborHashToBFs.find(NeighborHash);
+ return It == NeighborHashToBFs.end() ? std::nullopt
+ : std::make_optional(It->second);
+ }
+
+ private:
+ /// Adds edges to the binary function call graph given the callsites of the
+ /// parameter function.
+ void constructBFCG(BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP);
+
+ /// Using the constructed binary function call graph, computes and creates
+ /// mappings from "neighbor hash" (composed of the function names of callee
+ /// and caller functions of a function) to binary functions.
+ void computeBFNeighborHashes(BinaryContext &BC);
+
+ /// Constructs the call graph for profile functions.
+ void constructYAMLFCG(yaml::bolt::BinaryProfile &YamlBP,
+ ProfileLookupMap &IdToYAMLBF);
+
+ /// Adjacency map for binary functions in the call graph.
+ DenseMap<BinaryFunction *, std::set<BinaryFunction *>> BFAdjacencyMap;
+
+ /// Maps neighbor hashes to binary functions.
+ DenseMap<uint64_t, std::vector<BinaryFunction *>> NeighborHashToBFs;
+
+ /// Adjacency map for profile functions in the call graph.
+ DenseMap<yaml::bolt::BinaryFunctionProfile *,
+ std::set<yaml::bolt::BinaryFunctionProfile *>>
+ YamlBFAdjacencyMap;
+ };
+
private:
/// Adjustments for basic samples profiles (without LBR).
bool NormalizeByInsnCount{false};
@@ -100,6 +153,9 @@ class YAMLProfileReader : public ProfileReaderBase {
/// Matches functions using exact hash.
size_t matchWithHash(BinaryContext &BC);
+ /// Matches functions using the call graph.
+ size_t matchWithCallGraph(BinaryContext &BC);
+
/// Matches functions with similarly named profiled functions.
size_t matchWithNameSimilarity(BinaryContext &BC);
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index d9a6998bc5e9f..604a9fb4813be 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -41,6 +41,10 @@ llvm::cl::opt<bool>
MatchProfileWithFunctionHash("match-profile-with-function-hash",
cl::desc("Match profile with function hash"),
cl::Hidden, cl::cat(BoltOptCategory));
+llvm::cl::opt<bool>
+ MatchWithCallGraph("match-with-call-graph",
+ cl::desc("Match functions with call graph"), cl::Hidden,
+ cl::cat(BoltOptCategory));
llvm::cl::opt<bool> ProfileUseDFS("profile-use-dfs",
cl::desc("use DFS order for YAML profile"),
@@ -50,6 +54,69 @@ llvm::cl::opt<bool> ProfileUseDFS("profile-use-dfs",
namespace llvm {
namespace bolt {
+YAMLProfileReader::CallGraphMatcher::CallGraphMatcher(
+ BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP,
+ ProfileLookupMap &IdToYAMLBF) {
+ constructBFCG(BC, YamlBP);
+ constructYAMLFCG(YamlBP, IdToYAMLBF);
+ computeBFNeighborHashes(BC);
+}
+
+void YAMLProfileReader::CallGraphMatcher::constructBFCG(
+ BinaryContext &BC, yaml::bolt::BinaryProfile &YamlBP) {
+ for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
+ for (const BinaryBasicBlock &BB : BF->blocks()) {
+ for (const MCInst &Instr : BB) {
+ if (!BC.MIB->isCall(Instr))
+ continue;
+ const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr);
+ if (!CallSymbol)
+ continue;
+ BinaryData *BD = BC.getBinaryDataByName(CallSymbol->getName());
+ if (!BD)
+ continue;
+ BinaryFunction *CalleeBF = BC.getFunctionForSymbol(BD->getSymbol());
+ if (!CalleeBF)
+ continue;
+
+ BFAdjacencyMap[CalleeBF].insert(BF);
+ BFAdjacencyMap[BF].insert(CalleeBF);
+ }
+ }
+ }
+}
+
+void YAMLProfileReader::CallGraphMatcher::computeBFNeighborHashes(
+ BinaryContext &BC) {
+ for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
+ auto It = BFAdjacencyMap.find(BF);
+ if (It == BFAdjacencyMap.end())
+ continue;
+ auto &AdjacentBFs = It->second;
+ std::string HashStr;
+ for (BinaryFunction *BF : AdjacentBFs)
+ HashStr += BF->getOneName();
+ uint64_t Hash = std::hash<std::string>{}(HashStr);
+ NeighborHashToBFs[Hash].push_back(BF);
+ }
+}
+
+void YAMLProfileReader::CallGraphMatcher::constructYAMLFCG(
+ yaml::bolt::BinaryProfile &YamlBP, ProfileLookupMap &IdToYAMLBF) {
+
+ for (auto &CallerYamlBF : YamlBP.Functions) {
+ for (auto &YamlBB : CallerYamlBF.Blocks) {
+ for (auto &CallSite : YamlBB.CallSites) {
+ auto IdToYAMLBFIt = IdToYAMLBF.find(CallSite.DestId);
+ if (IdToYAMLBFIt == IdToYAMLBF.end())
+ continue;
+ YamlBFAdjacencyMap[&CallerYamlBF].insert(IdToYAMLBFIt->second);
+ YamlBFAdjacencyMap[IdToYAMLBFIt->second].insert(&CallerYamlBF);
+ }
+ }
+ }
+}
+
bool YAMLProfileReader::isYAML(const StringRef Filename) {
if (auto MB = MemoryBuffer::getFileOrSTDIN(Filename)) {
StringRef Buffer = (*MB)->getBuffer();
@@ -350,7 +417,7 @@ bool YAMLProfileReader::profileMatches(
}
bool YAMLProfileReader::mayHaveProfileData(const BinaryFunction &BF) {
- if (opts::MatchProfileWithFunctionHash)
+ if (opts::MatchProfileWithFunctionHash || opts::MatchWithCallGraph)
return true;
for (StringRef Name : BF.getNames())
if (ProfileFunctionNames.contains(Name))
@@ -446,6 +513,79 @@ size_t YAMLProfileReader::matchWithLTOCommonName() {
return MatchedWithLTOCommonName;
}
+size_t YAMLProfileReader::matchWithCallGraph(BinaryContext &BC) {
+ if (!opts::MatchWithCallGraph)
+ return 0;
+
+ size_t MatchedWithCallGraph = 0;
+ CallGraphMatcher CGMatcher(BC, YamlBP, IdToYamLBF);
+
+ ItaniumPartialDemangler Demangler;
+ auto GetBaseName = [&](std::string &FunctionName) {
+ if (Demangler.partialDemangle(FunctionName.c_str()))
+ return std::string("");
+ size_t BufferSize = 1;
+ char *Buffer = static_cast<char *>(std::malloc(BufferSize));
+ char *BaseName = Demangler.getFunctionBaseName(Buffer, &BufferSize);
+ if (!BaseName) {
+ std::free(Buffer);
+ return std::string("");
+ }
+ if (Buffer != BaseName)
+ Buffer = BaseName;
+ std::string BaseNameStr(Buffer, BufferSize);
+ std::free(Buffer);
+ return BaseNameStr;
+ };
+
+ // Matches YAMLBF to BFs with neighbor hashes.
+ for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
+ if (YamlBF.Used)
+ continue;
+ auto AdjacentYamlBFsOpt = CGMatcher.getAdjacentYamlBFs(YamlBF);
+ if (!AdjacentYamlBFsOpt)
+ continue;
+ std::set<yaml::bolt::BinaryFunctionProfile *> AdjacentYamlBFs =
+ AdjacentYamlBFsOpt.value();
+ std::string AdjacentYamlBFsHashStr;
+ for (auto *AdjacentYamlBF : AdjacentYamlBFs)
+ AdjacentYamlBFsHashStr += AdjacentYamlBF->Name;
+ uint64_t Hash = std::hash<std::string>{}(AdjacentYamlBFsHashStr);
+ auto BFsWithSameHashOpt = CGMatcher.getBFsWithNeighborHash(Hash);
+ if (!BFsWithSameHashOpt)
+ continue;
+ std::vector<BinaryFunction *> BFsWithSameHash = BFsWithSameHashOpt.value();
+ // Finds the binary function with the longest common prefix to the profiled
+ // function and matches.
+ BinaryFunction *ClosestBF = nullptr;
+ size_t LCP = 0;
+ std::string YamlBFBaseName = GetBaseName(YamlBF.Name);
+ for (BinaryFunction *BF : BFsWithSameHash) {
+ if (ProfiledFunctions.count(BF))
+ continue;
+ std::string BFName = std::string(BF->getOneName());
+ std::string BFBaseName = GetBaseName(BFName);
+ size_t PrefixLength = 0;
+ size_t N = std::min(YamlBFBaseName.size(), BFBaseName.size());
+ for (size_t I = 0; I < N; ++I) {
+ if (YamlBFBaseName[I] != BFBaseName[I])
+ break;
+ ++PrefixLength;
+ }
+ if (PrefixLength >= LCP) {
+ LCP = PrefixLength;
+ ClosestBF = BF;
+ }
+ }
+ if (ClosestBF) {
+ matchProfileToFunction(YamlBF, *ClosestBF);
+ ++MatchedWithCallGraph;
+ }
+ }
+
+ return MatchedWithCallGraph;
+}
+
size_t YAMLProfileReader::matchWithNameSimilarity(BinaryContext &BC) {
if (opts::NameSimilarityFunctionMatchingThreshold == 0)
return 0;
@@ -581,9 +721,14 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
}
}
+ // Map profiled function ids to names.
+ for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
+ IdToYamLBF[YamlBF.Id] = &YamlBF;
+
const size_t MatchedWithExactName = matchWithExactName();
const size_t MatchedWithHash = matchWithHash(BC);
const size_t MatchedWithLTOCommonName = matchWithLTOCommonName();
+ const size_t MatchedWithCallGraph = matchWithCallGraph(BC);
const size_t MatchedWithNameSimilarity = matchWithNameSimilarity(BC);
for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs))
@@ -603,6 +748,8 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
<< " functions with hash\n";
outs() << "BOLT-INFO: matched " << MatchedWithLTOCommonName
<< " functions with matching LTO common names\n";
+ outs() << "BOLT-INFO: matched " << MatchedWithCallGraph
+ << " functions with call graph\n";
outs() << "BOLT-INFO: matched " << MatchedWithNameSimilarity
<< " functions with similar names\n";
}
@@ -610,11 +757,6 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
// Set for parseFunctionProfile().
NormalizeByInsnCount = usesEvent("cycles") || usesEvent("instructions");
NormalizeByCalls = usesEvent("branches");
-
- // Map profiled function ids to names.
- for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions)
- IdToYamLBF[YamlBF.Id] = &YamlBF;
-
uint64_t NumUnused = 0;
for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) {
if (YamlBF.Id >= YamlProfileToFunction.size()) {
@@ -630,7 +772,8 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) {
BC.setNumUnusedProfiledObjects(NumUnused);
- if (opts::Lite && opts::MatchProfileWithFunctionHash) {
+ if (opts::Lite &&
+ (opts::MatchProfileWithFunctionHash || opts::MatchWithCallGraph)) {
for (BinaryFunction *BF : BC.getAllBinaryFunctions())
if (!BF->hasProfile())
BF->setIgnored();
diff --git a/bolt/test/X86/match-functions-with-call-graph.test b/bolt/test/X86/match-functions-with-call-graph.test
new file mode 100644
index 0000000000000..e826c57f35312
--- /dev/null
+++ b/bolt/test/X86/match-functions-with-call-graph.test
@@ -0,0 +1,103 @@
+## Tests blocks matching by called function names in inferStaleProfile.
+
+# REQUIRES: system-linux
+# RUN: split-file %s %t
+# RUN: %clang %cflags %t/main.cpp -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml --profile-ignore-hash -v=1 \
+# RUN: --dyno-stats --print-cfg --infer-stale-profile=1 --match-with-call-graph 2>&1 | FileCheck %s
+
+# CHECK: BOLT-INFO: matched 1 functions with call graph
+
+#--- main.cpp
+void foo() {}
+
+void bar() {}
+
+void qux() {
+ foo();
+ bar();
+}
+
+void fred() {
+ foo();
+ qux();
+ bar();
+ bar();
+ foo();
+}
+
+int main() {
+ return 0;
+}
+
+#--- yaml
+---
+header:
+ profile-version: 1
+ binary-name: 'match-functions-with-calls-as-anchors.s.tmp.exe'
+ binary-build-id: '<unknown>'
+ profile-flags: [ lbr ]
+ profile-origin: branch profile reader
+ profile-events: ''
+ dfs-order: false
+ hash-func: xxh3
+functions:
+ - name: main
+ fid: 0
+ hash: 0x0000000000000001
+ exec: 1
+ nblocks: 6
+ blocks:
+ - bid: 1
+ hash: 0x0000000000000001
+ insns: 1
+ succ: [ { bid: 3, cnt: 1} ]
+ - name: _Z3foov
+ fid: 1
+ hash: 0x0000000000000002
+ exec: 1
+ nblocks: 6
+ blocks:
+ - bid: 1
+ hash: 0x0000000000000002
+ insns: 1
+ succ: [ { bid: 3, cnt: 1} ]
+
+ - name: _Z3barv
+ fid: 2
+ hash: 0x0000000000000003
+ exec: 1
+ nblocks: 6
+ blocks:
+ - bid: 1
+ hash: 0x0000000000000003
+ insns: 1
+ succ: [ { bid: 3, cnt: 1} ]
+ - name: _Z3quxv
+ fid: 3
+ hash: 0x0000000000000004
+ exec: 4
+ nblocks: 6
+ blocks:
+ - bid: 1
+ hash: 0x0000000000000004
+ insns: 1
+ succ: [ { bid: 3, cnt: 1} ]
+ calls: [ { off : 0, fid : 1, cnt : 0},
+ { off : 0, fid : 2, cnt : 0} ]
+ - name: _Z4bazv
+ fid: 4
+ hash: 0x0000000000000005
+ exec: 1
+ nblocks: 6
+ blocks:
+ - bid: 1
+ hash: 0x0000000000000005
+ insns: 1
+ succ: [ { bid: 3, cnt: 1} ]
+ calls: [ { off : 0, fid : 3, cnt : 0},
+ { off : 0, fid : 1, cnt : 0},
+ { off : 0, fid : 2, cnt : 0},
+ { off : 0, fid : 1, cnt : 0},
+ { off : 0, fid : 2, cnt : 0} ]
+...
>From ffda5212282b0b3ff943690738b314e3ac5254c3 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Fri, 19 Jul 2024 16:00:40 -0500
Subject: [PATCH 07/39] [libc++] Make libc++ forward-compatible with
AppleClang's definition of __builtin_verbose_trap (#99529)
AppleClang as included in the Xcode 16 beta implements
`__builtin_verbose_trap`, but it implements slightly different semantics
from the ones implemented upstream. This patch makes libc++ compatible
with either definition until we drop support for the version of
AppleClang that differs from upstream.
---
libcxx/vendor/llvm/default_assertion_handler.in | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in
index 9bd831c227798..3b6d6b2cca53c 100644
--- a/libcxx/vendor/llvm/default_assertion_handler.in
+++ b/libcxx/vendor/llvm/default_assertion_handler.in
@@ -24,7 +24,13 @@
#else
# if __has_builtin(__builtin_verbose_trap)
-# define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap("libc++", message)
+// AppleClang shipped a slightly different version of __builtin_verbose_trap from the upstream
+// version before upstream Clang actually got the builtin.
+# if defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 17000
+# define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap(message)
+# else
+# define _LIBCPP_ASSERTION_HANDLER(message) __builtin_verbose_trap("libc++", message)
+# endif
# else
# define _LIBCPP_ASSERTION_HANDLER(message) ((void)message, __builtin_trap())
# endif
>From d54ec64f6743dee346bd22298577edf1985ce0b8 Mon Sep 17 00:00:00 2001
From: Sayhaan Siddiqui <49014204+sayhaan at users.noreply.github.com>
Date: Fri, 19 Jul 2024 14:03:50 -0700
Subject: [PATCH 08/39] [BOLT][DWARF] Remove deprecated opt (#99575)
Remove deprecated DeterministicDebugInfo option and its uses.
---
bolt/docs/CommandLineArgumentReference.md | 5 ---
bolt/lib/Rewrite/DWARFRewriter.cpp | 37 +++++------------------
2 files changed, 7 insertions(+), 35 deletions(-)
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index d2cbf0ca86449..ea51b77d838a1 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -113,11 +113,6 @@
Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched.
-- `--deterministic-debuginfo`
-
- Disables parallel execution of tasks that may produce nondeterministic debug
- info
-
- `--dot-tooltip-code`
Add basic block instructions as tool tips on nodes
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 4ba6344925856..1ec216b39e95c 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -326,12 +326,6 @@ static cl::opt<bool> KeepARanges(
"keep or generate .debug_aranges section if .gdb_index is written"),
cl::Hidden, cl::cat(BoltCategory));
-static cl::opt<bool> DeterministicDebugInfo(
- "deterministic-debuginfo",
- cl::desc("disables parallel execution of tasks that may produce "
- "nondeterministic debug info"),
- cl::init(true), cl::cat(BoltCategory));
-
static cl::opt<std::string> DwarfOutputPath(
"dwarf-output-path",
cl::desc("Path to where .dwo files or dwp file will be written out to."),
@@ -607,11 +601,6 @@ void DWARFRewriter::updateDebugInfo() {
StrWriter = std::make_unique<DebugStrWriter>(*BC.DwCtx, false);
StrOffstsWriter = std::make_unique<DebugStrOffsetsWriter>(BC);
- if (!opts::DeterministicDebugInfo) {
- opts::DeterministicDebugInfo = true;
- errs() << "BOLT-WARNING: --deterministic-debuginfo is being deprecated\n";
- }
-
/// Stores and serializes information that will be put into the
/// .debug_addr DWARF section.
std::unique_ptr<DebugAddrWriter> FinalAddrWriter;
@@ -759,25 +748,13 @@ void DWARFRewriter::updateDebugInfo() {
CUOffsetMap OffsetMap =
finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection);
- const bool SingleThreadedMode =
- opts::NoThreads || opts::DeterministicDebugInfo;
- if (!SingleThreadedMode)
- DIEBlder.buildCompileUnits();
- if (SingleThreadedMode) {
- CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
- for (std::vector<DWARFUnit *> &Vec : PartVec) {
- DIEBlder.buildCompileUnits(Vec);
- for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
- processUnitDIE(CU, &DIEBlder);
- finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
- DIEBlder.getProcessedCUs(), *FinalAddrWriter);
- }
- } else {
- // Update unit debug info in parallel
- ThreadPoolInterface &ThreadPool = ParallelUtilities::getThreadPool();
- for (std::unique_ptr<DWARFUnit> &CU : BC.DwCtx->compile_units())
- ThreadPool.async(processUnitDIE, CU.get(), &DIEBlder);
- ThreadPool.wait();
+ CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
+ for (std::vector<DWARFUnit *> &Vec : PartVec) {
+ DIEBlder.buildCompileUnits(Vec);
+ for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
+ processUnitDIE(CU, &DIEBlder);
+ finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
+ DIEBlder.getProcessedCUs(), *FinalAddrWriter);
}
DebugNamesTable.emitAccelTable();
>From 63625f4406ac9ed419d72f11e701ec94dd72e317 Mon Sep 17 00:00:00 2001
From: vporpo <vporpodas at google.com>
Date: Fri, 19 Jul 2024 14:11:51 -0700
Subject: [PATCH 09/39] [SandboxIR] Implement LoadInst (#99597)
This patch implements a `LoadInst` instruction in SandboxIR. It mirrors
`llvm::LoadInst`.
---
llvm/include/llvm/SandboxIR/SandboxIR.h | 66 ++++++++++++++++--
.../llvm/SandboxIR/SandboxIRValues.def | 1 +
llvm/lib/SandboxIR/SandboxIR.cpp | 69 +++++++++++++++++--
llvm/unittests/SandboxIR/SandboxIRTest.cpp | 31 +++++++++
4 files changed, 156 insertions(+), 11 deletions(-)
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index a9f0177eb9338..f168fdf8b1056 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -59,6 +59,7 @@
#define LLVM_SANDBOXIR_SANDBOXIR_H
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/SandboxIR/Tracker.h"
@@ -74,6 +75,7 @@ class BasicBlock;
class Context;
class Function;
class Instruction;
+class LoadInst;
class User;
class Value;
@@ -170,9 +172,10 @@ class Value {
/// order.
llvm::Value *Val = nullptr;
- friend class Context; // For getting `Val`.
- friend class User; // For getting `Val`.
- friend class Use; // For getting `Val`.
+ friend class Context; // For getting `Val`.
+ friend class User; // For getting `Val`.
+ friend class Use; // For getting `Val`.
+ friend class LoadInst; // For getting `Val`.
/// All values point to the context.
Context &Ctx;
@@ -262,11 +265,14 @@ class Value {
llvm::function_ref<bool(const Use &)> ShouldReplace);
void replaceAllUsesWith(Value *Other);
+ /// \Returns the LLVM IR name of the bottom-most LLVM value.
+ StringRef getName() const { return Val->getName(); }
+
#ifndef NDEBUG
/// Should crash if there is something wrong with the instruction.
virtual void verify() const = 0;
- /// Returns the name in the form 'SB<number>.' like 'SB1.'
- std::string getName() const;
+ /// Returns the unique id in the form 'SB<number>.' like 'SB1.'
+ std::string getUid() const;
virtual void dumpCommonHeader(raw_ostream &OS) const;
void dumpCommonFooter(raw_ostream &OS) const;
void dumpCommonPrefix(raw_ostream &OS) const;
@@ -489,6 +495,7 @@ class Instruction : public sandboxir::User {
/// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This
/// returns its topmost LLVM IR instruction.
llvm::Instruction *getTopmostLLVMInstruction() const;
+ friend class LoadInst; // For getTopmostLLVMInstruction().
/// \Returns the LLVM IR Instructions that this SandboxIR maps to in program
/// order.
@@ -553,6 +560,45 @@ class Instruction : public sandboxir::User {
#endif
};
+class LoadInst final : public Instruction {
+ /// Use LoadInst::create() instead of calling the constructor.
+ LoadInst(llvm::LoadInst *LI, Context &Ctx)
+ : Instruction(ClassID::Load, Opcode::Load, LI, Ctx) {}
+ friend Context; // for LoadInst()
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
+ }
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
+ }
+
+public:
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+ static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name = "");
+ static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &Name = "");
+ /// For isa/dyn_cast.
+ static bool classof(const Value *From);
+ Value *getPointerOperand() const;
+ Align getAlign() const { return cast<llvm::LoadInst>(Val)->getAlign(); }
+ bool isUnordered() const { return cast<llvm::LoadInst>(Val)->isUnordered(); }
+ bool isSimple() const { return cast<llvm::LoadInst>(Val)->isSimple(); }
+#ifndef NDEBUG
+ void verify() const final {
+ assert(isa<llvm::LoadInst>(Val) && "Expected LoadInst!");
+ }
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
/// An LLLVM Instruction that has no SandboxIR equivalent class gets mapped to
/// an OpaqueInstr.
class OpaqueInst : public sandboxir::Instruction {
@@ -683,8 +729,16 @@ class Context {
friend class BasicBlock; // For getOrCreateValue().
+ IRBuilder<ConstantFolder> LLVMIRBuilder;
+ auto &getLLVMIRBuilder() { return LLVMIRBuilder; }
+
+ LoadInst *createLoadInst(llvm::LoadInst *LI);
+ friend LoadInst; // For createLoadInst()
+
public:
- Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this) {}
+ Context(LLVMContext &LLVMCtx)
+ : LLVMCtx(LLVMCtx), IRTracker(*this),
+ LLVMIRBuilder(LLVMCtx, ConstantFolder()) {}
Tracker &getTracker() { return IRTracker; }
/// Convenience function for `getTracker().save()`
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index b090ade3ea0ca..e1ed3cdac6bba 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -25,6 +25,7 @@ DEF_USER(Constant, Constant)
#endif
// ClassID, Opcode(s), Class
DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
+DEF_INSTR(Load, OP(Load), LoadInst)
#ifdef DEF_VALUE
#undef DEF_VALUE
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 87134995a1538..f392704a6d27e 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -140,14 +140,14 @@ void Value::replaceAllUsesWith(Value *Other) {
}
#ifndef NDEBUG
-std::string Value::getName() const {
+std::string Value::getUid() const {
std::stringstream SS;
SS << "SB" << UID << ".";
return SS.str();
}
void Value::dumpCommonHeader(raw_ostream &OS) const {
- OS << getName() << " " << getSubclassIDStr(SubclassID) << " ";
+ OS << getUid() << " " << getSubclassIDStr(SubclassID) << " ";
}
void Value::dumpCommonFooter(raw_ostream &OS) const {
@@ -167,7 +167,7 @@ void Value::dumpCommonPrefix(raw_ostream &OS) const {
}
void Value::dumpCommonSuffix(raw_ostream &OS) const {
- OS << " ; " << getName() << " (" << getSubclassIDStr(SubclassID) << ")";
+ OS << " ; " << getUid() << " (" << getSubclassIDStr(SubclassID) << ")";
}
void Value::printAsOperandCommon(raw_ostream &OS) const {
@@ -453,6 +453,49 @@ void Instruction::dump() const {
dump(dbgs());
dbgs() << "\n";
}
+#endif // NDEBUG
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name) {
+ llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ Builder.SetInsertPoint(BeforeIR);
+ auto *NewLI = Builder.CreateAlignedLoad(Ty, Ptr->Val, Align,
+ /*isVolatile=*/false, Name);
+ auto *NewSBI = Ctx.createLoadInst(NewLI);
+ return NewSBI;
+}
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &Name) {
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
+ auto *NewLI = Builder.CreateAlignedLoad(Ty, Ptr->Val, Align,
+ /*isVolatile=*/false, Name);
+ auto *NewSBI = Ctx.createLoadInst(NewLI);
+ return NewSBI;
+}
+
+bool LoadInst::classof(const Value *From) {
+ return From->getSubclassID() == ClassID::Load;
+}
+
+Value *LoadInst::getPointerOperand() const {
+ return Ctx.getValue(cast<llvm::LoadInst>(Val)->getPointerOperand());
+}
+
+#ifndef NDEBUG
+void LoadInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void LoadInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
void OpaqueInst::dump(raw_ostream &OS) const {
dumpCommonPrefix(OS);
@@ -538,8 +581,8 @@ Value *Context::registerValue(std::unique_ptr<Value> &&VPtr) {
assert(VPtr->getSubclassID() != Value::ClassID::User &&
"Can't register a user!");
Value *V = VPtr.get();
- llvm::Value *Key = V->Val;
- LLVMValueToValueMap[Key] = std::move(VPtr);
+ auto Pair = LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
+ assert(Pair.second && "Already exists!");
return V;
}
@@ -568,6 +611,17 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
return nullptr;
}
assert(isa<llvm::Instruction>(LLVMV) && "Expected Instruction");
+
+ switch (cast<llvm::Instruction>(LLVMV)->getOpcode()) {
+ case llvm::Instruction::Load: {
+ auto *LLVMLd = cast<llvm::LoadInst>(LLVMV);
+ It->second = std::unique_ptr<LoadInst>(new LoadInst(LLVMLd, *this));
+ return It->second.get();
+ }
+ default:
+ break;
+ }
+
It->second = std::unique_ptr<OpaqueInst>(
new OpaqueInst(cast<llvm::Instruction>(LLVMV), *this));
return It->second.get();
@@ -582,6 +636,11 @@ BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) {
return BB;
}
+LoadInst *Context::createLoadInst(llvm::LoadInst *LI) {
+ auto NewPtr = std::unique_ptr<LoadInst>(new LoadInst(LI, *this));
+ return cast<LoadInst>(registerValue(std::move(NewPtr)));
+}
+
Value *Context::getValue(llvm::Value *V) const {
auto It = LLVMValueToValueMap.find(V);
if (It != LLVMValueToValueMap.end())
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index ec68ed0afeb2f..04beb429502bc 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -560,3 +560,34 @@ define void @foo(i8 %v1) {
EXPECT_EQ(I0->getNumUses(), 0u);
EXPECT_EQ(I0->getNextNode(), Ret);
}
+
+TEST_F(SandboxIRTest, LoadInst) {
+ parseIR(C, R"IR(
+define void @foo(ptr %arg0, ptr %arg1) {
+ %ld = load i8, ptr %arg0, align 64
+ ret void
+}
+)IR");
+ llvm::Function *LLVMF = &*M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ sandboxir::Function *F = Ctx.createFunction(LLVMF);
+ auto *Arg0 = F->getArg(0);
+ auto *Arg1 = F->getArg(1);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+ auto *Ld = cast<sandboxir::LoadInst>(&*It++);
+ auto *Ret = &*It++;
+
+ // Check getPointerOperand()
+ EXPECT_EQ(Ld->getPointerOperand(), Arg0);
+ // Check getAlign()
+ EXPECT_EQ(Ld->getAlign(), 64);
+ // Check create(InsertBefore)
+ sandboxir::LoadInst *NewLd =
+ sandboxir::LoadInst::create(Ld->getType(), Arg1, Align(8),
+ /*InsertBefore=*/Ret, Ctx, "NewLd");
+ EXPECT_EQ(NewLd->getType(), Ld->getType());
+ EXPECT_EQ(NewLd->getPointerOperand(), Arg1);
+ EXPECT_EQ(NewLd->getAlign(), 8);
+ EXPECT_EQ(NewLd->getName(), "NewLd");
+}
>From 2b371003d101ccc50081c08c7ed5fd7dcbfdef71 Mon Sep 17 00:00:00 2001
From: Angel Zhang <angel.zhang at amd.com>
Date: Fri, 19 Jul 2024 17:12:33 -0400
Subject: [PATCH 10/39] [mlir][docs] Update documentation for
`vector.multi_reduction`. NFC (#99668)
---
mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 2019eb5a9fd7f..39ad03c801140 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -300,9 +300,9 @@ def Vector_MultiDimReductionOp :
```mlir
%1 = vector.multi_reduction <add>, %0, %acc0 [1, 3] :
- vector<4x8x16x32xf32> into vector<4x16xf32>
+ vector<4x8x16x32xf32> to vector<4x16xf32>
%2 = vector.multi_reduction <add>, %1, %acc1 [0, 1] :
- vector<4x16xf32> into f32
+ vector<4x16xf32> to f32
```
}];
let builders = [
>From 9d03275550d33636a066f84ee3aab81ad1339637 Mon Sep 17 00:00:00 2001
From: Tacet <4922191+AdvenamTacet at users.noreply.github.com>
Date: Fri, 19 Jul 2024 23:24:10 +0200
Subject: [PATCH 11/39] [ASan][libc++] Turn off SSO annotations for Apple
platforms (#96269)
This commit disables short string AddressSanitizer annotations on Apple
platforms as a temporary solution to the problem reported in #96099.
For more information on Apple's block implementation, please refer to
clang/docs/Block-ABI-Apple.rst [1]. The core issue lies in the fact
that blocks are unaware of their content, causing AddressSanitizer
errors when blocks are moved using `memmove`.
I believe - and I'm not alone - that the issue should ideally be
addressed within the block moving logic. However, this patch provides
a temporary fix until a proper resolution exists in the Blocks runtime.
[1]: https://github.com/llvm/llvm-project/blob/main/clang/docs/Block-ABI-Apple.rst
---
libcxx/include/string | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/libcxx/include/string b/libcxx/include/string
index 54e9d8990c220..2fd1b1e745908 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -1983,6 +1983,11 @@ private:
(void)__old_mid;
(void)__new_mid;
#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
+ #if defined(__APPLE__)
+ // TODO: remove after addressing issue #96099 (https://github.com/llvm/llvm-project/issues/96099)
+ if(!__is_long())
+ return;
+ #endif
std::__annotate_contiguous_container<_Allocator>(data(), data() + capacity() + 1, __old_mid, __new_mid);
#endif
}
>From 837d606458f014ceed1b5d4504909f32b83362a8 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue at users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:24:40 -0400
Subject: [PATCH 12/39] [libc] Temporarily disable hypotf sNaN tests for NVPTX
targets. (#99708)
https://lab.llvm.org/buildbot/#/builders/101/builds/2269
---
libc/test/src/math/smoke/CMakeLists.txt | 2 ++
libc/test/src/math/smoke/HypotTest.h | 6 ++++++
2 files changed, 8 insertions(+)
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 9384df8f5290d..a02648ea1018b 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -2738,6 +2738,7 @@ add_fp_unittest(
DEPENDS
libc.src.math.hypotf
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.properties.architectures
)
add_fp_unittest(
@@ -2751,6 +2752,7 @@ add_fp_unittest(
DEPENDS
libc.src.math.hypot
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.properties.architectures
)
add_fp_unittest(
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
index 7921d531a3196..d7c62dcbeb0ed 100644
--- a/libc/test/src/math/smoke/HypotTest.h
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
+#include "src/__support/macros/properties/architectures.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
@@ -25,8 +26,13 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
// Pythagorean triples.
constexpr T PYT[N][3] = {{3, 4, 5}, {5, 12, 13}, {8, 15, 17}, {7, 24, 25}};
+#ifndef LIBC_TARGET_ARCH_IS_NVPTX
+ // TODO: Investigate why sNaN tests are failing on nVidia.
+ // https://github.com/llvm/llvm-project/issues/99706.
EXPECT_FP_EQ(func(inf, sNaN), aNaN);
EXPECT_FP_EQ(func(sNaN, neg_inf), aNaN);
+#endif // !LIBC_TARGET_ARCH_IS_NVPTX
+
EXPECT_FP_EQ(func(inf, aNaN), inf);
EXPECT_FP_EQ(func(aNaN, neg_inf), inf);
EXPECT_FP_EQ(func(aNaN, aNaN), aNaN);
>From 9e74f6600c37131f90736124f7ac4c0901a1746a Mon Sep 17 00:00:00 2001
From: David CARLIER <devnexen at gmail.com>
Date: Fri, 19 Jul 2024 22:38:01 +0100
Subject: [PATCH 13/39] [compiler-rt] dump registers for FreeBSD/i386 (#99702)
---
.../lib/sanitizer_common/sanitizer_linux.cpp | 12 ++++++++++++
.../TestCases/FreeBSD/dump_registers_i386.cpp | 17 +++++++++++++++++
2 files changed, 29 insertions(+)
create mode 100644 compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 40c46a8949636..483a1042a6238 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -2262,6 +2262,18 @@ void SignalContext::DumpAllRegisters(void *context) {
Printf("r14 = 0x%016llx ", ucontext->uc_mcontext.mc_r14);
Printf("r15 = 0x%016llx ", ucontext->uc_mcontext.mc_r15);
Printf("\n");
+# elif defined(__i386__)
+ Report("Register values:\n");
+ Printf("eax = 0x%08x ", ucontext->uc_mcontext.mc_eax);
+ Printf("ebx = 0x%08x ", ucontext->uc_mcontext.mc_ebx);
+ Printf("ecx = 0x%08x ", ucontext->uc_mcontext.mc_ecx);
+ Printf("edx = 0x%08x ", ucontext->uc_mcontext.mc_edx);
+ Printf("\n");
+ Printf("edi = 0x%08x ", ucontext->uc_mcontext.mc_edi);
+ Printf("esi = 0x%08x ", ucontext->uc_mcontext.mc_esi);
+ Printf("ebp = 0x%08x ", ucontext->uc_mcontext.mc_ebp);
+ Printf("esp = 0x%08x ", ucontext->uc_mcontext.mc_esp);
+ Printf("\n");
# else
(void)ucontext;
# endif
diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp
new file mode 100644
index 0000000000000..74aea4d8b360a
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/dump_registers_i386.cpp
@@ -0,0 +1,17 @@
+// Check that sanitizer prints registers dump_registers on dump_registers=1
+// RUN: %clangxx %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: i386-target-arch
+
+#include <signal.h>
+
+int main() {
+ raise(SIGSEGV);
+ // CHECK-DUMP: Register values
+ // CHECK-DUMP-NEXT: eax = {{0x[0-9a-f]+}} ebx = {{0x[0-9a-f]+}} ecx = {{0x[0-9a-f]+}} edx = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: edi = {{0x[0-9a-f]+}} esi = {{0x[0-9a-f]+}} ebp = {{0x[0-9a-f]+}} esp = {{0x[0-9a-f]+}}
+ // CHECK-NODUMP-NOT: Register values
+ return 0;
+}
>From 84658fb82b67fc22ecba1560d0cddd09f9104178 Mon Sep 17 00:00:00 2001
From: Edd Dawson <edd.dawson at sony.com>
Date: Fri, 19 Jul 2024 22:38:16 +0100
Subject: [PATCH 14/39] =?UTF-8?q?Revert=20"[PS4/PS5][Driver][DWARF]=20Alwa?=
=?UTF-8?q?ys=20emit=20.debug=5Faranges=20for=20SCE=20t=E2=80=A6=20(#99711?=
=?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
…uning (#99629)"
This reverts commit 22eb290a9696e2a3fd042096c61e35eca2fcce0c.
---
clang/lib/Driver/ToolChains/Clang.cpp | 7 ++--
clang/lib/Driver/ToolChains/PS4CPU.cpp | 8 +++++
clang/test/Driver/debug-options.c | 21 ++++++------
clang/test/Driver/lto-jobs.c | 2 +-
clang/test/Driver/ps4-linker.c | 8 ++---
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 5 +--
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 3 --
.../DebugInfo/X86/debug-aranges-sce-tuning.ll | 33 -------------------
8 files changed, 30 insertions(+), 57 deletions(-)
delete mode 100644 llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.ll
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index abe517f0f5dea..f7b987bf810c1 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4659,8 +4659,11 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
// -gdwarf-aranges turns on the emission of the aranges section in the
// backend.
- if (const Arg *A = Args.getLastArg(options::OPT_gdwarf_aranges);
- A && checkDebugInfoOption(A, Args, D, TC)) {
+ // Always enabled for SCE tuning.
+ bool NeedAranges = DebuggerTuning == llvm::DebuggerKind::SCE;
+ if (const Arg *A = Args.getLastArg(options::OPT_gdwarf_aranges))
+ NeedAranges = checkDebugInfoOption(A, Args, D, TC) || NeedAranges;
+ if (NeedAranges) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-generate-arange-section");
}
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index d6af9388e54a6..974e486a0082b 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -162,6 +162,10 @@ void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
};
if (UseLTO) {
+ // We default to creating the arange section, but LTO does not. Enable it
+ // here.
+ AddCodeGenFlag("-generate-arange-section");
+
// This tells LTO to perform JustMyCode instrumentation.
if (UseJMC)
AddCodeGenFlag("-enable-jmc-instrument");
@@ -268,6 +272,10 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
};
if (UseLTO) {
+ // We default to creating the arange section, but LTO does not. Enable it
+ // here.
+ AddCodeGenFlag("-generate-arange-section");
+
// This tells LTO to perform JustMyCode instrumentation.
if (UseJMC)
AddCodeGenFlag("-enable-jmc-instrument");
diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c
index 21785ba01cb41..0a665f7017d63 100644
--- a/clang/test/Driver/debug-options.c
+++ b/clang/test/Driver/debug-options.c
@@ -118,28 +118,27 @@
// RUN: %clang_cl -### -c -Z7 -target x86_64-windows-msvc -- %s 2>&1 \
// RUN: | FileCheck -check-prefix=G_NOTUNING %s
-// On the PS4/PS5, -g defaults to -gno-column-info. We default to always
-// generating the arange section, but keyed off SCE DebuggerTuning being in
-// play during codegen, instead of -generate-arange-section.
+// On the PS4/PS5, -g defaults to -gno-column-info, and we always generate the
+// arange section.
// RUN: %clang -### -c %s -target x86_64-scei-ps4 2>&1 \
// RUN: | FileCheck -check-prefix=NOG_PS %s
// RUN: %clang -### -c %s -target x86_64-sie-ps5 2>&1 \
// RUN: | FileCheck -check-prefix=NOG_PS %s
/// PS4 will stay on v4 even if the generic default version changes.
// RUN: %clang -### -c %s -g -target x86_64-scei-ps4 2>&1 \
-// RUN: | FileCheck -check-prefixes=G_DWARF4,G_SCE,NOCI,FWD_TMPL_PARAMS %s
+// RUN: | FileCheck -check-prefixes=G_DWARF4,GARANGE,G_SCE,NOCI,FWD_TMPL_PARAMS %s
// RUN: %clang -### -c %s -g -target x86_64-sie-ps5 2>&1 \
-// RUN: | FileCheck -check-prefixes=G_DWARF5,G_SCE,NOCI,FWD_TMPL_PARAMS %s
+// RUN: | FileCheck -check-prefixes=G_DWARF5,GARANGE,G_SCE,NOCI,FWD_TMPL_PARAMS %s
// RUN: %clang -### -c %s -g -gcolumn-info -target x86_64-scei-ps4 2>&1 \
// RUN: | FileCheck -check-prefix=CI %s
// RUN: %clang -### -c %s -gsce -target x86_64-unknown-linux 2>&1 \
// RUN: | FileCheck -check-prefix=NOCI %s
// RUN: %clang -### %s -g -flto=thin -target x86_64-scei-ps4 2>&1 \
-// RUN: | FileCheck -check-prefix=LDGARANGE %s
+// RUN: | FileCheck -check-prefix=SNLDTLTOGARANGE %s
// RUN: %clang -### %s -g -flto=full -target x86_64-scei-ps4 2>&1 \
-// RUN: | FileCheck -check-prefix=LDGARANGE %s
+// RUN: | FileCheck -check-prefix=SNLDFLTOGARANGE %s
// RUN: %clang -### %s -g -flto -target x86_64-scei-ps5 2>&1 \
-// RUN: | FileCheck -check-prefix=LDGARANGE %s
+// RUN: | FileCheck -check-prefix=LLDGARANGE %s
// RUN: %clang -### %s -g -target x86_64-scei-ps5 2>&1 \
// RUN: | FileCheck -check-prefix=LDGARANGE %s
@@ -322,7 +321,8 @@
//
// NOG_PS: "-cc1"
// NOG_PS-NOT: "-dwarf-version=
-// NOG_PS-NOT: "-generate-arange-section"
+// NOG_PS: "-generate-arange-section"
+// NOG_PS-NOT: "-dwarf-version=
//
// G_ERR: error: unknown argument:
//
@@ -402,7 +402,8 @@
//
// LDGARANGE: {{".*ld.*"}} {{.*}}
-// LDGARANGE-NOT: -generate-arange-section"
+// LDGARANGE-NOT: "-plugin-opt=-generate-arange-section"
+// LLDGARANGE: {{".*lld.*"}} {{.*}} "-plugin-opt=-generate-arange-section"
// SNLDTLTOGARANGE: {{".*orbis-ld.*"}} {{.*}} "-lto-thin-debug-options= -generate-arange-section"
// SNLDFLTOGARANGE: {{".*orbis-ld.*"}} {{.*}} "-lto-debug-options= -generate-arange-section"
diff --git a/clang/test/Driver/lto-jobs.c b/clang/test/Driver/lto-jobs.c
index b4f109e4c502c..43a478b0664d8 100644
--- a/clang/test/Driver/lto-jobs.c
+++ b/clang/test/Driver/lto-jobs.c
@@ -11,7 +11,7 @@
// RUN: %clang --target=x86_64-scei-ps4 -### %s -flto=thin -flto-jobs=5 2> %t
// RUN: FileCheck -check-prefix=CHECK-PS4-LINK-THIN-JOBS-ACTION < %t %s
//
-// CHECK-PS4-LINK-THIN-JOBS-ACTION: "-lto-thin-debug-options= -threads=5"
+// CHECK-PS4-LINK-THIN-JOBS-ACTION: "-lto-thin-debug-options= -generate-arange-section -threads=5"
// RUN: %clang --target=x86_64-apple-darwin13.3.0 -### %s -flto=thin -flto-jobs=5 2> %t
// RUN: FileCheck -check-prefix=CHECK-LINK-THIN-JOBS2-ACTION < %t %s
diff --git a/clang/test/Driver/ps4-linker.c b/clang/test/Driver/ps4-linker.c
index be989cdd7d5b1..be0103bffe813 100644
--- a/clang/test/Driver/ps4-linker.c
+++ b/clang/test/Driver/ps4-linker.c
@@ -5,8 +5,8 @@
// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fjmc %s -### 2>&1 | FileCheck --check-prefixes=CHECK-FULL-LTO,CHECK-LIB %s
// CHECK-NOT: -enable-jmc-instrument
-// CHECK-THIN-LTO: "-lto-thin-debug-options= -enable-jmc-instrument"
-// CHECK-FULL-LTO: "-lto-debug-options= -enable-jmc-instrument"
+// CHECK-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -enable-jmc-instrument"
+// CHECK-FULL-LTO: "-lto-debug-options= -generate-arange-section -enable-jmc-instrument"
// Check the default library name.
// CHECK-LIB: "--whole-archive" "-lSceDbgJmc" "--no-whole-archive"
@@ -16,5 +16,5 @@
// RUN: %clang --target=x86_64-scei-ps4 -flto=thin -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-THIN-LTO %s
// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-FULL-LTO %s
-// CHECK-DIAG-THIN-LTO: "-lto-thin-debug-options= -crash-diagnostics-dir=mydumps"
-// CHECK-DIAG-FULL-LTO: "-lto-debug-options= -crash-diagnostics-dir=mydumps"
+// CHECK-DIAG-THIN-LTO: "-lto-thin-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps"
+// CHECK-DIAG-FULL-LTO: "-lto-debug-options= -generate-arange-section -crash-diagnostics-dir=mydumps"
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 578a90d402c54..80cd5ec501f25 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -354,9 +354,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A)
UseLocSection = !TT.isNVPTX();
- // Always emit .debug_aranges for SCE tuning.
- UseARangesSection = GenerateARangeSection || tuneForSCE();
-
HasAppleExtensionAttributes = tuneForLLDB();
// Handle split DWARF.
@@ -1453,7 +1450,7 @@ void DwarfDebug::endModule() {
emitDebugInfo();
// Emit info into a debug aranges section.
- if (UseARangesSection)
+ if (GenerateARangeSection)
emitDebugARanges();
// Emit info into a debug ranges section.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 13f4c379e0027..452485b632c45 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -435,9 +435,6 @@ class DwarfDebug : public DebugHandlerBase {
///Allow emission of the .debug_loc section.
bool UseLocSection = true;
- /// Allow emission of .debug_aranges section
- bool UseARangesSection = false;
-
/// Generate DWARF v4 type units.
bool GenerateTypeUnits;
diff --git a/llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.ll b/llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.ll
deleted file mode 100644
index ac0d61aef2b1c..0000000000000
--- a/llvm/test/DebugInfo/X86/debug-aranges-sce-tuning.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; This checks that .debug_aranges is always generated for the SCE debugger
-; tuning.
-
-; RUN: llc -mtriple=x86_64 -debugger-tune=sce -filetype=obj %s -o %t
-; RUN: llvm-dwarfdump -debug-aranges %t | FileCheck %s
-
-; CHECK: .debug_aranges contents:
-; CHECK-NEXT: Address Range Header:
-; CHECK-SAME: length = 0x0000002c,
-
-; IR generated and reduced from:
-; $ cat foo.c
-; int foo;
-; $ clang -g -S -emit-llvm foo.c -o foo.ll
-
-target triple = "x86_64-unknown-linux-gnu"
-
- at foo = dso_local global i32 0, align 4, !dbg !0
-
-!llvm.dbg.cu = !{!2}
-!llvm.module.flags = !{!6, !7, !8}
-!llvm.ident = !{!9}
-
-!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
-!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true)
-!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 19.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
-!3 = !DIFile(filename: "foo.c", directory: "/tmp")
-!4 = !{!0}
-!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!6 = !{i32 7, !"Dwarf Version", i32 5}
-!7 = !{i32 2, !"Debug Info Version", i32 3}
-!8 = !{i32 1, !"wchar_size", i32 4}
-!9 = !{!"clang version 19.0.0"}
>From a2f61ba08bebe0be50d3622f2535bc3a1d7e414f Mon Sep 17 00:00:00 2001
From: aaryanshukla <53713108+aaryanshukla at users.noreply.github.com>
Date: Fri, 19 Jul 2024 14:40:34 -0700
Subject: [PATCH 15/39] [libc][math]fadd implementation (#99694)
- **[libc] math fadd**
- **[libc][math] implemented fadd**
---
libc/config/gpu/entrypoints.txt | 1 +
libc/config/linux/aarch64/entrypoints.txt | 1 +
libc/config/linux/arm/entrypoints.txt | 1 +
libc/config/linux/riscv/entrypoints.txt | 1 +
libc/config/linux/x86_64/entrypoints.txt | 1 +
libc/config/windows/entrypoints.txt | 1 +
libc/docs/math/index.rst | 2 +-
libc/spec/stdc.td | 1 +
libc/src/math/CMakeLists.txt | 1 +
libc/src/math/fadd.h | 20 ++++++++++++++++++++
libc/src/math/generic/CMakeLists.txt | 12 ++++++++++++
libc/src/math/generic/fadd.cpp | 20 ++++++++++++++++++++
libc/test/src/math/CMakeLists.txt | 14 ++++++++++++++
libc/test/src/math/fadd_test.cpp | 13 +++++++++++++
libc/test/src/math/smoke/CMakeLists.txt | 14 ++++++++++++++
libc/test/src/math/smoke/fadd_test.cpp | 13 +++++++++++++
16 files changed, 115 insertions(+), 1 deletion(-)
create mode 100644 libc/src/math/fadd.h
create mode 100644 libc/src/math/generic/fadd.cpp
create mode 100644 libc/test/src/math/fadd_test.cpp
create mode 100644 libc/test/src/math/smoke/fadd_test.cpp
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b8eb743cf587a..506c7d6d7b314 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -266,6 +266,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.expm1f
libc.src.math.fabs
libc.src.math.fabsf
+ libc.src.math.fadd
libc.src.math.fdim
libc.src.math.fdimf
libc.src.math.floor
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 208889ba34a59..e2f6bd74bb694 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -370,6 +370,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fabs
libc.src.math.fabsf
libc.src.math.fabsl
+ libc.src.math.fadd
libc.src.math.fdim
libc.src.math.fdimf
libc.src.math.fdiml
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index a72f8668808a5..0d09e4c22953c 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -239,6 +239,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fabs
libc.src.math.fabsf
libc.src.math.fabsl
+ libc.src.math.fadd
libc.src.math.fdim
libc.src.math.fdimf
libc.src.math.fdiml
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 266c94d54a9df..33dd8d06173b2 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -371,6 +371,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fabs
libc.src.math.fabsf
libc.src.math.fabsl
+ libc.src.math.fadd
libc.src.math.fdim
libc.src.math.fdimf
libc.src.math.fdiml
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index cbdee084aa199..7309e95644c74 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -396,6 +396,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fabs
libc.src.math.fabsf
libc.src.math.fabsl
+ libc.src.math.fadd
libc.src.math.fdim
libc.src.math.fdimf
libc.src.math.fdiml
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index afc9ca87ff094..b6aced83c5815 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -144,6 +144,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fabs
libc.src.math.fabsf
libc.src.math.fabsl
+ libc.src.math.fadd
libc.src.math.fdim
libc.src.math.fdimf
libc.src.math.fdiml
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 5fab9ce4df949..cff7f69deb7f8 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -136,7 +136,7 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| fabs | |check| | |check| | |check| | |check| | |check| | 7.12.7.3 | F.10.4.3 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| fadd | N/A | | | N/A | | 7.12.14.1 | F.10.11 |
+| fadd | N/A | |check| | |check| | N/A | |check| | 7.12.14.1 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| fdim | |check| | |check| | |check| | |check| | |check| | 7.12.12.1 | F.10.9.1 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 18592e92d330a..285166ef0be78 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -402,6 +402,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"fabsl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
GuardedFunctionSpec<"fabsf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"fabsf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+ FunctionSpec<"fadd", RetValSpec<FloatType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fdim", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
FunctionSpec<"fdimf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index c4e33130e9090..050bf0f1ebf22 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -141,6 +141,7 @@ add_math_entrypoint_object(fabsf)
add_math_entrypoint_object(fabsl)
add_math_entrypoint_object(fabsf16)
add_math_entrypoint_object(fabsf128)
+add_math_entrypoint_object(fadd)
add_math_entrypoint_object(fdim)
add_math_entrypoint_object(fdimf)
diff --git a/libc/src/math/fadd.h b/libc/src/math/fadd.h
new file mode 100644
index 0000000000000..ec3ce18bb676a
--- /dev/null
+++ b/libc/src/math/fadd.h
@@ -0,0 +1,20 @@
+//===-- Implementation of fadd function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/macros/config.h"
+
+#ifndef LLVM_LIBC_SRC_MATH_FADD_H
+#define LLVM_LIBC_SRC_MATH_FADD_H
+
+namespace LIBC_NAMESPACE_DECL {
+
+float fadd(double x, double y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_FADD_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index d775026fabb3e..8e8ffb01adf60 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -452,6 +452,18 @@ add_entrypoint_object(
-O3
)
+add_entrypoint_object(
+ fadd
+ SRCS
+ fadd.cpp
+ HDRS
+ ../fadd.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
add_entrypoint_object(
trunc
SRCS
diff --git a/libc/src/math/generic/fadd.cpp b/libc/src/math/generic/fadd.cpp
new file mode 100644
index 0000000000000..66e5188cbcfd4
--- /dev/null
+++ b/libc/src/math/generic/fadd.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of fadd function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fadd.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, fadd, (double x, double y)) {
+ return fputil::generic::add<float>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index c28385f620cfd..f69b0d9b1b666 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -174,6 +174,20 @@ add_fp_unittest(
libc.src.__support.FPUtil.fp_bits
)
+add_fp_unittest(
+ fadd_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ fadd_test.cpp
+ HDRS
+ AddTest.h
+ DEPENDS
+ libc.src.math.fadd
+ libc.src.__support.FPUtil.basic_operations
+)
+
add_fp_unittest(
trunc_test
NEED_MPFR
diff --git a/libc/test/src/math/fadd_test.cpp b/libc/test/src/math/fadd_test.cpp
new file mode 100644
index 0000000000000..fe9ac8b252ed3
--- /dev/null
+++ b/libc/test/src/math/fadd_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fadd ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/fadd.h"
+
+LIST_ADD_TESTS(float, double, LIBC_NAMESPACE::fadd)
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index a02648ea1018b..69f53c5d27f15 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -141,6 +141,20 @@ add_fp_unittest(
libc.src.__support.FPUtil.fp_bits
)
+add_fp_unittest(
+ fadd_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fadd_test.cpp
+ HDRS
+ AddTest.h
+ DEPENDS
+ libc.src.math.fadd
+ libc.src.__support.FPUtil.basic_operations
+
+)
+
add_fp_unittest(
trunc_test
SUITE
diff --git a/libc/test/src/math/smoke/fadd_test.cpp b/libc/test/src/math/smoke/fadd_test.cpp
new file mode 100644
index 0000000000000..fe9ac8b252ed3
--- /dev/null
+++ b/libc/test/src/math/smoke/fadd_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fadd ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddTest.h"
+
+#include "src/math/fadd.h"
+
+LIST_ADD_TESTS(float, double, LIBC_NAMESPACE::fadd)
>From 56ffbd97fda16008d02180a460211829354f1094 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar at redhat.com>
Date: Fri, 19 Jul 2024 14:47:35 -0700
Subject: [PATCH 16/39] [workflows] Avoid usage of access token in
issue-write.yml (#94011)
This adds a new composite workflow that allows you to download artifacts
from other workflows without using an access token.
actions/download-artifact from GitHub requires an access token in order
to download artifacts from a different workflow, which is why we can't
use it here if we want to avoid using a token.
See
https://github.com/actions/download-artifact?tab=readme-ov-file#download-artifacts-from-other-workflow-runs-or-repositories
---
.github/workflows/issue-write.yml | 17 +++-
.../unprivileged-download-artifact/action.yml | 81 +++++++++++++++++++
2 files changed, 94 insertions(+), 4 deletions(-)
create mode 100644 .github/workflows/unprivileged-download-artifact/action.yml
diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml
index d4814a2fe9014..5334157a7fd20 100644
--- a/.github/workflows/issue-write.yml
+++ b/.github/workflows/issue-write.yml
@@ -24,14 +24,21 @@ jobs:
github.event.workflow_run.conclusion == 'failure'
)
steps:
+ - name: Fetch Sources
+ uses: actions/checkout at v4
+ with:
+ sparse-checkout: |
+ .github/workflows/unprivileged-download-artifact/action.yml
+ sparse-checkout-cone-mode: false
- name: 'Download artifact'
- uses: actions/download-artifact at 6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+ uses: ./.github/workflows/unprivileged-download-artifact
+ id: download-artifact
with:
- github-token: ${{ secrets.ISSUE_WRITE_DOWNLOAD_ARTIFACT }}
run-id: ${{ github.event.workflow_run.id }}
- name: workflow-args
+ artifact-name: workflow-args
- name: 'Comment on PR'
+ if: steps.download-artifact.outputs.artifact-id != ''
uses: actions/github-script at v3
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -144,5 +151,7 @@ jobs:
});
- name: Dump comments file
- if: always()
+ if: >-
+ always() &&
+ steps.download-artifact.outputs.artifact-id != ''
run: cat comments
diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml
new file mode 100644
index 0000000000000..9d8fb59a67c0e
--- /dev/null
+++ b/.github/workflows/unprivileged-download-artifact/action.yml
@@ -0,0 +1,81 @@
+name: Unprivileged Download Artifact
+description: >-
+ Download artifacts from another workflow run without using an access token.
+inputs:
+ run-id:
+ description: >-
+ The run-id for the workflow run that you want to download the artifact
+ from. If ommitted it will download the most recently created artifact
+ from the repo with the artifact-name.
+ required: false
+ artifact-name:
+ desciption: The name of the artifact to download.
+ required: true
+
+
+outputs:
+ filename:
+ description: >-
+ The filename of the downloaded artifact or the empty string if the
+ artifact was not found.
+ value: ${{ steps.download-artifact.outputs.filename }}
+ artifact-id:
+ description: "The id of the artifact being downloaded."
+ value: ${{ steps.artifact-url.outputs.id }}
+
+
+runs:
+ using: "composite"
+ steps:
+ - uses: actions/github-script at 60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+ id: artifact-url
+ with:
+ script: |
+ var response;
+ if (!"${{ inputs.run-id }}") {
+ response = await github.rest.actions.listArtifactsForRepo({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ name: "${{ inputs.artifact-name }}"
+ })
+ } else {
+ response = await github.rest.actions.listWorkflowRunArtifacts({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ run_id: "${{ inputs.run-id }}",
+ name: "${{ inputs.artifact-name }}"
+ })
+ }
+
+ console.log(response)
+
+ for (artifact of response.data.artifacts) {
+ console.log(artifact);
+ }
+
+ if (response.data.artifacts.length == 0) {
+ console.log("Could not find artifact ${{ inputs.artifact-name }} for workflow run ${{ inputs.run-id }}")
+ return;
+ }
+
+ const url_response = await github.rest.actions.downloadArtifact({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ artifact_id: response.data.artifacts[0].id,
+ archive_format: "zip"
+ })
+
+ core.setOutput("url", url_response.url);
+ core.setOutput("id", response.data.artifacts[0].id);
+
+ - shell: bash
+ if: steps.artifact-url.outputs.url != ''
+ id: download-artifact
+ run: |
+ curl -L -o ${{ inputs.artifact-name }}.zip "${{ steps.artifact-url.outputs.url }}"
+ echo "filename=${{ inputs.artifact-name }}.zip" >> $GITHUB_OUTPUT
+
+ - shell: bash
+ if: steps.download-artifact.outputs.filename != ''
+ run: |
+ unzip ${{ steps.download-artifact.outputs.filename }}
>From fada9227325b3eaa0bdc09a486f29a7f08b7b3fb Mon Sep 17 00:00:00 2001
From: Jacob Lalonde <jalalonde at fb.com>
Date: Fri, 19 Jul 2024 15:49:22 -0700
Subject: [PATCH 17/39] [LLDB][SBSaveCoreOptions] Fix TestProcessSaveCore
(#99692)
In #98403 some of the tests were transitioned to the new
`SBProcess::SaveCore(SBSaveCoreOptions)` API, but were not detected in
testing. This patch addresses that.
---
.../functionalities/process_save_core/TestProcessSaveCore.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py b/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
index 07d06bdc116ec..8573d15733927 100644
--- a/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
+++ b/lldb/test/API/functionalities/process_save_core/TestProcessSaveCore.py
@@ -20,8 +20,8 @@ def test_cannot_save_core_unless_process_stopped(self):
target = self.dbg.CreateTarget(exe)
process = target.LaunchSimple(None, None, self.get_process_working_directory())
self.assertNotEqual(process.GetState(), lldb.eStateStopped)
- options = SBSaveCoreOptions()
- options.SetOutputFile(SBFileSpec(core))
+ options = lldb.SBSaveCoreOptions()
+ options.SetOutputFile(lldb.SBFileSpec(core))
error = process.SaveCore(core)
self.assertTrue(error.Fail())
>From b686600a57f37a938d5ede54b789d6b3543561b0 Mon Sep 17 00:00:00 2001
From: Daniel Hill <dhhillaz at gmail.com>
Date: Fri, 19 Jul 2024 16:52:01 -0700
Subject: [PATCH 18/39] [BOLT] Skip instruction shortening (#93032)
Add the ability to disable the instruction shortening pass through
--shorten-instructions=false
---
bolt/lib/Rewrite/BinaryPassManager.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index aaa0e1ff4d46f..5dfef0b71cc79 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -263,6 +263,10 @@ static cl::opt<bool> CMOVConversionFlag("cmov-conversion",
cl::ReallyHidden,
cl::cat(BoltOptCategory));
+static cl::opt<bool> ShortenInstructions("shorten-instructions",
+ cl::desc("shorten instructions"),
+ cl::init(true),
+ cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
@@ -378,7 +382,8 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
else if (opts::Hugify)
Manager.registerPass(std::make_unique<HugePage>(NeverPrint));
- Manager.registerPass(std::make_unique<ShortenInstructions>(NeverPrint));
+ Manager.registerPass(std::make_unique<ShortenInstructions>(NeverPrint),
+ opts::ShortenInstructions);
Manager.registerPass(std::make_unique<RemoveNops>(NeverPrint),
!opts::KeepNops);
>From b172f4aabe7918362cbac54e394657c37627238c Mon Sep 17 00:00:00 2001
From: Jie Fu <jiefu at tencent.com>
Date: Sat, 20 Jul 2024 07:53:12 +0800
Subject: [PATCH 19/39] [SandboxIR] Fix -Wunused-variable in SandboxIR.cpp
(NFC)
/llvm-project/llvm/lib/SandboxIR/SandboxIR.cpp:584:8:
error: unused variable 'Pair' [-Werror,-Wunused-variable]
auto Pair = LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
^
1 error generated.
---
llvm/lib/SandboxIR/SandboxIR.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index f392704a6d27e..d49ec0c91e599 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -581,7 +581,8 @@ Value *Context::registerValue(std::unique_ptr<Value> &&VPtr) {
assert(VPtr->getSubclassID() != Value::ClassID::User &&
"Can't register a user!");
Value *V = VPtr.get();
- auto Pair = LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
+ [[maybe_unused]] auto Pair =
+ LLVMValueToValueMap.insert({VPtr->Val, std::move(VPtr)});
assert(Pair.second && "Already exists!");
return V;
}
>From 7f563232d685f0070d2c6bc74a134e4019dcadc1 Mon Sep 17 00:00:00 2001
From: Itis-hard2name <chenwei at xfusion.com>
Date: Sat, 20 Jul 2024 07:55:21 +0800
Subject: [PATCH 20/39] [bolt][Docs] fix missing option in cmake of stage3 in
OptimizingClang.md (#93684)
Fixes #93681
---
bolt/docs/OptimizingClang.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/bolt/docs/OptimizingClang.md b/bolt/docs/OptimizingClang.md
index ff7e71b6a76bc..685fcc2b738fa 100644
--- a/bolt/docs/OptimizingClang.md
+++ b/bolt/docs/OptimizingClang.md
@@ -49,6 +49,7 @@ $ cd ${TOPLEV}/stage3
$ CPATH=${TOPLEV}/stage2-prof-use-lto/install/bin/
$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
+ -DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3/install
$ perf record -e cycles:u -j any,u -- ninja clang
```
>From 1ee8238f0eab75386522a002c2cbb6146284c0c1 Mon Sep 17 00:00:00 2001
From: klensy <klensy at users.noreply.github.com>
Date: Sat, 20 Jul 2024 02:57:14 +0300
Subject: [PATCH 21/39] [BOLT][test] Fix Filecheck typos (#93979)
Fixes few FileCheck typos in tests and add missing(?) filecheck call in
test.
Co-authored-by: klensy <nightouser at gmail.com>
---
bolt/test/AArch64/update-debug-reloc.test | 2 +-
bolt/test/AArch64/veneer-gold.s | 4 ++--
bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test | 2 +-
bolt/test/X86/dwarf5-one-loclists-two-bases.test | 2 +-
bolt/test/X86/dwarf5-two-loclists.test | 2 +-
bolt/test/X86/dwarf5-two-rnglists.test | 4 ++--
6 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/bolt/test/AArch64/update-debug-reloc.test b/bolt/test/AArch64/update-debug-reloc.test
index d57f42a3852a5..dd83229ea7143 100644
--- a/bolt/test/AArch64/update-debug-reloc.test
+++ b/bolt/test/AArch64/update-debug-reloc.test
@@ -2,7 +2,7 @@
# update-debug-sections option.
RUN: %clang %cflags -g %p/../Inputs/asm_foo.s %p/../Inputs/asm_main.c -o %t.exe
-RUN: llvm-bolt %t.exe -o %t --update-debug-sections
+RUN: llvm-bolt %t.exe -o %t --update-debug-sections 2>&1 | FileCheck %s
CHECK: BOLT-INFO: Target architecture: aarch64
CHECK-NOT: Reloc num: 10
diff --git a/bolt/test/AArch64/veneer-gold.s b/bolt/test/AArch64/veneer-gold.s
index 3b3e34ecb1a9f..275febce2b372 100644
--- a/bolt/test/AArch64/veneer-gold.s
+++ b/bolt/test/AArch64/veneer-gold.s
@@ -29,7 +29,7 @@ dummy:
.type foo, %function
foo:
# CHECK: <foo>:
-# CHECK-NEXT : {{.*}} bl {{.*}} <foo2>
+# CHECK-NEXT: {{.*}} bl {{.*}} <foo2>
bl .L2
ret
.size foo, .-foo
@@ -38,7 +38,7 @@ foo:
.type foo2, %function
foo2:
# CHECK: <foo2>:
-# CHECK-NEXT : {{.*}} bl {{.*}} <foo2>
+# CHECK-NEXT: {{.*}} bl {{.*}} <foo2>
bl .L2
ret
.size foo2, .-foo2
diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
index a4f5ee77ab565..6c603ba4ee19d 100644
--- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
+++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
@@ -90,7 +90,7 @@
; BOLT-DWP: DW_TAG_compile_unit
; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DW-NOT: DW_AT_dwo_name
+; BOLT-DWP-NOT: DW_AT_dwo_name
; BOLT-DWP: Contribution size = 68, Format = DWARF32, Version = 5
; BOLT-DWP-NEXT: "main"
; BOLT-DWP-NEXT: "int"
diff --git a/bolt/test/X86/dwarf5-one-loclists-two-bases.test b/bolt/test/X86/dwarf5-one-loclists-two-bases.test
index 873512aad5e8d..f25f6c7a46858 100644
--- a/bolt/test/X86/dwarf5-one-loclists-two-bases.test
+++ b/bolt/test/X86/dwarf5-one-loclists-two-bases.test
@@ -34,7 +34,7 @@
# POSTCHECK: version = 0x0005
# POSTCHECK: DW_AT_loclists_base [DW_FORM_sec_offset] (0x0000000c)
# POSTCHECK: DW_AT_rnglists_base [DW_FORM_sec_offset] (0x0000000c)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
# POSTCHECK: DW_TAG_variable
# POSTCHECK: DW_AT_location [DW_FORM_loclistx]
# POSTCHECK-SAME: indexed (0x0)
diff --git a/bolt/test/X86/dwarf5-two-loclists.test b/bolt/test/X86/dwarf5-two-loclists.test
index 2ede02f3b76fb..a7c6351f9813c 100644
--- a/bolt/test/X86/dwarf5-two-loclists.test
+++ b/bolt/test/X86/dwarf5-two-loclists.test
@@ -45,7 +45,7 @@
# POSTCHECK: version = 0x0005
# POSTCHECK: DW_AT_loclists_base [DW_FORM_sec_offset] (0x0000000c)
# POSTCHECK: DW_AT_rnglists_base [DW_FORM_sec_offset] (0x0000000c)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
# POSTCHECK: DW_TAG_variable
# POSTCHECK: DW_AT_location [DW_FORM_loclistx]
# POSTCHECK-SAME: indexed (0x0)
diff --git a/bolt/test/X86/dwarf5-two-rnglists.test b/bolt/test/X86/dwarf5-two-rnglists.test
index 17cdc7643bae5..98f2e347d7673 100644
--- a/bolt/test/X86/dwarf5-two-rnglists.test
+++ b/bolt/test/X86/dwarf5-two-rnglists.test
@@ -52,7 +52,7 @@
# POSTCHECK-NEXT: DW_AT_addr_base [DW_FORM_sec_offset] (0x00000008)
# POSTCHECK-NEXT: DW_AT_loclists_base [DW_FORM_sec_offset] (0x0000000c)
# POSTCHECK-NEXT: DW_AT_rnglists_base [DW_FORM_sec_offset] (0x0000000c)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
# POSTCHECK: DW_TAG_subprogram
# POSTCHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx]
# POSTCHECK-SAME: indexed (0x1)
@@ -75,7 +75,7 @@
# POSTCHECK-NEXT: DW_AT_addr_base [DW_FORM_sec_offset] (0x00000030)
# POSTCHECK-NEXT: DW_AT_loclists_base [DW_FORM_sec_offset] (0x00000045)
# POSTCHECK-NEXT: DW_AT_rnglists_base [DW_FORM_sec_offset] (0x00000035)
-# POSTCHECK-EMPTY
+# POSTCHECK-EMPTY:
# POSTCHECK: DW_TAG_subprogram
# POSTCHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx]
>From 8bc02bf5c6e94489a79c8d924e5d9866fcc18417 Mon Sep 17 00:00:00 2001
From: Eisuke Kawashima <e.kawaschima+github at gmail.com>
Date: Sat, 20 Jul 2024 08:59:56 +0900
Subject: [PATCH 22/39] fix(bolt/**.py): fix comparison to None (#94012)
from PEP8
(https://peps.python.org/pep-0008/#programming-recommendations):
> Comparisons to singletons like None should always be done with is or
is not, never the equality operators.
Co-authored-by: Eisuke Kawashima <e-kwsm at users.noreply.github.com>
---
bolt/docs/generate_doc.py | 4 ++--
bolt/test/perf2bolt/lit.local.cfg | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/bolt/docs/generate_doc.py b/bolt/docs/generate_doc.py
index d8829daf677b4..763dc00b44ca3 100644
--- a/bolt/docs/generate_doc.py
+++ b/bolt/docs/generate_doc.py
@@ -45,7 +45,7 @@ def parse_bolt_options(output):
cleaned_line = line.strip()
if cleaned_line.casefold() in map(str.casefold, section_headers):
- if prev_section != None: # Save last option from prev section
+ if prev_section is not None: # Save last option from prev section
add_info(sections, current_section, option, description)
option, description = None, []
@@ -76,7 +76,7 @@ def parse_bolt_options(output):
description = [descr]
if option.startswith("--print") or option.startswith("--time"):
current_section = "BOLT printing options:"
- elif prev_section != None:
+ elif prev_section is not None:
current_section = prev_section
continue
diff --git a/bolt/test/perf2bolt/lit.local.cfg b/bolt/test/perf2bolt/lit.local.cfg
index 05f41ff333b0e..4ee9ad08cc78a 100644
--- a/bolt/test/perf2bolt/lit.local.cfg
+++ b/bolt/test/perf2bolt/lit.local.cfg
@@ -1,4 +1,4 @@
import shutil
-if shutil.which("perf") != None:
+if shutil.which("perf") is not None:
config.available_features.add("perf")
\ No newline at end of file
>From be7f1827ff180df7d3f585432048e784eaa932ee Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 19 Jul 2024 17:13:20 -0700
Subject: [PATCH 23/39] [LV] Use llvm::all_of in
LoopVectorizationCostModel::getMaximizedVFForTarget. NFC (#99585)
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ff60bd894cd40..6b050c138552b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4594,15 +4594,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Select the largest VF which doesn't require more registers than existing
// ones.
- for (int i = RUs.size() - 1; i >= 0; --i) {
- bool Selected = true;
- for (auto &pair : RUs[i].MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- if (pair.second > TargetNumRegisters)
- Selected = false;
- }
- if (Selected) {
- MaxVF = VFs[i];
+ for (int I = RUs.size() - 1; I >= 0; --I) {
+ const auto &MLU = RUs[I].MaxLocalUsers;
+ if (all_of(MLU, [&](decltype(MLU.front()) &LU) {
+ return LU.second <= TTI.getNumberOfRegisters(LU.first);
+ })) {
+ MaxVF = VFs[I];
break;
}
}
>From 05f0e86cc895181b3d2210458c78938f83353002 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Fri, 19 Jul 2024 17:26:13 -0700
Subject: [PATCH 24/39] [lldb] Change lldb's breakpoint handling behavior
(#96260)
lldb today has two rules: When a thread stops at a BreakpointSite, we
set the thread's StopReason to be "breakpoint hit" (regardless if we've
actually hit the breakpoint, or if we've merely stopped *at* the
breakpoint instruction/point and haven't tripped it yet). And second,
when resuming a process, any thread sitting at a BreakpointSite is
silently stepped over the BreakpointSite -- because we've already
flagged the breakpoint hit when we stopped there originally.
In this patch, I change lldb to only set a thread's stop reason to
breakpoint-hit when we've actually executed the instruction/triggered
the breakpoint. When we resume, we only silently step past a
BreakpointSite that we've registered as hit. We preserve this state
across inferior function calls that the user may do while stopped, etc.
Also, when a user adds a new breakpoint at $pc while stopped, or changes
$pc to be the address of a BreakpointSite, we will silently step past
that breakpoint when the process resumes. This is purely a UX call, I
don't think there's any person who wants to set a breakpoint at $pc and
then hit it immediately on resuming.
One non-intuitive UX from this change, but I'm convinced it is
necessary: If you're stopped at a BreakpointSite that has not yet
executed, you `stepi`, you will hit the breakpoint and the pc will not
yet advance. This thread has not completed its stepi, and the thread
plan is still on the stack. If you then `continue` the thread, lldb will
now stop and say, "instruction step completed", one instruction past the
BreakpointSite. You can continue a second time to resume execution. I
discussed this with Jim, and trying to paper over this behavior will
lead to more complicated scenarios behaving non-intuitively. And mostly
it's the testsuite that was trying to instruction step past a breakpoint
and getting thrown off -- and I changed those tests to expect the new
behavior.
The bugs driving this change are all from lldb dropping the real stop
reason for a thread and setting it to breakpoint-hit when that was not
the case. Jim hit one where we have an aarch64 watchpoint that triggers
one instruction before a BreakpointSite. On this arch we are notified of
the watchpoint hit after the instruction has been unrolled -- we disable
the watchpoint, instruction step, re-enable the watchpoint and collect
the new value. But now we're on a BreakpointSite so the watchpoint-hit
stop reason is lost.
Another was reported by ZequanWu in
https://discourse.llvm.org/t/lldb-unable-to-break-at-start/78282 we
attach to/launch a process with the pc at a BreakpointSite and
misbehave. Caroline Tice mentioned it is also a problem they've had with
putting a breakpoint on _dl_debug_state.
The change to each Process plugin that does execution control is that
1. If we've stopped at a BreakpointSite that has not been executed yet,
we will call Thread::SetThreadStoppedAtUnexecutedBP(pc) to record
that. When the thread resumes, if the pc is still at the same site, we
will continue, hit the breakpoint, and stop again.
2. When we've actually hit a breakpoint (enabled for this thread or not),
the Process plugin should call Thread::SetThreadHitBreakpointSite().
When we go to resume the thread, we will push a step-over-breakpoint
ThreadPlan before resuming.
The biggest set of changes is to StopInfoMachException where we
translate a Mach Exception into a stop reason. The Mach exception codes
differ in a few places depending on the target (unambiguously), and I
didn't want to duplicate the new code for each target so I've tested
what mach exceptions we get for each action on each target, and
reorganized StopInfoMachException::CreateStopReasonWithMachException to
document these possible values, and handle them without specializing
based on the target arch.
rdar://123942164
---
lldb/include/lldb/Target/Thread.h | 24 ++
.../Process/Utility/StopInfoMachException.cpp | 322 ++++++++----------
.../Process/Windows/Common/ProcessWindows.cpp | 32 +-
.../Process/gdb-remote/ProcessGDBRemote.cpp | 93 ++---
.../Process/scripted/ScriptedThread.cpp | 11 +
lldb/source/Target/StopInfo.cpp | 2 +
lldb/source/Target/Thread.cpp | 15 +-
.../TestConsecutiveBreakpoints.py | 26 +-
.../TestStepOverBreakpoint.py | 6 +-
9 files changed, 266 insertions(+), 265 deletions(-)
diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h
index 2ff1f50d497e2..d3e429e9256df 100644
--- a/lldb/include/lldb/Target/Thread.h
+++ b/lldb/include/lldb/Target/Thread.h
@@ -129,6 +129,7 @@ class Thread : public std::enable_shared_from_this<Thread>,
register_backup_sp; // You need to restore the registers, of course...
uint32_t current_inlined_depth;
lldb::addr_t current_inlined_pc;
+ lldb::addr_t stopped_at_unexecuted_bp;
};
/// Constructor
@@ -378,6 +379,26 @@ class Thread : public std::enable_shared_from_this<Thread>,
virtual void SetQueueLibdispatchQueueAddress(lldb::addr_t dispatch_queue_t) {}
+ /// When a thread stops at an enabled BreakpointSite that has not executed,
+ /// the Process plugin should call SetThreadStoppedAtUnexecutedBP(pc).
+ /// If that BreakpointSite was actually triggered (the instruction was
+ /// executed, for a software breakpoint), regardless of whether the
+ /// breakpoint is valid for this thread, SetThreadHitBreakpointSite()
+ /// should be called to record that fact.
+ ///
+ /// Depending on the structure of the Process plugin, it may be easiest
+ /// to call SetThreadStoppedAtUnexecutedBP(pc) unconditionally when at
+ /// a BreakpointSite, and later when it is known that it was triggered,
+ /// SetThreadHitBreakpointSite() can be called. These two methods
+ /// overwrite the same piece of state in the Thread, the last one
+ /// called on a Thread wins.
+ void SetThreadStoppedAtUnexecutedBP(lldb::addr_t pc) {
+ m_stopped_at_unexecuted_bp = pc;
+ }
+ void SetThreadHitBreakpointSite() {
+ m_stopped_at_unexecuted_bp = LLDB_INVALID_ADDRESS;
+ }
+
/// Whether this Thread already has all the Queue information cached or not
///
/// A Thread may be associated with a libdispatch work Queue at a given
@@ -1312,6 +1333,9 @@ class Thread : public std::enable_shared_from_this<Thread>,
bool m_should_run_before_public_stop; // If this thread has "stop others"
// private work to do, then it will
// set this.
+ lldb::addr_t m_stopped_at_unexecuted_bp; // Set to the address of a breakpoint
+ // instruction that we have not yet
+ // hit, but will hit when we resume.
const uint32_t m_index_id; ///< A unique 1 based index assigned to each thread
/// for easy UI/command line access.
lldb::RegisterContextSP m_reg_context_sp; ///< The register context for this
diff --git a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
index 25cee369d7ee3..f1853be12638e 100644
--- a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
+++ b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
@@ -488,38 +488,6 @@ const char *StopInfoMachException::GetDescription() {
return m_description.c_str();
}
-static StopInfoSP GetStopInfoForHardwareBP(Thread &thread, Target *target,
- uint32_t exc_data_count,
- uint64_t exc_sub_code,
- uint64_t exc_sub_sub_code) {
- // Try hardware watchpoint.
- if (target) {
- // The exc_sub_code indicates the data break address.
- WatchpointResourceSP wp_rsrc_sp =
- target->GetProcessSP()->GetWatchpointResourceList().FindByAddress(
- (addr_t)exc_sub_code);
- if (wp_rsrc_sp && wp_rsrc_sp->GetNumberOfConstituents() > 0) {
- return StopInfo::CreateStopReasonWithWatchpointID(
- thread, wp_rsrc_sp->GetConstituentAtIndex(0)->GetID());
- }
- }
-
- // Try hardware breakpoint.
- ProcessSP process_sp(thread.GetProcess());
- if (process_sp) {
- // The exc_sub_code indicates the data break address.
- lldb::BreakpointSiteSP bp_sp =
- process_sp->GetBreakpointSiteList().FindByAddress(
- (lldb::addr_t)exc_sub_code);
- if (bp_sp && bp_sp->IsEnabled()) {
- return StopInfo::CreateStopReasonWithBreakpointSiteID(thread,
- bp_sp->GetID());
- }
- }
-
- return nullptr;
-}
-
#if defined(__APPLE__)
const char *
StopInfoMachException::MachException::Name(exception_type_t exc_type) {
@@ -607,6 +575,25 @@ StopInfoSP StopInfoMachException::CreateStopReasonWithMachException(
target ? target->GetArchitecture().GetMachine()
: llvm::Triple::UnknownArch;
+ ProcessSP process_sp(thread.GetProcess());
+ RegisterContextSP reg_ctx_sp(thread.GetRegisterContext());
+ // Caveat: with x86 KDP if we've hit a breakpoint, the pc we
+ // receive is past the breakpoint instruction.
+ // If we have breakpoints at 0x100 and 0x101, we hit the
+ // 0x100 breakpoint and the pc is reported at 0x101.
+ // We will initially mark this thread as being stopped at an
+ // unexecuted breakpoint at 0x101. Later when we see that
+ // we stopped for a Breakpoint reason, we will decrement the
+ // pc, and update the thread to record that we hit the
+ // breakpoint at 0x100.
+ // The fact that the pc may be off by one at this point
+ // (for an x86 KDP breakpoint hit) is not a problem.
+ addr_t pc = reg_ctx_sp->GetPC();
+ BreakpointSiteSP bp_site_sp =
+ process_sp->GetBreakpointSiteList().FindByAddress(pc);
+ if (bp_site_sp && bp_site_sp->IsEnabled())
+ thread.SetThreadStoppedAtUnexecutedBP(pc);
+
switch (exc_type) {
case 1: // EXC_BAD_ACCESS
case 2: // EXC_BAD_INSTRUCTION
@@ -633,171 +620,158 @@ StopInfoSP StopInfoMachException::CreateStopReasonWithMachException(
}
break;
+ // A mach exception comes with 2-4 pieces of data.
+ // The sub-codes are only provided for certain types
+ // of mach exceptions.
+ // [exc_type, exc_code, exc_sub_code, exc_sub_sub_code]
+ //
+ // Here are all of the EXC_BREAKPOINT, exc_type==6,
+ // exceptions we can receive.
+ //
+ // Instruction step:
+ // [6, 1, 0]
+ // Intel KDP [6, 3, ??]
+ // armv7 [6, 0x102, <stop-pc>] Same as software breakpoint!
+ //
+ // Software breakpoint:
+ // x86 [6, 2, 0]
+ // Intel KDP [6, 2, <bp-addr + 1>]
+ // arm64 [6, 1, <bp-addr>]
+ // armv7 [6, 0x102, <bp-addr>] Same as instruction step!
+ //
+ // Hardware breakpoint:
+ // x86 [6, 1, <bp-addr>, 0]
+ // x86/Rosetta not implemented, see software breakpoint
+ // arm64 [6, 1, <bp-addr>]
+ // armv7 not implemented, see software breakpoint
+ //
+ // Hardware watchpoint:
+ // x86 [6, 1, <accessed-addr>, 0] (both Intel hw and Rosetta)
+ // arm64 [6, 0x102, <accessed-addr>, 0]
+ // armv7 [6, 0x102, <accessed-addr>, 0]
+ //
+ // arm64 BRK instruction (imm arg not reflected in the ME)
+ // [ 6, 1, <addr-of-BRK-insn>]
+ //
+ // In order of codes mach exceptions:
+ // [6, 1, 0] - instruction step
+ // [6, 1, <bp-addr>] - hardware breakpoint or watchpoint
+ //
+ // [6, 2, 0] - software breakpoint
+ // [6, 2, <bp-addr + 1>] - software breakpoint
+ //
+ // [6, 3] - instruction step
+ //
+ // [6, 0x102, <stop-pc>] armv7 instruction step
+ // [6, 0x102, <bp-addr>] armv7 software breakpoint
+ // [6, 0x102, <accessed-addr>, 0] arm64/armv7 watchpoint
+
case 6: // EXC_BREAKPOINT
{
- bool is_actual_breakpoint = false;
- bool is_trace_if_actual_breakpoint_missing = false;
- switch (cpu) {
- case llvm::Triple::x86:
- case llvm::Triple::x86_64:
- if (exc_code == 1) // EXC_I386_SGL
- {
- if (!exc_sub_code) {
- // This looks like a plain trap.
- // Have to check if there is a breakpoint here as well. When you
- // single-step onto a trap, the single step stops you not to trap.
- // Since we also do that check below, let's just use that logic.
- is_actual_breakpoint = true;
- is_trace_if_actual_breakpoint_missing = true;
- } else {
- if (StopInfoSP stop_info =
- GetStopInfoForHardwareBP(thread, target, exc_data_count,
- exc_sub_code, exc_sub_sub_code))
- return stop_info;
- }
- } else if (exc_code == 2 || // EXC_I386_BPT
- exc_code == 3) // EXC_I386_BPTFLT
- {
- // KDP returns EXC_I386_BPTFLT for trace breakpoints
- if (exc_code == 3)
- is_trace_if_actual_breakpoint_missing = true;
-
- is_actual_breakpoint = true;
+ bool stopped_by_hitting_breakpoint = false;
+ bool stopped_by_completing_stepi = false;
+ bool stopped_watchpoint = false;
+ std::optional<addr_t> address;
+
+ // exc_code 1
+ if (exc_code == 1) {
+ if (exc_sub_code == 0) {
+ stopped_by_completing_stepi = true;
+ } else {
+ // Ambiguous: could be signalling a
+ // breakpoint or watchpoint hit.
+ stopped_by_hitting_breakpoint = true;
+ stopped_watchpoint = true;
+ address = exc_sub_code;
+ }
+ }
+
+ // exc_code 2
+ if (exc_code == 2) {
+ if (exc_sub_code == 0)
+ stopped_by_hitting_breakpoint = true;
+ else {
+ stopped_by_hitting_breakpoint = true;
+ // Intel KDP software breakpoint
if (!pc_already_adjusted)
pc_decrement = 1;
}
- break;
+ }
- case llvm::Triple::arm:
- case llvm::Triple::thumb:
- if (exc_code == 0x102) // EXC_ARM_DA_DEBUG
- {
- // LWP_TODO: We need to find the WatchpointResource that matches
- // the address, and evaluate its Watchpoints.
-
- // It's a watchpoint, then, if the exc_sub_code indicates a
- // known/enabled data break address from our watchpoint list.
- lldb::WatchpointSP wp_sp;
- if (target)
- wp_sp = target->GetWatchpointList().FindByAddress(
- (lldb::addr_t)exc_sub_code);
- if (wp_sp && wp_sp->IsEnabled()) {
- return StopInfo::CreateStopReasonWithWatchpointID(thread,
- wp_sp->GetID());
- } else {
- is_actual_breakpoint = true;
- is_trace_if_actual_breakpoint_missing = true;
- }
- } else if (exc_code == 1) // EXC_ARM_BREAKPOINT
- {
- is_actual_breakpoint = true;
- is_trace_if_actual_breakpoint_missing = true;
- } else if (exc_code == 0) // FIXME not EXC_ARM_BREAKPOINT but a kernel
- // is currently returning this so accept it
- // as indicating a breakpoint until the
- // kernel is fixed
- {
- is_actual_breakpoint = true;
- is_trace_if_actual_breakpoint_missing = true;
- }
- break;
+ // exc_code 3
+ if (exc_code == 3)
+ stopped_by_completing_stepi = true;
- case llvm::Triple::aarch64_32:
- case llvm::Triple::aarch64: {
- // xnu describes three things with type EXC_BREAKPOINT:
- //
- // exc_code 0x102 [EXC_ARM_DA_DEBUG], exc_sub_code addr-of-insn
- // Watchpoint access. exc_sub_code is the address of the
- // instruction which trigged the watchpoint trap.
- // debugserver may add the watchpoint number that was triggered
- // in exc_sub_sub_code.
- //
- // exc_code 1 [EXC_ARM_BREAKPOINT], exc_sub_code 0
- // Instruction step has completed.
- //
- // exc_code 1 [EXC_ARM_BREAKPOINT], exc_sub_code address-of-instruction
- // Software breakpoint instruction executed.
-
- if (exc_code == 1 && exc_sub_code == 0) // EXC_ARM_BREAKPOINT
- {
- // This is hit when we single instruction step aka MDSCR_EL1 SS bit 0
- // is set
- is_actual_breakpoint = true;
- is_trace_if_actual_breakpoint_missing = true;
- if (thread.GetTemporaryResumeState() != eStateStepping)
- not_stepping_but_got_singlestep_exception = true;
- }
- if (exc_code == 0x102) // EXC_ARM_DA_DEBUG
- {
- // LWP_TODO: We need to find the WatchpointResource that matches
- // the address, and evaluate its Watchpoints.
-
- // It's a watchpoint, then, if the exc_sub_code indicates a
- // known/enabled data break address from our watchpoint list.
- lldb::WatchpointSP wp_sp;
- if (target)
- wp_sp = target->GetWatchpointList().FindByAddress(
- (lldb::addr_t)exc_sub_code);
- if (wp_sp && wp_sp->IsEnabled()) {
- return StopInfo::CreateStopReasonWithWatchpointID(thread,
- wp_sp->GetID());
- }
- // EXC_ARM_DA_DEBUG seems to be reused for EXC_BREAKPOINT as well as
- // EXC_BAD_ACCESS
- if (thread.GetTemporaryResumeState() == eStateStepping)
- return StopInfo::CreateStopReasonToTrace(thread);
+ // exc_code 0x102
+ if (exc_code == 0x102 && exc_sub_code != 0) {
+ if (cpu == llvm::Triple::arm || cpu == llvm::Triple::thumb) {
+ stopped_by_hitting_breakpoint = true;
+ stopped_by_completing_stepi = true;
}
- // It looks like exc_sub_code has the 4 bytes of the instruction that
- // triggered the exception, i.e. our breakpoint opcode
- is_actual_breakpoint = exc_code == 1;
- break;
+ stopped_watchpoint = true;
+ address = exc_sub_code;
}
- default:
- break;
- }
+ // The Mach Exception may have been ambiguous --
+ // e.g. we stopped either because of a breakpoint
+ // or a watchpoint. We'll disambiguate which it
+ // really was.
- if (is_actual_breakpoint) {
- RegisterContextSP reg_ctx_sp(thread.GetRegisterContext());
+ if (stopped_by_hitting_breakpoint) {
addr_t pc = reg_ctx_sp->GetPC() - pc_decrement;
- ProcessSP process_sp(thread.CalculateProcess());
-
- lldb::BreakpointSiteSP bp_site_sp;
- if (process_sp)
+ if (address)
+ bp_site_sp =
+ process_sp->GetBreakpointSiteList().FindByAddress(*address);
+ if (!bp_site_sp && reg_ctx_sp) {
bp_site_sp = process_sp->GetBreakpointSiteList().FindByAddress(pc);
+ }
if (bp_site_sp && bp_site_sp->IsEnabled()) {
- // Update the PC if we were asked to do so, but only do so if we find
- // a breakpoint that we know about cause this could be a trap
- // instruction in the code
- if (pc_decrement > 0 && adjust_pc_if_needed)
- reg_ctx_sp->SetPC(pc);
-
- // If the breakpoint is for this thread, then we'll report the hit,
- // but if it is for another thread, we can just report no reason. We
- // don't need to worry about stepping over the breakpoint here, that
- // will be taken care of when the thread resumes and notices that
- // there's a breakpoint under the pc. If we have an operating system
- // plug-in, we might have set a thread specific breakpoint using the
- // operating system thread ID, so we can't make any assumptions about
- // the thread ID so we must always report the breakpoint regardless
- // of the thread.
+ // We've hit this breakpoint, whether it was intended for this thread
+ // or not. Clear this in the Thread object so we step past it on resume.
+ thread.SetThreadHitBreakpointSite();
+
+ // If we have an operating system plug-in, we might have set a thread
+ // specific breakpoint using the operating system thread ID, so we
+ // can't make any assumptions about the thread ID so we must always
+ // report the breakpoint regardless of the thread.
if (bp_site_sp->ValidForThisThread(thread) ||
- thread.GetProcess()->GetOperatingSystem() != nullptr)
+ thread.GetProcess()->GetOperatingSystem() != nullptr) {
+ // Update the PC if we were asked to do so, but only do so if we find
+ // a breakpoint that we know about cause this could be a trap
+ // instruction in the code
+ if (pc_decrement > 0 && adjust_pc_if_needed && reg_ctx_sp)
+ reg_ctx_sp->SetPC(pc);
return StopInfo::CreateStopReasonWithBreakpointSiteID(
thread, bp_site_sp->GetID());
- else if (is_trace_if_actual_breakpoint_missing)
- return StopInfo::CreateStopReasonToTrace(thread);
- else
+ } else {
return StopInfoSP();
+ }
}
+ }
- // Don't call this a trace if we weren't single stepping this thread.
- if (is_trace_if_actual_breakpoint_missing &&
- thread.GetTemporaryResumeState() == eStateStepping) {
- return StopInfo::CreateStopReasonToTrace(thread);
+ // Breakpoint-hit events are handled.
+ // Now handle watchpoints.
+
+ if (stopped_watchpoint && address) {
+ WatchpointResourceSP wp_rsrc_sp =
+ target->GetProcessSP()->GetWatchpointResourceList().FindByAddress(
+ *address);
+ if (wp_rsrc_sp && wp_rsrc_sp->GetNumberOfConstituents() > 0) {
+ return StopInfo::CreateStopReasonWithWatchpointID(
+ thread, wp_rsrc_sp->GetConstituentAtIndex(0)->GetID());
}
}
+
+ // Finally, handle instruction step.
+
+ if (stopped_by_completing_stepi) {
+ if (thread.GetTemporaryResumeState() != eStateStepping)
+ not_stepping_but_got_singlestep_exception = true;
+ else
+ return StopInfo::CreateStopReasonToTrace(thread);
+ }
+
} break;
case 7: // EXC_SYSCALL
diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
index f383b3d40a4f3..8b12b87eacbc6 100644
--- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
+++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
@@ -377,24 +377,17 @@ void ProcessWindows::RefreshStateAfterStop() {
if (!stop_thread)
return;
- switch (active_exception->GetExceptionCode()) {
- case EXCEPTION_SINGLE_STEP: {
- RegisterContextSP register_context = stop_thread->GetRegisterContext();
- const uint64_t pc = register_context->GetPC();
- BreakpointSiteSP site(GetBreakpointSiteList().FindByAddress(pc));
- if (site && site->ValidForThisThread(*stop_thread)) {
- LLDB_LOG(log,
- "Single-stepped onto a breakpoint in process {0} at "
- "address {1:x} with breakpoint site {2}",
- m_session_data->m_debugger->GetProcess().GetProcessId(), pc,
- site->GetID());
- stop_info = StopInfo::CreateStopReasonWithBreakpointSiteID(*stop_thread,
- site->GetID());
- stop_thread->SetStopInfo(stop_info);
+ RegisterContextSP register_context = stop_thread->GetRegisterContext();
+ uint64_t pc = register_context->GetPC();
- return;
- }
+ // If we're at a BreakpointSite, mark this as an Unexecuted Breakpoint.
+ // We'll clear that state if we've actually executed the breakpoint.
+ BreakpointSiteSP site(GetBreakpointSiteList().FindByAddress(pc));
+ if (site && site->IsEnabled())
+ stop_thread->SetThreadStoppedAtUnexecutedBP(pc);
+ switch (active_exception->GetExceptionCode()) {
+ case EXCEPTION_SINGLE_STEP: {
auto *reg_ctx = static_cast<RegisterContextWindows *>(
stop_thread->GetRegisterContext().get());
uint32_t slot_id = reg_ctx->GetTriggeredHardwareBreakpointSlotId();
@@ -419,8 +412,6 @@ void ProcessWindows::RefreshStateAfterStop() {
}
case EXCEPTION_BREAKPOINT: {
- RegisterContextSP register_context = stop_thread->GetRegisterContext();
-
int breakpoint_size = 1;
switch (GetTarget().GetArchitecture().GetMachine()) {
case llvm::Triple::aarch64:
@@ -443,9 +434,9 @@ void ProcessWindows::RefreshStateAfterStop() {
}
// The current PC is AFTER the BP opcode, on all architectures.
- uint64_t pc = register_context->GetPC() - breakpoint_size;
+ pc = register_context->GetPC() - breakpoint_size;
- BreakpointSiteSP site(GetBreakpointSiteList().FindByAddress(pc));
+ site = GetBreakpointSiteList().FindByAddress(pc);
if (site) {
LLDB_LOG(log,
"detected breakpoint in process {0} at address {1:x} with "
@@ -453,6 +444,7 @@ void ProcessWindows::RefreshStateAfterStop() {
m_session_data->m_debugger->GetProcess().GetProcessId(), pc,
site->GetID());
+ stop_thread->SetThreadHitBreakpointSite();
if (site->ValidForThisThread(*stop_thread)) {
LLDB_LOG(log,
"Breakpoint site {0} is valid for this thread ({1:x}), "
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 604c92369e9a2..9ab03c1fb8ff3 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -1598,29 +1598,8 @@ bool ProcessGDBRemote::CalculateThreadStopInfo(ThreadGDBRemote *thread) {
// that have stop reasons, and if there is no entry for a thread, then it
// has no stop reason.
thread->GetRegisterContext()->InvalidateIfNeeded(true);
- if (!GetThreadStopInfoFromJSON(thread, m_jstopinfo_sp)) {
- // If a thread is stopped at a breakpoint site, set that as the stop
- // reason even if it hasn't executed the breakpoint instruction yet.
- // We will silently step over the breakpoint when we resume execution
- // and miss the fact that this thread hit the breakpoint.
- const size_t num_thread_ids = m_thread_ids.size();
- for (size_t i = 0; i < num_thread_ids; i++) {
- if (m_thread_ids[i] == thread->GetID() && m_thread_pcs.size() > i) {
- addr_t pc = m_thread_pcs[i];
- lldb::BreakpointSiteSP bp_site_sp =
- thread->GetProcess()->GetBreakpointSiteList().FindByAddress(pc);
- if (bp_site_sp) {
- if (bp_site_sp->ValidForThisThread(*thread)) {
- thread->SetStopInfo(
- StopInfo::CreateStopReasonWithBreakpointSiteID(
- *thread, bp_site_sp->GetID()));
- return true;
- }
- }
- }
- }
+ if (!GetThreadStopInfoFromJSON(thread, m_jstopinfo_sp))
thread->SetStopInfo(StopInfoSP());
- }
return true;
}
@@ -1729,6 +1708,12 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
if (ThreadSP memory_thread_sp = m_thread_list.GetBackingThread(thread_sp))
thread_sp = memory_thread_sp;
+ addr_t pc = thread_sp->GetRegisterContext()->GetPC();
+ BreakpointSiteSP bp_site_sp =
+ thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(pc);
+ if (bp_site_sp && bp_site_sp->IsEnabled())
+ thread_sp->SetThreadStoppedAtUnexecutedBP(pc);
+
if (exc_type != 0) {
const size_t exc_data_size = exc_data.size();
@@ -1745,26 +1730,10 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
// to no reason.
if (!reason.empty() && reason != "none") {
if (reason == "trace") {
- addr_t pc = thread_sp->GetRegisterContext()->GetPC();
- lldb::BreakpointSiteSP bp_site_sp =
- thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(
- pc);
-
- // If the current pc is a breakpoint site then the StopInfo should be
- // set to Breakpoint Otherwise, it will be set to Trace.
- if (bp_site_sp && bp_site_sp->ValidForThisThread(*thread_sp)) {
- thread_sp->SetStopInfo(
- StopInfo::CreateStopReasonWithBreakpointSiteID(
- *thread_sp, bp_site_sp->GetID()));
- } else
- thread_sp->SetStopInfo(
- StopInfo::CreateStopReasonToTrace(*thread_sp));
+ thread_sp->SetStopInfo(StopInfo::CreateStopReasonToTrace(*thread_sp));
handled = true;
} else if (reason == "breakpoint") {
- addr_t pc = thread_sp->GetRegisterContext()->GetPC();
- lldb::BreakpointSiteSP bp_site_sp =
- thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(
- pc);
+ thread_sp->SetThreadHitBreakpointSite();
if (bp_site_sp) {
// If the breakpoint is for this thread, then we'll report the hit,
// but if it is for another thread, we can just report no reason.
@@ -1880,39 +1849,41 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
StopInfo::CreateStopReasonVForkDone(*thread_sp));
handled = true;
}
- } else if (!signo) {
- addr_t pc = thread_sp->GetRegisterContext()->GetPC();
- lldb::BreakpointSiteSP bp_site_sp =
- thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(pc);
-
- // If a thread is stopped at a breakpoint site, set that as the stop
- // reason even if it hasn't executed the breakpoint instruction yet.
- // We will silently step over the breakpoint when we resume execution
- // and miss the fact that this thread hit the breakpoint.
- if (bp_site_sp && bp_site_sp->ValidForThisThread(*thread_sp)) {
- thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithBreakpointSiteID(
- *thread_sp, bp_site_sp->GetID()));
- handled = true;
- }
}
if (!handled && signo && !did_exec) {
if (signo == SIGTRAP) {
// Currently we are going to assume SIGTRAP means we are either
// hitting a breakpoint or hardware single stepping.
+
+ // We can't disambiguate between stepping-to-a-breakpointsite and
+ // hitting-a-breakpointsite.
+ //
+ // A user can instruction-step, and be stopped at a BreakpointSite.
+ // Or a user can be sitting at a BreakpointSite,
+ // instruction-step which hits the breakpoint and the pc does not
+ // advance.
+ //
+ // In both cases, we're at a BreakpointSite when stopped, and
+ // the resume state was eStateStepping.
+
+ // Assume if we're at a BreakpointSite, we hit it.
handled = true;
addr_t pc =
thread_sp->GetRegisterContext()->GetPC() + m_breakpoint_pc_offset;
- lldb::BreakpointSiteSP bp_site_sp =
+ BreakpointSiteSP bp_site_sp =
thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(
pc);
+ // We can't know if we hit it or not. So if we are stopped at
+ // a BreakpointSite, assume we hit it, and should step past the
+ // breakpoint when we resume. This is contrary to how we handle
+ // BreakpointSites in any other location, but we can't know for
+ // sure what happened so it's a reasonable default.
if (bp_site_sp) {
- // If the breakpoint is for this thread, then we'll report the hit,
- // but if it is for another thread, we can just report no reason.
- // We don't need to worry about stepping over the breakpoint here,
- // that will be taken care of when the thread resumes and notices
- // that there's a breakpoint under the pc.
+ if (bp_site_sp->IsEnabled())
+ thread_sp->SetThreadHitBreakpointSite();
+
if (bp_site_sp->ValidForThisThread(*thread_sp)) {
if (m_breakpoint_pc_offset != 0)
thread_sp->GetRegisterContext()->SetPC(pc);
@@ -1926,8 +1897,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
} else {
// If we were stepping then assume the stop was the result of the
// trace. If we were not stepping then report the SIGTRAP.
- // FIXME: We are still missing the case where we single step over a
- // trap instruction.
if (thread_sp->GetTemporaryResumeState() == eStateStepping)
thread_sp->SetStopInfo(
StopInfo::CreateStopReasonToTrace(*thread_sp));
diff --git a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
index 88a4ca3b0389f..d0d1508e85172 100644
--- a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
+++ b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
@@ -229,6 +229,17 @@ bool ScriptedThread::CalculateStopInfo() {
LLVM_PRETTY_FUNCTION, "Failed to get scripted thread stop info.", error,
LLDBLog::Thread);
+ // If we're at a BreakpointSite, mark that we stopped there and
+ // need to hit the breakpoint when we resume. This will be cleared
+ // if we CreateStopReasonWithBreakpointSiteID.
+ if (RegisterContextSP reg_ctx_sp = GetRegisterContext()) {
+ addr_t pc = reg_ctx_sp->GetPC();
+ if (BreakpointSiteSP bp_site_sp =
+ GetProcess()->GetBreakpointSiteList().FindByAddress(pc))
+ if (bp_site_sp->IsEnabled())
+ SetThreadStoppedAtUnexecutedBP(pc);
+ }
+
lldb::StopInfoSP stop_info_sp;
lldb::StopReason stop_reason_type;
diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp
index 95f78056b1644..895306b5ef0d4 100644
--- a/lldb/source/Target/StopInfo.cpp
+++ b/lldb/source/Target/StopInfo.cpp
@@ -1365,6 +1365,8 @@ class StopInfoVForkDone : public StopInfo {
StopInfoSP StopInfo::CreateStopReasonWithBreakpointSiteID(Thread &thread,
break_id_t break_id) {
+ thread.SetThreadHitBreakpointSite();
+
return StopInfoSP(new StopInfoBreakpoint(thread, break_id));
}
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index e75f5a356cec2..2105efe9305fe 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -217,6 +217,7 @@ Thread::Thread(Process &process, lldb::tid_t tid, bool use_invalid_index_id)
m_process_wp(process.shared_from_this()), m_stop_info_sp(),
m_stop_info_stop_id(0), m_stop_info_override_stop_id(0),
m_should_run_before_public_stop(false),
+ m_stopped_at_unexecuted_bp(LLDB_INVALID_ADDRESS),
m_index_id(use_invalid_index_id ? LLDB_INVALID_INDEX32
: process.GetNextThreadIndexID(tid)),
m_reg_context_sp(), m_state(eStateUnloaded), m_state_mutex(),
@@ -519,6 +520,7 @@ bool Thread::CheckpointThreadState(ThreadStateCheckpoint &saved_state) {
saved_state.current_inlined_depth = GetCurrentInlinedDepth();
saved_state.m_completed_plan_checkpoint =
GetPlans().CheckpointCompletedPlans();
+ saved_state.stopped_at_unexecuted_bp = m_stopped_at_unexecuted_bp;
return true;
}
@@ -554,6 +556,7 @@ void Thread::RestoreThreadStateFromCheckpoint(
saved_state.current_inlined_depth);
GetPlans().RestoreCompletedPlanCheckpoint(
saved_state.m_completed_plan_checkpoint);
+ m_stopped_at_unexecuted_bp = saved_state.stopped_at_unexecuted_bp;
}
StateType Thread::GetState() const {
@@ -622,7 +625,15 @@ void Thread::SetupForResume() {
const addr_t thread_pc = reg_ctx_sp->GetPC();
BreakpointSiteSP bp_site_sp =
GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc);
- if (bp_site_sp) {
+ // If we're at a BreakpointSite which we have either
+ // 1. already triggered/hit, or
+ // 2. the Breakpoint was added while stopped, or the pc was moved
+ // to this BreakpointSite
+ // Step past the breakpoint before resuming.
+ // If we stopped at a breakpoint instruction/BreakpointSite location
+ // without hitting it, and we're still at that same address on
+ // resuming, then we want to hit the BreakpointSite when we resume.
+ if (bp_site_sp && m_stopped_at_unexecuted_bp != thread_pc) {
// Note, don't assume there's a ThreadPlanStepOverBreakpoint, the
// target may not require anything special to step over a breakpoint.
@@ -711,6 +722,7 @@ bool Thread::ShouldResume(StateType resume_state) {
if (need_to_resume) {
ClearStackFrames();
+ m_stopped_at_unexecuted_bp = LLDB_INVALID_ADDRESS;
// Let Thread subclasses do any special work they need to prior to resuming
WillResume(resume_state);
}
@@ -1894,6 +1906,7 @@ Unwind &Thread::GetUnwinder() {
void Thread::Flush() {
ClearStackFrames();
m_reg_context_sp.reset();
+ m_stopped_at_unexecuted_bp = LLDB_INVALID_ADDRESS;
}
bool Thread::IsStillAtLastBreakpointHit() {
diff --git a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py
index 73de4a294388b..ecea28c6e1f6d 100644
--- a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py
+++ b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py
@@ -2,7 +2,6 @@
Test that we handle breakpoints on consecutive instructions correctly.
"""
-
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
@@ -64,20 +63,30 @@ def test_single_step(self):
"""Test that single step stops at the second breakpoint."""
self.prepare_test()
+ # Instruction step to the next instruction
+ # We haven't executed breakpoint2 yet, we're sitting at it now.
+ step_over = False
+ self.thread.StepInstruction(step_over)
+
step_over = False
self.thread.StepInstruction(step_over)
+ # We've now hit the breakpoint and this StepInstruction has
+ # been interrupted, it is still sitting on the thread plan stack.
+
self.assertState(self.process.GetState(), lldb.eStateStopped)
self.assertEqual(
self.thread.GetFrameAtIndex(0).GetPCAddress().GetLoadAddress(self.target),
self.bkpt_address.GetLoadAddress(self.target),
)
- self.thread = lldbutil.get_one_thread_stopped_at_breakpoint(
- self.process, self.breakpoint2
- )
- self.assertIsNotNone(
- self.thread, "Expected one thread to be stopped at breakpoint 2"
- )
+
+ # One more instruction to complete the Step that was interrupted
+ # earlier.
+ self.thread.StepInstruction(step_over)
+ strm = lldb.SBStream()
+ self.thread.GetDescription(strm)
+ self.assertIn("instruction step into", strm.GetData())
+ self.assertIsNotNone(self.thread, "Expected to see that step-in had completed")
self.finish_test()
@@ -106,4 +115,7 @@ def test_single_step_thread_specific(self):
"Stop reason should be 'plan complete'",
)
+ # Hit our second breakpoint
+ self.process.Continue()
+
self.finish_test()
diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py
index 3a7440a31677a..1315288b35c55 100644
--- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py
+++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py
@@ -5,7 +5,6 @@
and eStopReasonPlanComplete when breakpoint's condition fails.
"""
-
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
@@ -90,6 +89,11 @@ def test_step_instruction(self):
self.assertGreaterEqual(step_count, steps_expected)
break
+ # We did a `stepi` when we hit our last breakpoint, and the stepi was not
+ # completed yet, so when we resume it will complete (running process.Continue()
+ # would have the same result - we step one instruction and stop again when
+ # our interrupted stepi completes).
+ self.thread.StepInstruction(True)
# Run the process until termination
self.process.Continue()
self.assertState(self.process.GetState(), lldb.eStateExited)
>From 6747f12931b770f2c0d48ba4b01a55c36ab35e7c Mon Sep 17 00:00:00 2001
From: Sayhaan Siddiqui <49014204+sayhaan at users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:52:49 -0700
Subject: [PATCH 25/39] [BOLT][DWARF][NFC] Split processUnitDIE into two
lambdas (#99225)
Split processUnitDIE into two lambdas to separate the processing of DWO
CUs and CUs in the main binary.
---
bolt/lib/Rewrite/DWARFRewriter.cpp | 144 +++++++++++++++--------------
1 file changed, 77 insertions(+), 67 deletions(-)
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 1ec216b39e95c..ccb45f40c5c7a 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -620,9 +620,10 @@ void DWARFRewriter::updateDebugInfo() {
uint32_t CUIndex = 0;
std::mutex AccessMutex;
// Needs to be invoked in the same order as CUs are processed.
- auto createRangeLocListAddressWriters =
- [&](DWARFUnit &CU) -> DebugLocWriter * {
+ llvm::DenseMap<uint64_t, uint64_t> LocListWritersIndexByCU;
+ auto createRangeLocListAddressWriters = [&](DWARFUnit &CU) {
std::lock_guard<std::mutex> Lock(AccessMutex);
+
const uint16_t DwarfVersion = CU.getVersion();
if (DwarfVersion >= 5) {
auto AddrW = std::make_unique<DebugAddrWriterDwarf5>(
@@ -641,7 +642,6 @@ void DWARFRewriter::updateDebugInfo() {
RangeListsWritersByCU[*DWOId] = std::move(DWORangeListsSectionWriter);
}
AddressWritersByCU[CU.getOffset()] = std::move(AddrW);
-
} else {
auto AddrW =
std::make_unique<DebugAddrWriter>(&BC, CU.getAddressByteSize());
@@ -657,7 +657,7 @@ void DWARFRewriter::updateDebugInfo() {
std::move(LegacyRangesSectionWriterByCU);
}
}
- return LocListWritersByCU[CUIndex++].get();
+ LocListWritersIndexByCU[CU.getOffset()] = CUIndex++;
};
DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
@@ -666,74 +666,68 @@ void DWARFRewriter::updateDebugInfo() {
DWPState State;
if (opts::WriteDWP)
initDWPState(State);
- auto processUnitDIE = [&](DWARFUnit *Unit, DIEBuilder *DIEBlder) {
- // Check if the unit is a skeleton and we need special updates for it and
- // its matching split/DWO CU.
- std::optional<DWARFUnit *> SplitCU;
+ auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU,
+ DIEBuilder &DIEBlder,
+ DebugRangesSectionWriter &TempRangesSectionWriter,
+ DebugAddrWriter &AddressWriter) {
+ DIEBuilder DWODIEBuilder(BC, &(SplitCU).getContext(), DebugNamesTable,
+ &Unit);
+ DWODIEBuilder.buildDWOUnit(SplitCU);
+ std::string DWOName = "";
+ std::optional<std::string> DwarfOutputPath =
+ opts::DwarfOutputPath.empty()
+ ? std::nullopt
+ : std::optional<std::string>(opts::DwarfOutputPath.c_str());
+ {
+ std::lock_guard<std::mutex> Lock(AccessMutex);
+ DWOName = DIEBlder.updateDWONameCompDir(
+ *StrOffstsWriter, *StrWriter, Unit, DwarfOutputPath, std::nullopt);
+ }
+ DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
+ DebugStrWriter DWOStrWriter((SplitCU).getContext(), true);
+ DWODIEBuilder.updateDWONameCompDirForTypes(
+ DWOStrOffstsWriter, DWOStrWriter, SplitCU, DwarfOutputPath, DWOName);
+ DebugLoclistWriter DebugLocDWoWriter(Unit, Unit.getVersion(), true,
+ AddressWriter);
+
+ updateUnitDebugInfo(SplitCU, DWODIEBuilder, DebugLocDWoWriter,
+ TempRangesSectionWriter, AddressWriter);
+ DebugLocDWoWriter.finalize(DWODIEBuilder,
+ *DWODIEBuilder.getUnitDIEbyUnit(SplitCU));
+ if (Unit.getVersion() >= 5)
+ TempRangesSectionWriter.finalizeSection();
+
+ emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit, State,
+ DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+ GDBIndexSection);
+ };
+ auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) {
+ DebugAddrWriter &AddressWriter =
+ *AddressWritersByCU[Unit.getOffset()].get();
+ DebugRangesSectionWriter &RangesSectionWriter =
+ Unit.getVersion() >= 5 ? *RangeListsSectionWriter.get()
+ : *LegacyRangesSectionWriter.get();
+ DebugLocWriter &DebugLocWriter =
+ *LocListWritersByCU[LocListWritersIndexByCU[Unit.getOffset()]].get();
std::optional<uint64_t> RangesBase;
- std::optional<uint64_t> DWOId = Unit->getDWOId();
+ std::optional<DWARFUnit *> SplitCU;
+ std::optional<uint64_t> DWOId = Unit.getDWOId();
if (DWOId)
SplitCU = BC.getDWOCU(*DWOId);
- DebugLocWriter *DebugLocWriter = createRangeLocListAddressWriters(*Unit);
- DebugRangesSectionWriter *RangesSectionWriter =
- Unit->getVersion() >= 5 ? RangeListsSectionWriter.get()
- : LegacyRangesSectionWriter.get();
- DebugAddrWriter *AddressWriter =
- AddressWritersByCU[Unit->getOffset()].get();
- // Skipping CUs that failed to load.
- if (SplitCU) {
- DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
- Unit);
- DWODIEBuilder.buildDWOUnit(**SplitCU);
- std::string DWOName = "";
- std::optional<std::string> DwarfOutputPath =
- opts::DwarfOutputPath.empty()
- ? std::nullopt
- : std::optional<std::string>(opts::DwarfOutputPath.c_str());
- {
- std::lock_guard<std::mutex> Lock(AccessMutex);
- DWOName = DIEBlder->updateDWONameCompDir(
- *StrOffstsWriter, *StrWriter, *Unit, DwarfOutputPath, std::nullopt);
- }
- DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
- DebugStrWriter DWOStrWriter((*SplitCU)->getContext(), true);
- DWODIEBuilder.updateDWONameCompDirForTypes(DWOStrOffstsWriter,
- DWOStrWriter, **SplitCU,
- DwarfOutputPath, DWOName);
- DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true,
- *AddressWriter);
- DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter;
- if (Unit->getVersion() >= 5) {
- TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get();
- } else {
- TempRangesSectionWriter = LegacyRangesWritersByCU[*DWOId].get();
- RangesBase = RangesSectionWriter->getSectionOffset();
- }
-
- updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter,
- *TempRangesSectionWriter, *AddressWriter);
- DebugLocDWoWriter.finalize(DWODIEBuilder,
- *DWODIEBuilder.getUnitDIEbyUnit(**SplitCU));
- if (Unit->getVersion() >= 5)
- TempRangesSectionWriter->finalizeSection();
-
- emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
- DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
- GDBIndexSection);
- }
-
- if (Unit->getVersion() >= 5) {
- RangesBase = RangesSectionWriter->getSectionOffset() +
+ if (Unit.getVersion() >= 5) {
+ RangesBase = RangesSectionWriter.getSectionOffset() +
getDWARF5RngListLocListHeaderSize();
- RangesSectionWriter->initSection(*Unit);
- StrOffstsWriter->finalizeSection(*Unit, *DIEBlder);
+ RangesSectionWriter.initSection(Unit);
+ StrOffstsWriter->finalizeSection(Unit, DIEBlder);
+ } else if (SplitCU) {
+ RangesBase = LegacyRangesSectionWriter.get()->getSectionOffset();
}
- updateUnitDebugInfo(*Unit, *DIEBlder, *DebugLocWriter, *RangesSectionWriter,
- *AddressWriter, RangesBase);
- DebugLocWriter->finalize(*DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
- if (Unit->getVersion() >= 5)
- RangesSectionWriter->finalizeSection();
+ updateUnitDebugInfo(Unit, DIEBlder, DebugLocWriter, RangesSectionWriter,
+ AddressWriter, RangesBase);
+ DebugLocWriter.finalize(DIEBlder, *DIEBlder.getUnitDIEbyUnit(Unit));
+ if (Unit.getVersion() >= 5)
+ RangesSectionWriter.finalizeSection();
};
DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable);
@@ -751,8 +745,24 @@ void DWARFRewriter::updateDebugInfo() {
CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
for (std::vector<DWARFUnit *> &Vec : PartVec) {
DIEBlder.buildCompileUnits(Vec);
+ for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) {
+ createRangeLocListAddressWriters(*CU);
+ std::optional<DWARFUnit *> SplitCU;
+ std::optional<uint64_t> DWOId = CU->getDWOId();
+ if (DWOId)
+ SplitCU = BC.getDWOCU(*DWOId);
+ if (!SplitCU)
+ continue;
+ DebugAddrWriter &AddressWriter =
+ *AddressWritersByCU[CU->getOffset()].get();
+ DebugRangesSectionWriter *TempRangesSectionWriter =
+ CU->getVersion() >= 5 ? RangeListsWritersByCU[*DWOId].get()
+ : LegacyRangesWritersByCU[*DWOId].get();
+ processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter,
+ AddressWriter);
+ }
for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
- processUnitDIE(CU, &DIEBlder);
+ processMainBinaryCU(*CU, DIEBlder);
finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
DIEBlder.getProcessedCUs(), *FinalAddrWriter);
}
>From 4a739fb53cacdb850d27b164dd6f173d21d5f083 Mon Sep 17 00:00:00 2001
From: Max Winkler <max.enrico.winkler at gmail.com>
Date: Fri, 19 Jul 2024 18:04:39 -0700
Subject: [PATCH 26/39] [Clang] [docs] [MSVC] Add sections on `__forceinline`
and intrinsic behaviour differences between Clang and MSVC (#99426)
We have had quite a few issues created around how Clang treats
intrinsics vs how MSVC treats intrinsics.
While I was writing this I also added some sections on behaviour changes
that caught me while porting my MSVC codebase to clang-cl.
Hopefully we can point issues around intrinsics to this doc and
hopefully it is useful to others who run into similar behaviour
differences.
The behaviour differences highlighted here are differences, as far as I
am aware, that we do not intend to change or fix for MSVC.
---
clang/docs/MSVCCompatibility.rst | 130 +++++++++++++++++++++++++++++++
1 file changed, 130 insertions(+)
diff --git a/clang/docs/MSVCCompatibility.rst b/clang/docs/MSVCCompatibility.rst
index b2486052abf9a..0b6fea597f8d3 100644
--- a/clang/docs/MSVCCompatibility.rst
+++ b/clang/docs/MSVCCompatibility.rst
@@ -154,3 +154,133 @@ a hint suggesting how to fix the problem.
As of this writing, Clang is able to compile a simple ATL hello world
application. There are still issues parsing WRL headers for modern Windows 8
apps, but they should be addressed soon.
+
+__forceinline behavior
+======================
+
+``__forceinline`` behaves like ``[[clang::always_inline]]``.
+Inlining is always attempted regardless of optimization level.
+
+This differs from MSVC where ``__forceinline`` is only respected once inline expansion is enabled
+which allows any function marked implicitly or explicitly ``inline`` or ``__forceinline`` to be expanded.
+Therefore functions marked ``__forceinline`` will be expanded when the optimization level is ``/Od`` unlike
+MSVC where ``__forceinline`` will not be expanded under ``/Od``.
+
+SIMD and instruction set intrinsic behavior
+===========================================
+
+Clang follows the GCC model for intrinsics and not the MSVC model.
+There are currently no plans to support the MSVC model.
+
+MSVC intrinsics always emit the machine instruction the intrinsic models regardless of the compile time options specified.
+For example ``__popcnt`` always emits the x86 popcnt instruction even if the compiler does not have the option enabled to emit popcnt on its own volition.
+
+There are two common cases where code that compiles with MSVC will need reworking to build on clang.
+Assume the examples are only built with `-msse2` so we do not have the intrinsics at compile time.
+
+.. code-block:: c++
+
+ unsigned PopCnt(unsigned v) {
+ if (HavePopCnt)
+ return __popcnt(v);
+ else
+ return GenericPopCnt(v);
+ }
+
+.. code-block:: c++
+
+ __m128 dot4_sse3(__m128 v0, __m128 v1) {
+ __m128 r = _mm_mul_ps(v0, v1);
+ r = _mm_hadd_ps(r, r);
+ r = _mm_hadd_ps(r, r);
+ return r;
+ }
+
+Clang expects that either you have compile time support for the target features, `-msse3` and `-mpopcnt`, you mark the function with the expected target feature or use runtime detection with an indirect call.
+
+.. code-block:: c++
+
+ __attribute__((__target__("sse3"))) __m128 dot4_sse3(__m128 v0, __m128 v1) {
+ __m128 r = _mm_mul_ps(v0, v1);
+ r = _mm_hadd_ps(r, r);
+ r = _mm_hadd_ps(r, r);
+ return r;
+ }
+
+The SSE3 dot product can be easily fixed by either building the translation unit with SSE3 support or using `__target__` to compile that specific function with SSE3 support.
+
+.. code-block:: c++
+
+ unsigned PopCnt(unsigned v) {
+ if (HavePopCnt)
+ return __popcnt(v);
+ else
+ return GenericPopCnt(v);
+ }
+
+The above ``PopCnt`` example must be changed to work with clang. If we mark the function with `__target__("popcnt")` then the compiler is free to emit popcnt at will which we do not want. While this isn't a concern in our small example it is a concern in larger functions with surrounding code around the intrinsics. Similar reasoning for compiling the translation unit with `-mpopcnt`.
+We must split each branch into its own function that can be called indirectly instead of using the intrinsic directly.
+
+.. code-block:: c++
+
+ __attribute__((__target__("popcnt"))) unsigned hwPopCnt(unsigned v) { return __popcnt(v); }
+ unsigned (*PopCnt)(unsigned) = HavePopCnt ? hwPopCnt : GenericPopCnt;
+
+.. code-block:: c++
+
+ __attribute__((__target__("popcnt"))) unsigned hwPopCnt(unsigned v) { return __popcnt(v); }
+ unsigned PopCnt(unsigned v) {
+ if (HavePopCnt)
+ return hwPopCnt(v);
+ else
+ return GenericPopCnt(v);
+ }
+
+In the above example ``hwPopCnt`` will not be inlined into ``PopCnt`` since ``PopCnt`` doesn't have the popcnt target feature.
+With a larger function that does real work the function call overhead is negligible. However in our popcnt example there is the function call
+overhead. There is no analog for this specific MSVC behavior in clang.
+
+For clang we effectively have to create the dispatch function ourselves for each specific implementation.
+
+SIMD vector types
+=================
+
+Clang's SIMD vector types are builtin types and not user defined types as in MSVC. This does have some observable behavior changes.
+We will look at the x86 `__m128` type for the examples below but the statements apply to all vector types including ARM's `float32x4_t`.
+
+There are no members that can be accessed on the vector types. Vector types are not structs in clang.
+You cannot use ``__m128.m128_f32[0]`` to access the first element of the `__m128`.
+This also means struct initialization like ``__m128{ { 0.0f, 0.0f, 0.0f, 0.0f } }`` will not compile with clang.
+
+Since vector types are builtin types, clang implements operators on them natively.
+
+.. code-block:: c++
+
+ #ifdef _MSC_VER
+ __m128 operator+(__m128 a, __m128 b) { return _mm_add_ps(a, b); }
+ #endif
+
+The above code will fail to compile since overloaded 'operator+' must have at least one parameter of class or enumeration type.
+You will need to fix such code to have the check ``#if defined(_MSC_VER) && !defined(__clang__)``.
+
+Since `__m128` is not a class type in clang any overloads after a template definition will not be considered.
+
+.. code-block:: c++
+
+ template<class T>
+ void foo(T) {}
+
+ template<class T>
+ void bar(T t) {
+ foo(t);
+ }
+
+ void foo(__m128) {}
+
+ int main() {
+ bar(_mm_setzero_ps());
+ }
+
+With MSVC ``foo(__m128)`` will be selected but with clang ``foo<__m128>()`` will be selected since on clang `__m128` is a builtin type.
+
+In general the takeaway is `__m128` is a builtin type on clang while a class type on MSVC.
>From a3ebb669d1027700a2f12bd5e075ad3fde3d4f70 Mon Sep 17 00:00:00 2001
From: aaryanshukla <53713108+aaryanshukla at users.noreply.github.com>
Date: Fri, 19 Jul 2024 18:43:27 -0700
Subject: [PATCH 27/39] [libc][math]: updated math docs for newhdrgen (#99715)
---
libc/src/math/docs/add_math_function.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index 9c23b8ca789bc..e9a6aadc6c44f 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -18,7 +18,7 @@ together with its specifications:
```
- Add function specs to the file:
```
- libc/spec/stdc.td
+ libc/newhdrgen/yaml/math.yaml
```
## Implementation
>From 52c08d7ffd380f4abd819c20bec76252272f6337 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Fri, 19 Jul 2024 18:42:17 -0700
Subject: [PATCH 28/39] Revert "[lldb] Change lldb's breakpoint handling
behavior (#96260)"
This reverts commit 05f0e86cc895181b3d2210458c78938f83353002.
The debuginfo dexter tests are failing, probably because the way
stepping over breakpoints has changed with my patches. And there
are two API tests fails on the ubuntu-arm (32-bit) bot. I'll need
to investigate both of these, neither has an obvious failure reason.
---
lldb/include/lldb/Target/Thread.h | 24 --
.../Process/Utility/StopInfoMachException.cpp | 322 ++++++++++--------
.../Process/Windows/Common/ProcessWindows.cpp | 32 +-
.../Process/gdb-remote/ProcessGDBRemote.cpp | 93 +++--
.../Process/scripted/ScriptedThread.cpp | 11 -
lldb/source/Target/StopInfo.cpp | 2 -
lldb/source/Target/Thread.cpp | 15 +-
.../TestConsecutiveBreakpoints.py | 26 +-
.../TestStepOverBreakpoint.py | 6 +-
9 files changed, 265 insertions(+), 266 deletions(-)
diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h
index d3e429e9256df..2ff1f50d497e2 100644
--- a/lldb/include/lldb/Target/Thread.h
+++ b/lldb/include/lldb/Target/Thread.h
@@ -129,7 +129,6 @@ class Thread : public std::enable_shared_from_this<Thread>,
register_backup_sp; // You need to restore the registers, of course...
uint32_t current_inlined_depth;
lldb::addr_t current_inlined_pc;
- lldb::addr_t stopped_at_unexecuted_bp;
};
/// Constructor
@@ -379,26 +378,6 @@ class Thread : public std::enable_shared_from_this<Thread>,
virtual void SetQueueLibdispatchQueueAddress(lldb::addr_t dispatch_queue_t) {}
- /// When a thread stops at an enabled BreakpointSite that has not executed,
- /// the Process plugin should call SetThreadStoppedAtUnexecutedBP(pc).
- /// If that BreakpointSite was actually triggered (the instruction was
- /// executed, for a software breakpoint), regardless of whether the
- /// breakpoint is valid for this thread, SetThreadHitBreakpointSite()
- /// should be called to record that fact.
- ///
- /// Depending on the structure of the Process plugin, it may be easiest
- /// to call SetThreadStoppedAtUnexecutedBP(pc) unconditionally when at
- /// a BreakpointSite, and later when it is known that it was triggered,
- /// SetThreadHitBreakpointSite() can be called. These two methods
- /// overwrite the same piece of state in the Thread, the last one
- /// called on a Thread wins.
- void SetThreadStoppedAtUnexecutedBP(lldb::addr_t pc) {
- m_stopped_at_unexecuted_bp = pc;
- }
- void SetThreadHitBreakpointSite() {
- m_stopped_at_unexecuted_bp = LLDB_INVALID_ADDRESS;
- }
-
/// Whether this Thread already has all the Queue information cached or not
///
/// A Thread may be associated with a libdispatch work Queue at a given
@@ -1333,9 +1312,6 @@ class Thread : public std::enable_shared_from_this<Thread>,
bool m_should_run_before_public_stop; // If this thread has "stop others"
// private work to do, then it will
// set this.
- lldb::addr_t m_stopped_at_unexecuted_bp; // Set to the address of a breakpoint
- // instruction that we have not yet
- // hit, but will hit when we resume.
const uint32_t m_index_id; ///< A unique 1 based index assigned to each thread
/// for easy UI/command line access.
lldb::RegisterContextSP m_reg_context_sp; ///< The register context for this
diff --git a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
index f1853be12638e..25cee369d7ee3 100644
--- a/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
+++ b/lldb/source/Plugins/Process/Utility/StopInfoMachException.cpp
@@ -488,6 +488,38 @@ const char *StopInfoMachException::GetDescription() {
return m_description.c_str();
}
+static StopInfoSP GetStopInfoForHardwareBP(Thread &thread, Target *target,
+ uint32_t exc_data_count,
+ uint64_t exc_sub_code,
+ uint64_t exc_sub_sub_code) {
+ // Try hardware watchpoint.
+ if (target) {
+ // The exc_sub_code indicates the data break address.
+ WatchpointResourceSP wp_rsrc_sp =
+ target->GetProcessSP()->GetWatchpointResourceList().FindByAddress(
+ (addr_t)exc_sub_code);
+ if (wp_rsrc_sp && wp_rsrc_sp->GetNumberOfConstituents() > 0) {
+ return StopInfo::CreateStopReasonWithWatchpointID(
+ thread, wp_rsrc_sp->GetConstituentAtIndex(0)->GetID());
+ }
+ }
+
+ // Try hardware breakpoint.
+ ProcessSP process_sp(thread.GetProcess());
+ if (process_sp) {
+ // The exc_sub_code indicates the data break address.
+ lldb::BreakpointSiteSP bp_sp =
+ process_sp->GetBreakpointSiteList().FindByAddress(
+ (lldb::addr_t)exc_sub_code);
+ if (bp_sp && bp_sp->IsEnabled()) {
+ return StopInfo::CreateStopReasonWithBreakpointSiteID(thread,
+ bp_sp->GetID());
+ }
+ }
+
+ return nullptr;
+}
+
#if defined(__APPLE__)
const char *
StopInfoMachException::MachException::Name(exception_type_t exc_type) {
@@ -575,25 +607,6 @@ StopInfoSP StopInfoMachException::CreateStopReasonWithMachException(
target ? target->GetArchitecture().GetMachine()
: llvm::Triple::UnknownArch;
- ProcessSP process_sp(thread.GetProcess());
- RegisterContextSP reg_ctx_sp(thread.GetRegisterContext());
- // Caveat: with x86 KDP if we've hit a breakpoint, the pc we
- // receive is past the breakpoint instruction.
- // If we have a breakpoints at 0x100 and 0x101, we hit the
- // 0x100 breakpoint and the pc is reported at 0x101.
- // We will initially mark this thread as being stopped at an
- // unexecuted breakpoint at 0x101. Later when we see that
- // we stopped for a Breakpoint reason, we will decrement the
- // pc, and update the thread to record that we hit the
- // breakpoint at 0x100.
- // The fact that the pc may be off by one at this point
- // (for an x86 KDP breakpoint hit) is not a problem.
- addr_t pc = reg_ctx_sp->GetPC();
- BreakpointSiteSP bp_site_sp =
- process_sp->GetBreakpointSiteList().FindByAddress(pc);
- if (bp_site_sp && bp_site_sp->IsEnabled())
- thread.SetThreadStoppedAtUnexecutedBP(pc);
-
switch (exc_type) {
case 1: // EXC_BAD_ACCESS
case 2: // EXC_BAD_INSTRUCTION
@@ -620,158 +633,171 @@ StopInfoSP StopInfoMachException::CreateStopReasonWithMachException(
}
break;
- // A mach exception comes with 2-4 pieces of data.
- // The sub-codes are only provided for certain types
- // of mach exceptions.
- // [exc_type, exc_code, exc_sub_code, exc_sub_sub_code]
- //
- // Here are all of the EXC_BREAKPOINT, exc_type==6,
- // exceptions we can receive.
- //
- // Instruction step:
- // [6, 1, 0]
- // Intel KDP [6, 3, ??]
- // armv7 [6, 0x102, <stop-pc>] Same as software breakpoint!
- //
- // Software breakpoint:
- // x86 [6, 2, 0]
- // Intel KDP [6, 2, <bp-addr + 1>]
- // arm64 [6, 1, <bp-addr>]
- // armv7 [6, 0x102, <bp-addr>] Same as instruction step!
- //
- // Hardware breakpoint:
- // x86 [6, 1, <bp-addr>, 0]
- // x86/Rosetta not implemented, see software breakpoint
- // arm64 [6, 1, <bp-addr>]
- // armv7 not implemented, see software breakpoint
- //
- // Hardware watchpoint:
- // x86 [6, 1, <accessed-addr>, 0] (both Intel hw and Rosetta)
- // arm64 [6, 0x102, <accessed-addr>, 0]
- // armv7 [6, 0x102, <accessed-addr>, 0]
- //
- // arm64 BRK instruction (imm arg not reflected in the ME)
- // [ 6, 1, <addr-of-BRK-insn>]
- //
- // In order of codes mach exceptions:
- // [6, 1, 0] - instruction step
- // [6, 1, <bp-addr>] - hardware breakpoint or watchpoint
- //
- // [6, 2, 0] - software breakpoint
- // [6, 2, <bp-addr + 1>] - software breakpoint
- //
- // [6, 3] - instruction step
- //
- // [6, 0x102, <stop-pc>] armv7 instruction step
- // [6, 0x102, <bp-addr>] armv7 software breakpoint
- // [6, 0x102, <accessed-addr>, 0] arm64/armv7 watchpoint
-
case 6: // EXC_BREAKPOINT
{
- bool stopped_by_hitting_breakpoint = false;
- bool stopped_by_completing_stepi = false;
- bool stopped_watchpoint = false;
- std::optional<addr_t> address;
-
- // exc_code 1
- if (exc_code == 1) {
- if (exc_sub_code == 0) {
- stopped_by_completing_stepi = true;
- } else {
- // Ambiguous: could be signalling a
- // breakpoint or watchpoint hit.
- stopped_by_hitting_breakpoint = true;
- stopped_watchpoint = true;
- address = exc_sub_code;
- }
- }
-
- // exc_code 2
- if (exc_code == 2) {
- if (exc_sub_code == 0)
- stopped_by_hitting_breakpoint = true;
- else {
- stopped_by_hitting_breakpoint = true;
- // Intel KDP software breakpoint
+ bool is_actual_breakpoint = false;
+ bool is_trace_if_actual_breakpoint_missing = false;
+ switch (cpu) {
+ case llvm::Triple::x86:
+ case llvm::Triple::x86_64:
+ if (exc_code == 1) // EXC_I386_SGL
+ {
+ if (!exc_sub_code) {
+ // This looks like a plain trap.
+ // Have to check if there is a breakpoint here as well. When you
+ // single-step onto a trap, the single step stops you not to trap.
+ // Since we also do that check below, let's just use that logic.
+ is_actual_breakpoint = true;
+ is_trace_if_actual_breakpoint_missing = true;
+ } else {
+ if (StopInfoSP stop_info =
+ GetStopInfoForHardwareBP(thread, target, exc_data_count,
+ exc_sub_code, exc_sub_sub_code))
+ return stop_info;
+ }
+ } else if (exc_code == 2 || // EXC_I386_BPT
+ exc_code == 3) // EXC_I386_BPTFLT
+ {
+ // KDP returns EXC_I386_BPTFLT for trace breakpoints
+ if (exc_code == 3)
+ is_trace_if_actual_breakpoint_missing = true;
+
+ is_actual_breakpoint = true;
if (!pc_already_adjusted)
pc_decrement = 1;
}
- }
+ break;
- // exc_code 3
- if (exc_code == 3)
- stopped_by_completing_stepi = true;
+ case llvm::Triple::arm:
+ case llvm::Triple::thumb:
+ if (exc_code == 0x102) // EXC_ARM_DA_DEBUG
+ {
+ // LWP_TODO: We need to find the WatchpointResource that matches
+ // the address, and evaluate its Watchpoints.
+
+ // It's a watchpoint, then, if the exc_sub_code indicates a
+ // known/enabled data break address from our watchpoint list.
+ lldb::WatchpointSP wp_sp;
+ if (target)
+ wp_sp = target->GetWatchpointList().FindByAddress(
+ (lldb::addr_t)exc_sub_code);
+ if (wp_sp && wp_sp->IsEnabled()) {
+ return StopInfo::CreateStopReasonWithWatchpointID(thread,
+ wp_sp->GetID());
+ } else {
+ is_actual_breakpoint = true;
+ is_trace_if_actual_breakpoint_missing = true;
+ }
+ } else if (exc_code == 1) // EXC_ARM_BREAKPOINT
+ {
+ is_actual_breakpoint = true;
+ is_trace_if_actual_breakpoint_missing = true;
+ } else if (exc_code == 0) // FIXME not EXC_ARM_BREAKPOINT but a kernel
+ // is currently returning this so accept it
+ // as indicating a breakpoint until the
+ // kernel is fixed
+ {
+ is_actual_breakpoint = true;
+ is_trace_if_actual_breakpoint_missing = true;
+ }
+ break;
- // exc_code 0x102
- if (exc_code == 0x102 && exc_sub_code != 0) {
- if (cpu == llvm::Triple::arm || cpu == llvm::Triple::thumb) {
- stopped_by_hitting_breakpoint = true;
- stopped_by_completing_stepi = true;
+ case llvm::Triple::aarch64_32:
+ case llvm::Triple::aarch64: {
+ // xnu describes three things with type EXC_BREAKPOINT:
+ //
+ // exc_code 0x102 [EXC_ARM_DA_DEBUG], exc_sub_code addr-of-insn
+ // Watchpoint access. exc_sub_code is the address of the
+  //     instruction which triggered the watchpoint trap.
+ // debugserver may add the watchpoint number that was triggered
+ // in exc_sub_sub_code.
+ //
+ // exc_code 1 [EXC_ARM_BREAKPOINT], exc_sub_code 0
+ // Instruction step has completed.
+ //
+ // exc_code 1 [EXC_ARM_BREAKPOINT], exc_sub_code address-of-instruction
+ // Software breakpoint instruction executed.
+
+ if (exc_code == 1 && exc_sub_code == 0) // EXC_ARM_BREAKPOINT
+ {
+ // This is hit when we single instruction step aka MDSCR_EL1 SS bit 0
+ // is set
+ is_actual_breakpoint = true;
+ is_trace_if_actual_breakpoint_missing = true;
+ if (thread.GetTemporaryResumeState() != eStateStepping)
+ not_stepping_but_got_singlestep_exception = true;
+ }
+ if (exc_code == 0x102) // EXC_ARM_DA_DEBUG
+ {
+ // LWP_TODO: We need to find the WatchpointResource that matches
+ // the address, and evaluate its Watchpoints.
+
+ // It's a watchpoint, then, if the exc_sub_code indicates a
+ // known/enabled data break address from our watchpoint list.
+ lldb::WatchpointSP wp_sp;
+ if (target)
+ wp_sp = target->GetWatchpointList().FindByAddress(
+ (lldb::addr_t)exc_sub_code);
+ if (wp_sp && wp_sp->IsEnabled()) {
+ return StopInfo::CreateStopReasonWithWatchpointID(thread,
+ wp_sp->GetID());
+ }
+ // EXC_ARM_DA_DEBUG seems to be reused for EXC_BREAKPOINT as well as
+ // EXC_BAD_ACCESS
+ if (thread.GetTemporaryResumeState() == eStateStepping)
+ return StopInfo::CreateStopReasonToTrace(thread);
}
- stopped_watchpoint = true;
- address = exc_sub_code;
+ // It looks like exc_sub_code has the 4 bytes of the instruction that
+ // triggered the exception, i.e. our breakpoint opcode
+ is_actual_breakpoint = exc_code == 1;
+ break;
}
- // The Mach Exception may have been ambiguous --
- // e.g. we stopped either because of a breakpoint
- // or a watchpoint. We'll disambiguate which it
- // really was.
+ default:
+ break;
+ }
- if (stopped_by_hitting_breakpoint) {
+ if (is_actual_breakpoint) {
+ RegisterContextSP reg_ctx_sp(thread.GetRegisterContext());
addr_t pc = reg_ctx_sp->GetPC() - pc_decrement;
- if (address)
- bp_site_sp =
- process_sp->GetBreakpointSiteList().FindByAddress(*address);
- if (!bp_site_sp && reg_ctx_sp) {
+ ProcessSP process_sp(thread.CalculateProcess());
+
+ lldb::BreakpointSiteSP bp_site_sp;
+ if (process_sp)
bp_site_sp = process_sp->GetBreakpointSiteList().FindByAddress(pc);
- }
if (bp_site_sp && bp_site_sp->IsEnabled()) {
- // We've hit this breakpoint, whether it was intended for this thread
- // or not. Clear this in the Tread object so we step past it on resume.
- thread.SetThreadHitBreakpointSite();
-
- // If we have an operating system plug-in, we might have set a thread
- // specific breakpoint using the operating system thread ID, so we
- // can't make any assumptions about the thread ID so we must always
- // report the breakpoint regardless of the thread.
+        // Update the PC if we were asked to do so, but only do so if we find
+        // a breakpoint that we know about because this could be a trap
+        // instruction in the code
+ if (pc_decrement > 0 && adjust_pc_if_needed)
+ reg_ctx_sp->SetPC(pc);
+
+ // If the breakpoint is for this thread, then we'll report the hit,
+ // but if it is for another thread, we can just report no reason. We
+ // don't need to worry about stepping over the breakpoint here, that
+ // will be taken care of when the thread resumes and notices that
+ // there's a breakpoint under the pc. If we have an operating system
+ // plug-in, we might have set a thread specific breakpoint using the
+ // operating system thread ID, so we can't make any assumptions about
+ // the thread ID so we must always report the breakpoint regardless
+ // of the thread.
if (bp_site_sp->ValidForThisThread(thread) ||
- thread.GetProcess()->GetOperatingSystem() != nullptr) {
- // Update the PC if we were asked to do so, but only do so if we find
- // a breakpoint that we know about cause this could be a trap
- // instruction in the code
- if (pc_decrement > 0 && adjust_pc_if_needed && reg_ctx_sp)
- reg_ctx_sp->SetPC(pc);
+ thread.GetProcess()->GetOperatingSystem() != nullptr)
return StopInfo::CreateStopReasonWithBreakpointSiteID(
thread, bp_site_sp->GetID());
- } else {
+ else if (is_trace_if_actual_breakpoint_missing)
+ return StopInfo::CreateStopReasonToTrace(thread);
+ else
return StopInfoSP();
- }
}
- }
-
- // Breakpoint-hit events are handled.
- // Now handle watchpoints.
- if (stopped_watchpoint && address) {
- WatchpointResourceSP wp_rsrc_sp =
- target->GetProcessSP()->GetWatchpointResourceList().FindByAddress(
- *address);
- if (wp_rsrc_sp && wp_rsrc_sp->GetNumberOfConstituents() > 0) {
- return StopInfo::CreateStopReasonWithWatchpointID(
- thread, wp_rsrc_sp->GetConstituentAtIndex(0)->GetID());
- }
- }
-
- // Finally, handle instruction step.
-
- if (stopped_by_completing_stepi) {
- if (thread.GetTemporaryResumeState() != eStateStepping)
- not_stepping_but_got_singlestep_exception = true;
- else
+ // Don't call this a trace if we weren't single stepping this thread.
+ if (is_trace_if_actual_breakpoint_missing &&
+ thread.GetTemporaryResumeState() == eStateStepping) {
return StopInfo::CreateStopReasonToTrace(thread);
+ }
}
-
} break;
case 7: // EXC_SYSCALL
diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
index 8b12b87eacbc6..f383b3d40a4f3 100644
--- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
+++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp
@@ -377,17 +377,24 @@ void ProcessWindows::RefreshStateAfterStop() {
if (!stop_thread)
return;
- RegisterContextSP register_context = stop_thread->GetRegisterContext();
- uint64_t pc = register_context->GetPC();
-
- // If we're at a BreakpointSite, mark this as an Unexecuted Breakpoint.
- // We'll clear that state if we've actually executed the breakpoint.
- BreakpointSiteSP site(GetBreakpointSiteList().FindByAddress(pc));
- if (site && site->IsEnabled())
- stop_thread->SetThreadStoppedAtUnexecutedBP(pc);
-
switch (active_exception->GetExceptionCode()) {
case EXCEPTION_SINGLE_STEP: {
+ RegisterContextSP register_context = stop_thread->GetRegisterContext();
+ const uint64_t pc = register_context->GetPC();
+ BreakpointSiteSP site(GetBreakpointSiteList().FindByAddress(pc));
+ if (site && site->ValidForThisThread(*stop_thread)) {
+ LLDB_LOG(log,
+ "Single-stepped onto a breakpoint in process {0} at "
+ "address {1:x} with breakpoint site {2}",
+ m_session_data->m_debugger->GetProcess().GetProcessId(), pc,
+ site->GetID());
+ stop_info = StopInfo::CreateStopReasonWithBreakpointSiteID(*stop_thread,
+ site->GetID());
+ stop_thread->SetStopInfo(stop_info);
+
+ return;
+ }
+
auto *reg_ctx = static_cast<RegisterContextWindows *>(
stop_thread->GetRegisterContext().get());
uint32_t slot_id = reg_ctx->GetTriggeredHardwareBreakpointSlotId();
@@ -412,6 +419,8 @@ void ProcessWindows::RefreshStateAfterStop() {
}
case EXCEPTION_BREAKPOINT: {
+ RegisterContextSP register_context = stop_thread->GetRegisterContext();
+
int breakpoint_size = 1;
switch (GetTarget().GetArchitecture().GetMachine()) {
case llvm::Triple::aarch64:
@@ -434,9 +443,9 @@ void ProcessWindows::RefreshStateAfterStop() {
}
// The current PC is AFTER the BP opcode, on all architectures.
- pc = register_context->GetPC() - breakpoint_size;
+ uint64_t pc = register_context->GetPC() - breakpoint_size;
- site = GetBreakpointSiteList().FindByAddress(pc);
+ BreakpointSiteSP site(GetBreakpointSiteList().FindByAddress(pc));
if (site) {
LLDB_LOG(log,
"detected breakpoint in process {0} at address {1:x} with "
@@ -444,7 +453,6 @@ void ProcessWindows::RefreshStateAfterStop() {
m_session_data->m_debugger->GetProcess().GetProcessId(), pc,
site->GetID());
- stop_thread->SetThreadHitBreakpointSite();
if (site->ValidForThisThread(*stop_thread)) {
LLDB_LOG(log,
"Breakpoint site {0} is valid for this thread ({1:x}), "
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 9ab03c1fb8ff3..604c92369e9a2 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -1598,8 +1598,29 @@ bool ProcessGDBRemote::CalculateThreadStopInfo(ThreadGDBRemote *thread) {
// that have stop reasons, and if there is no entry for a thread, then it
// has no stop reason.
thread->GetRegisterContext()->InvalidateIfNeeded(true);
- if (!GetThreadStopInfoFromJSON(thread, m_jstopinfo_sp))
+ if (!GetThreadStopInfoFromJSON(thread, m_jstopinfo_sp)) {
+ // If a thread is stopped at a breakpoint site, set that as the stop
+ // reason even if it hasn't executed the breakpoint instruction yet.
+ // We will silently step over the breakpoint when we resume execution
+ // and miss the fact that this thread hit the breakpoint.
+ const size_t num_thread_ids = m_thread_ids.size();
+ for (size_t i = 0; i < num_thread_ids; i++) {
+ if (m_thread_ids[i] == thread->GetID() && m_thread_pcs.size() > i) {
+ addr_t pc = m_thread_pcs[i];
+ lldb::BreakpointSiteSP bp_site_sp =
+ thread->GetProcess()->GetBreakpointSiteList().FindByAddress(pc);
+ if (bp_site_sp) {
+ if (bp_site_sp->ValidForThisThread(*thread)) {
+ thread->SetStopInfo(
+ StopInfo::CreateStopReasonWithBreakpointSiteID(
+ *thread, bp_site_sp->GetID()));
+ return true;
+ }
+ }
+ }
+ }
thread->SetStopInfo(StopInfoSP());
+ }
return true;
}
@@ -1708,12 +1729,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
if (ThreadSP memory_thread_sp = m_thread_list.GetBackingThread(thread_sp))
thread_sp = memory_thread_sp;
- addr_t pc = thread_sp->GetRegisterContext()->GetPC();
- BreakpointSiteSP bp_site_sp =
- thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(pc);
- if (bp_site_sp && bp_site_sp->IsEnabled())
- thread_sp->SetThreadStoppedAtUnexecutedBP(pc);
-
if (exc_type != 0) {
const size_t exc_data_size = exc_data.size();
@@ -1730,10 +1745,26 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
// to no reason.
if (!reason.empty() && reason != "none") {
if (reason == "trace") {
- thread_sp->SetStopInfo(StopInfo::CreateStopReasonToTrace(*thread_sp));
+ addr_t pc = thread_sp->GetRegisterContext()->GetPC();
+ lldb::BreakpointSiteSP bp_site_sp =
+ thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(
+ pc);
+
+          // If the current pc is a breakpoint site then the StopInfo should be
+          // set to Breakpoint. Otherwise, it will be set to Trace.
+ if (bp_site_sp && bp_site_sp->ValidForThisThread(*thread_sp)) {
+ thread_sp->SetStopInfo(
+ StopInfo::CreateStopReasonWithBreakpointSiteID(
+ *thread_sp, bp_site_sp->GetID()));
+ } else
+ thread_sp->SetStopInfo(
+ StopInfo::CreateStopReasonToTrace(*thread_sp));
handled = true;
} else if (reason == "breakpoint") {
- thread_sp->SetThreadHitBreakpointSite();
+ addr_t pc = thread_sp->GetRegisterContext()->GetPC();
+ lldb::BreakpointSiteSP bp_site_sp =
+ thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(
+ pc);
if (bp_site_sp) {
// If the breakpoint is for this thread, then we'll report the hit,
// but if it is for another thread, we can just report no reason.
@@ -1849,41 +1880,39 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
StopInfo::CreateStopReasonVForkDone(*thread_sp));
handled = true;
}
+ } else if (!signo) {
+ addr_t pc = thread_sp->GetRegisterContext()->GetPC();
+ lldb::BreakpointSiteSP bp_site_sp =
+ thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(pc);
+
+ // If a thread is stopped at a breakpoint site, set that as the stop
+ // reason even if it hasn't executed the breakpoint instruction yet.
+ // We will silently step over the breakpoint when we resume execution
+ // and miss the fact that this thread hit the breakpoint.
+ if (bp_site_sp && bp_site_sp->ValidForThisThread(*thread_sp)) {
+ thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithBreakpointSiteID(
+ *thread_sp, bp_site_sp->GetID()));
+ handled = true;
+ }
}
if (!handled && signo && !did_exec) {
if (signo == SIGTRAP) {
// Currently we are going to assume SIGTRAP means we are either
// hitting a breakpoint or hardware single stepping.
-
- // We can't disambiguate between stepping-to-a-breakpointsite and
- // hitting-a-breakpointsite.
- //
- // A user can instruction-step, and be stopped at a BreakpointSite.
- // Or a user can be sitting at a BreakpointSite,
- // instruction-step which hits the breakpoint and the pc does not
- // advance.
- //
- // In both cases, we're at a BreakpointSite when stopped, and
- // the resume state was eStateStepping.
-
- // Assume if we're at a BreakpointSite, we hit it.
handled = true;
addr_t pc =
thread_sp->GetRegisterContext()->GetPC() + m_breakpoint_pc_offset;
- BreakpointSiteSP bp_site_sp =
+ lldb::BreakpointSiteSP bp_site_sp =
thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(
pc);
- // We can't know if we hit it or not. So if we are stopped at
- // a BreakpointSite, assume we hit it, and should step past the
- // breakpoint when we resume. This is contrary to how we handle
- // BreakpointSites in any other location, but we can't know for
- // sure what happened so it's a reasonable default.
if (bp_site_sp) {
- if (bp_site_sp->IsEnabled())
- thread_sp->SetThreadHitBreakpointSite();
-
+ // If the breakpoint is for this thread, then we'll report the hit,
+ // but if it is for another thread, we can just report no reason.
+ // We don't need to worry about stepping over the breakpoint here,
+ // that will be taken care of when the thread resumes and notices
+ // that there's a breakpoint under the pc.
if (bp_site_sp->ValidForThisThread(*thread_sp)) {
if (m_breakpoint_pc_offset != 0)
thread_sp->GetRegisterContext()->SetPC(pc);
@@ -1897,6 +1926,8 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo(
} else {
// If we were stepping then assume the stop was the result of the
// trace. If we were not stepping then report the SIGTRAP.
+ // FIXME: We are still missing the case where we single step over a
+ // trap instruction.
if (thread_sp->GetTemporaryResumeState() == eStateStepping)
thread_sp->SetStopInfo(
StopInfo::CreateStopReasonToTrace(*thread_sp));
diff --git a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
index d0d1508e85172..88a4ca3b0389f 100644
--- a/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
+++ b/lldb/source/Plugins/Process/scripted/ScriptedThread.cpp
@@ -229,17 +229,6 @@ bool ScriptedThread::CalculateStopInfo() {
LLVM_PRETTY_FUNCTION, "Failed to get scripted thread stop info.", error,
LLDBLog::Thread);
- // If we're at a BreakpointSite, mark that we stopped there and
- // need to hit the breakpoint when we resume. This will be cleared
- // if we CreateStopReasonWithBreakpointSiteID.
- if (RegisterContextSP reg_ctx_sp = GetRegisterContext()) {
- addr_t pc = reg_ctx_sp->GetPC();
- if (BreakpointSiteSP bp_site_sp =
- GetProcess()->GetBreakpointSiteList().FindByAddress(pc))
- if (bp_site_sp->IsEnabled())
- SetThreadStoppedAtUnexecutedBP(pc);
- }
-
lldb::StopInfoSP stop_info_sp;
lldb::StopReason stop_reason_type;
diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp
index 895306b5ef0d4..95f78056b1644 100644
--- a/lldb/source/Target/StopInfo.cpp
+++ b/lldb/source/Target/StopInfo.cpp
@@ -1365,8 +1365,6 @@ class StopInfoVForkDone : public StopInfo {
StopInfoSP StopInfo::CreateStopReasonWithBreakpointSiteID(Thread &thread,
break_id_t break_id) {
- thread.SetThreadHitBreakpointSite();
-
return StopInfoSP(new StopInfoBreakpoint(thread, break_id));
}
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index 2105efe9305fe..e75f5a356cec2 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -217,7 +217,6 @@ Thread::Thread(Process &process, lldb::tid_t tid, bool use_invalid_index_id)
m_process_wp(process.shared_from_this()), m_stop_info_sp(),
m_stop_info_stop_id(0), m_stop_info_override_stop_id(0),
m_should_run_before_public_stop(false),
- m_stopped_at_unexecuted_bp(LLDB_INVALID_ADDRESS),
m_index_id(use_invalid_index_id ? LLDB_INVALID_INDEX32
: process.GetNextThreadIndexID(tid)),
m_reg_context_sp(), m_state(eStateUnloaded), m_state_mutex(),
@@ -520,7 +519,6 @@ bool Thread::CheckpointThreadState(ThreadStateCheckpoint &saved_state) {
saved_state.current_inlined_depth = GetCurrentInlinedDepth();
saved_state.m_completed_plan_checkpoint =
GetPlans().CheckpointCompletedPlans();
- saved_state.stopped_at_unexecuted_bp = m_stopped_at_unexecuted_bp;
return true;
}
@@ -556,7 +554,6 @@ void Thread::RestoreThreadStateFromCheckpoint(
saved_state.current_inlined_depth);
GetPlans().RestoreCompletedPlanCheckpoint(
saved_state.m_completed_plan_checkpoint);
- m_stopped_at_unexecuted_bp = saved_state.stopped_at_unexecuted_bp;
}
StateType Thread::GetState() const {
@@ -625,15 +622,7 @@ void Thread::SetupForResume() {
const addr_t thread_pc = reg_ctx_sp->GetPC();
BreakpointSiteSP bp_site_sp =
GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc);
- // If we're at a BreakpointSite which we have either
- // 1. already triggered/hit, or
- // 2. the Breakpoint was added while stopped, or the pc was moved
- // to this BreakpointSite
- // Step past the breakpoint before resuming.
- // If we stopped at a breakpoint instruction/BreakpointSite location
- // without hitting it, and we're still at that same address on
- // resuming, then we want to hit the BreakpointSite when we resume.
- if (bp_site_sp && m_stopped_at_unexecuted_bp != thread_pc) {
+ if (bp_site_sp) {
// Note, don't assume there's a ThreadPlanStepOverBreakpoint, the
// target may not require anything special to step over a breakpoint.
@@ -722,7 +711,6 @@ bool Thread::ShouldResume(StateType resume_state) {
if (need_to_resume) {
ClearStackFrames();
- m_stopped_at_unexecuted_bp = LLDB_INVALID_ADDRESS;
// Let Thread subclasses do any special work they need to prior to resuming
WillResume(resume_state);
}
@@ -1906,7 +1894,6 @@ Unwind &Thread::GetUnwinder() {
void Thread::Flush() {
ClearStackFrames();
m_reg_context_sp.reset();
- m_stopped_at_unexecuted_bp = LLDB_INVALID_ADDRESS;
}
bool Thread::IsStillAtLastBreakpointHit() {
diff --git a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py
index ecea28c6e1f6d..73de4a294388b 100644
--- a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py
+++ b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/TestConsecutiveBreakpoints.py
@@ -2,6 +2,7 @@
Test that we handle breakpoints on consecutive instructions correctly.
"""
+
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
@@ -63,30 +64,20 @@ def test_single_step(self):
"""Test that single step stops at the second breakpoint."""
self.prepare_test()
- # Instruction step to the next instruction
- # We haven't executed breakpoint2 yet, we're sitting at it now.
- step_over = False
- self.thread.StepInstruction(step_over)
-
step_over = False
self.thread.StepInstruction(step_over)
- # We've now hit the breakpoint and this StepInstruction has
- # been interrupted, it is still sitting on the thread plan stack.
-
self.assertState(self.process.GetState(), lldb.eStateStopped)
self.assertEqual(
self.thread.GetFrameAtIndex(0).GetPCAddress().GetLoadAddress(self.target),
self.bkpt_address.GetLoadAddress(self.target),
)
-
- # One more instruction to complete the Step that was interrupted
- # earlier.
- self.thread.StepInstruction(step_over)
- strm = lldb.SBStream()
- self.thread.GetDescription(strm)
- self.assertIn("instruction step into", strm.GetData())
- self.assertIsNotNone(self.thread, "Expected to see that step-in had completed")
+ self.thread = lldbutil.get_one_thread_stopped_at_breakpoint(
+ self.process, self.breakpoint2
+ )
+ self.assertIsNotNone(
+ self.thread, "Expected one thread to be stopped at breakpoint 2"
+ )
self.finish_test()
@@ -115,7 +106,4 @@ def test_single_step_thread_specific(self):
"Stop reason should be 'plan complete'",
)
- # Hit our second breakpoint
- self.process.Continue()
-
self.finish_test()
diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py
index 1315288b35c55..3a7440a31677a 100644
--- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py
+++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py
@@ -5,6 +5,7 @@
and eStopReasonPlanComplete when breakpoint's condition fails.
"""
+
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
@@ -89,11 +90,6 @@ def test_step_instruction(self):
self.assertGreaterEqual(step_count, steps_expected)
break
- # We did a `stepi` when we hit our last breakpoint, and the stepi was not
- # completed yet, so when we resume it will complete (running process.Continue()
- # would have the same result - we step one instruction and stop again when
- # our interrupted stepi completes).
- self.thread.StepInstruction(True)
# Run the process until termination
self.process.Continue()
self.assertState(self.process.GetState(), lldb.eStateExited)
>From cfc22605f61baf83035dba2e7c6db547a3fb7c62 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 20 Jul 2024 10:44:30 +0900
Subject: [PATCH 29/39] InstrProf: Mark BiasLI as invariant. (#95588)
Bias doesn't change after startup.
The test is enhanced for optimized sequences and atomic ops.
---
.../Instrumentation/InstrProfiling.cpp | 3 +++
.../runtime-counter-relocation.ll | 26 ++++++++++++++++++-
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f994f8a62c320..09c56eb5fe6aa 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -945,6 +945,9 @@ Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
IRBuilder<> EntryBuilder(&Fn->getEntryBlock().front());
auto *Bias = getOrCreateBiasVar(getInstrProfCounterBiasVarName());
BiasLI = EntryBuilder.CreateLoad(Int64Ty, Bias, "profc_bias");
+ // Bias doesn't change after startup.
+ BiasLI->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(M.getContext(), std::nullopt));
}
auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), BiasLI);
return Builder.CreateIntToPtr(Add, Addr->getType());
diff --git a/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll b/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
index 53b1e4918e8d1..3f0dbef387d2f 100644
--- a/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
@@ -1,9 +1,13 @@
; RUN: opt < %s -S -passes=instrprof | FileCheck %s
; RUN: opt < %s -S -passes=instrprof -runtime-counter-relocation | FileCheck -check-prefixes=RELOC %s
+; RUN: opt < %s -S -passes=instrprof,inline,gvn -runtime-counter-relocation | FileCheck -check-prefixes=RELOC,RELOCOPT %s
+; RUN: opt < %s -S -passes=instrprof -runtime-counter-relocation -instrprof-atomic-counter-update-all | FileCheck -check-prefixes=ATOMIC %s
+; RUN: opt < %s -S -passes=instrprof,inline,gvn -runtime-counter-relocation -instrprof-atomic-counter-update-all | FileCheck -check-prefixes=ATOMIC,ATOMICOPT %s
target triple = "x86_64-unknown-linux-gnu"
@__profn_foo = private constant [3 x i8] c"foo"
+ at __profn_bar = private constant [3 x i8] c"bar"
; RELOC: $__llvm_profile_counter_bias = comdat any
; RELOC: @__llvm_profile_counter_bias = linkonce_odr hidden global i64 0, comdat
@@ -12,14 +16,34 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-NEXT: %[[PGOCOUNTINC:.+]] = add i64 %[[PGOCOUNT]], 1
; CHECK-NEXT: store i64 %[[PGOCOUNTINC]], ptr @__profc_foo
; RELOC-LABEL: define void @foo
-; RELOC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias
+; RELOC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align 8, !invariant.load !0
; RELOC-NEXT: %[[PROFC_BIAS:.+]] = add i64 ptrtoint (ptr @__profc_foo to i64), %[[BIAS]]
; RELOC-NEXT: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_BIAS]] to ptr
; RELOC-NEXT: %[[PGOCOUNT:.+]] = load i64, ptr %[[PROFC_ADDR]]
; RELOC-NEXT: %[[PGOCOUNTINC:.+]] = add i64 %[[PGOCOUNT]], 1
; RELOC-NEXT: store i64 %[[PGOCOUNTINC]], ptr %[[PROFC_ADDR]]
+; RELOCOPT-NEXT: %[[PROFC_BIAS1:.+]] = add i64 ptrtoint (ptr @__profc_bar to i64), %[[BIAS]]
+; RELOCOPT-NEXT: %[[PROFC_ADDR1:.+]] = inttoptr i64 %[[PROFC_BIAS1]] to ptr
+; RELOCOPT-NEXT: %[[PGOCOUNT1:.+]] = load i64, ptr %[[PROFC_ADDR1]]
+; RELOCOPT-NEXT: %[[PGOCOUNTINC1:.+]] = add i64 %[[PGOCOUNT1]], 1
+; RELOCOPT-NEXT: store i64 %[[PGOCOUNTINC1]], ptr %[[PROFC_ADDR1]]
+; ATOMIC-LABEL: define void @foo
+; ATOMIC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align 8, !invariant.load !0
+; ATOMIC-NEXT: %[[PROFC_BIAS:.+]] = add i64 ptrtoint (ptr @__profc_foo to i64), %[[BIAS]]
+; ATOMIC-NEXT: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_BIAS]] to ptr
+; ATOMIC-NEXT: %[[PGOCOUNTINC:.+]] = atomicrmw add ptr %[[PROFC_ADDR]], i64 1 monotonic
+; ATOMICOPT-NEXT: %[[PROFC_BIAS1:.+]] = add i64 ptrtoint (ptr @__profc_bar to i64), %[[BIAS]]
+; ATOMICOPT-NEXT: %[[PROFC_ADDR1:.+]] = inttoptr i64 %[[PROFC_BIAS1]] to ptr
+; ATOMICOPT-NEXT: %[[PGOCOUNTINC1:.+]] = atomicrmw add ptr %[[PROFC_ADDR1]], i64 1 monotonic
+
+define void @bar() {
+ call void @llvm.instrprof.increment(ptr @__profn_bar, i64 0, i32 1, i32 0)
+ ret void
+}
+
define void @foo() {
call void @llvm.instrprof.increment(ptr @__profn_foo, i64 0, i32 1, i32 0)
+ call void @bar()
ret void
}
>From abaf13ad589d72b356a8788a5095f869d9d038d0 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe at google.com>
Date: Fri, 19 Jul 2024 18:58:25 -0700
Subject: [PATCH 30/39] Revert "Reapply "Add source file name for template
instantiations in -ftime-trace"" (#99731)
Reverts llvm/llvm-project#99545
There were a couple of issues reported in the PR: a sanitizer warning
(https://lab.llvm.org/buildbot/#/builders/164/builds/1246/steps/14/logs/stdio)
and a tmp file accidentally included in the commit.
---
a-abfdec1d.o.tmp | 0
clang/docs/ReleaseNotes.rst | 3 -
clang/include/clang/Driver/Options.td | 4 -
.../include/clang/Frontend/FrontendOptions.h | 8 +-
clang/lib/Driver/ToolChains/Clang.cpp | 1 -
clang/lib/Sema/SemaTemplateInstantiate.cpp | 11 +-
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 11 +-
clang/test/Driver/ftime-trace-sections.cpp | 2 +-
clang/test/Driver/ftime-trace.cpp | 39 +++---
clang/tools/driver/cc1_main.cpp | 3 +-
clang/unittests/Support/TimeProfilerTest.cpp | 123 ++++--------------
llvm/include/llvm/Support/TimeProfiler.h | 23 +---
llvm/lib/Support/TimeProfiler.cpp | 61 ++-------
13 files changed, 64 insertions(+), 225 deletions(-)
delete mode 100644 a-abfdec1d.o.tmp
diff --git a/a-abfdec1d.o.tmp b/a-abfdec1d.o.tmp
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7ac6ed934290d..aa91663260755 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -750,9 +750,6 @@ Improvements to Clang's time-trace
- Clang now specifies that using ``auto`` in a lambda parameter is a C++14 extension when
appropriate. (`#46059: <https://github.com/llvm/llvm-project/issues/46059>`_).
-- Clang now adds source file infomation for template instantiations as ``event["args"]["filename"]``. This
- added behind an option ``-ftime-trace-verbose``. This is expected to increase the size of trace by 2-3 times.
-
Improvements to Coverage Mapping
--------------------------------
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 9c6cebd77ff0a..8707e71f2a319 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3998,10 +3998,6 @@ def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Grou
HelpText<"Minimum time granularity (in microseconds) traced by time profiler">,
Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
MarshallingInfoInt<FrontendOpts<"TimeTraceGranularity">, "500u">;
-def ftime_trace_verbose : Joined<["-"], "ftime-trace-verbose">, Group<f_Group>,
- HelpText<"Make time trace capture verbose event details (e.g. source filenames). This can increase the size of the output by 2-3 times">,
- Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
- MarshallingInfoFlag<FrontendOpts<"TimeTraceVerbose">>;
def ftime_trace_EQ : Joined<["-"], "ftime-trace=">, Group<f_Group>,
HelpText<"Similar to -ftime-trace. Specify the JSON file or a directory which will contain the JSON file">,
Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 8241925c98476..5e5034fe01eb5 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -580,11 +580,6 @@ class FrontendOptions {
/// Minimum time granularity (in microseconds) traced by time profiler.
unsigned TimeTraceGranularity;
- /// Make time trace capture verbose event details (e.g. source filenames).
- /// This can increase the size of the output by 2-3 times.
- LLVM_PREFERRED_TYPE(bool)
- unsigned TimeTraceVerbose : 1;
-
/// Path which stores the output files for -ftime-trace
std::string TimeTracePath;
@@ -606,8 +601,7 @@ class FrontendOptions {
EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false),
EmitSymbolGraphSymbolLabelsForTesting(false),
EmitPrettySymbolGraphs(false), GenReducedBMI(false),
- UseClangIRPipeline(false), TimeTraceGranularity(500),
- TimeTraceVerbose(false) {}
+ UseClangIRPipeline(false), TimeTraceGranularity(500) {}
/// getInputKindForExtension - Return the appropriate input kind for a file
/// extension. For example, "c" would return Language::C.
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f7b987bf810c1..c9f0fe0a3d60f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6757,7 +6757,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (const char *Name = C.getTimeTraceFile(&JA)) {
CmdArgs.push_back(Args.MakeArgString("-ftime-trace=" + Twine(Name)));
Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_granularity_EQ);
- Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_verbose);
}
if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) {
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 725b62db5e80a..a7bc6749c5852 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -3426,16 +3426,11 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation,
return true;
llvm::TimeTraceScope TimeScope("InstantiateClass", [&]() {
- llvm::TimeTraceMetadata M;
- llvm::raw_string_ostream OS(M.Detail);
+ std::string Name;
+ llvm::raw_string_ostream OS(Name);
Instantiation->getNameForDiagnostic(OS, getPrintingPolicy(),
/*Qualified=*/true);
- if (llvm::isTimeTraceVerbose()) {
- auto Loc = SourceMgr.getExpansionLoc(Instantiation->getLocation());
- M.File = SourceMgr.getFilename(Loc);
- M.Line = SourceMgr.getExpansionLineNumber(Loc);
- }
- return M;
+ return Name;
});
Pattern = PatternDef;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index a12d2eff1d2c8..97161febc15f7 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -4966,16 +4966,11 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
}
llvm::TimeTraceScope TimeScope("InstantiateFunction", [&]() {
- llvm::TimeTraceMetadata M;
- llvm::raw_string_ostream OS(M.Detail);
+ std::string Name;
+ llvm::raw_string_ostream OS(Name);
Function->getNameForDiagnostic(OS, getPrintingPolicy(),
/*Qualified=*/true);
- if (llvm::isTimeTraceVerbose()) {
- auto Loc = SourceMgr.getExpansionLoc(Function->getLocation());
- M.File = SourceMgr.getFilename(Loc);
- M.Line = SourceMgr.getExpansionLineNumber(Loc);
- }
- return M;
+ return Name;
});
// If we're performing recursive template instantiation, create our own
diff --git a/clang/test/Driver/ftime-trace-sections.cpp b/clang/test/Driver/ftime-trace-sections.cpp
index da7109b9d81a6..0c16052bc0c3a 100644
--- a/clang/test/Driver/ftime-trace-sections.cpp
+++ b/clang/test/Driver/ftime-trace-sections.cpp
@@ -1,5 +1,5 @@
// RUN: rm -rf %t && mkdir %t && cd %t
-// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -o out %s
// RUN: %python %S/ftime-trace-sections.py < out.json
template <typename T>
diff --git a/clang/test/Driver/ftime-trace.cpp b/clang/test/Driver/ftime-trace.cpp
index 60c5885704b58..5fe63de915a71 100644
--- a/clang/test/Driver/ftime-trace.cpp
+++ b/clang/test/Driver/ftime-trace.cpp
@@ -1,18 +1,18 @@
// RUN: rm -rf %t && mkdir -p %t && cd %t
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -o out %s
// RUN: cat out.json \
// RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
// RUN: | FileCheck %s
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -o out %s
// RUN: cat new-name.json \
// RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
// RUN: | FileCheck %s
// RUN: mkdir dir1 dir2
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -o out %s
// RUN: cat dir1/out.json \
// RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
// RUN: | FileCheck %s
-// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s
+// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -o out %s
// RUN: cat dir2/out.json \
// RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \
// RUN: | FileCheck %s
@@ -34,33 +34,32 @@
// RUN: mkdir d e f && cp %s d/a.cpp && touch d/b.c
/// TODO: Support -fno-integrated-as.
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1
-// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1
+// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0"
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2
-// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2
+// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0"
+// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0"
/// -o specifies the link output. Create ${output}-${basename}.json.
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1
-// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1
+// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0"
+// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0"
/// -dumpdir is f/g, not ending with a path separator. We create f/g${basename}.json.
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2
-// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2
+// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0"
+// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0"
-// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3
-// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
-// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose"
+// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3
+// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0"
+// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0"
-// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -ftime-trace-verbose -xassembler d/a.cpp 2>&1 | \
+// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -xassembler d/a.cpp 2>&1 | \
// RUN: FileCheck %s --check-prefix=UNUSED
// UNUSED: warning: argument unused during compilation: '-ftime-trace'
// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace=e'
// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-granularity=1'
-// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-verbose'
// UNUSED-NOT: warning:
template <typename T>
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index f5e5fad36573e..c2ccb47a15bc8 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -241,8 +241,7 @@ int cc1_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
if (!Clang->getFrontendOpts().TimeTracePath.empty()) {
llvm::timeTraceProfilerInitialize(
- Clang->getFrontendOpts().TimeTraceGranularity, Argv0,
- Clang->getFrontendOpts().TimeTraceVerbose);
+ Clang->getFrontendOpts().TimeTraceGranularity, Argv0);
}
// --print-supported-cpus takes priority over the actual compilation.
if (Clang->getFrontendOpts().PrintSupportedCPUs)
diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp
index 56d880cffde61..5f3950ff033f1 100644
--- a/clang/unittests/Support/TimeProfilerTest.cpp
+++ b/clang/unittests/Support/TimeProfilerTest.cpp
@@ -10,15 +10,11 @@
#include "clang/Frontend/FrontendActions.h"
#include "clang/Lex/PreprocessorOptions.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/Support/JSON.h"
-#include "llvm/Support/Path.h"
#include "llvm/Support/TimeProfiler.h"
-#include "llvm/Support/VirtualFileSystem.h"
#include <stack>
#include "gtest/gtest.h"
-#include <tuple>
using namespace clang;
using namespace llvm;
@@ -27,8 +23,7 @@ namespace {
// Should be called before testing.
void setupProfiler() {
- timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test",
- /*TimeTraceVerbose=*/true);
+ timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test");
}
// Should be called after `compileFromString()`.
@@ -43,24 +38,14 @@ std::string teardownProfiler() {
// Returns true if code compiles successfully.
// We only parse AST here. This is enough for constexpr evaluation.
-bool compileFromString(StringRef Code, StringRef Standard, StringRef File,
- llvm::StringMap<std::string> Headers = {}) {
+bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) {
CompilerInstance Compiler;
Compiler.createDiagnostics();
- llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS(
- new llvm::vfs::InMemoryFileSystem());
- FS->addFile(File, 0, MemoryBuffer::getMemBuffer(Code));
- for (const auto &Header : Headers) {
- FS->addFile(Header.getKey(), 0,
- MemoryBuffer::getMemBuffer(Header.getValue()));
- }
- llvm::IntrusiveRefCntPtr<FileManager> Files(
- new FileManager(FileSystemOptions(), FS));
- Compiler.setFileManager(Files.get());
-
auto Invocation = std::make_shared<CompilerInvocation>();
- std::vector<const char *> Args = {Standard.data(), File.data()};
+ Invocation->getPreprocessorOpts().addRemappedFile(
+ FileName, MemoryBuffer::getMemBuffer(Code).release());
+ const char *Args[] = {Standard.data(), FileName.data()};
CompilerInvocation::CreateFromArgs(*Invocation, Args,
Compiler.getDiagnostics());
Compiler.setInvocation(std::move(Invocation));
@@ -75,28 +60,13 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef File,
return Compiler.ExecuteAction(Action);
}
-std::string GetMetadata(json::Object *Event) {
- std::string Metadata;
- llvm::raw_string_ostream OS(Metadata);
- if (json::Object *Args = Event->getObject("args")) {
- if (auto Detail = Args->getString("detail"))
- OS << Detail;
- // Use only filename to not include os-specific path separators.
- if (auto File = Args->getString("file"))
- OS << ", " << llvm::sys::path::filename(*File);
- if (auto Line = Args->getInteger("line"))
- OS << ":" << *Line;
- }
- return Metadata;
-}
-
// Returns pretty-printed trace graph.
std::string buildTraceGraph(StringRef Json) {
struct EventRecord {
int64_t TimestampBegin;
int64_t TimestampEnd;
- std::string Name;
- std::string Metadata;
+ StringRef Name;
+ StringRef Detail;
};
std::vector<EventRecord> Events;
@@ -111,13 +81,10 @@ std::string buildTraceGraph(StringRef Json) {
int64_t TimestampBegin = TraceEventObj->getInteger("ts").value_or(0);
int64_t TimestampEnd =
TimestampBegin + TraceEventObj->getInteger("dur").value_or(0);
- std::string Name = TraceEventObj->getString("name").value_or("").str();
- std::string Metadata = GetMetadata(TraceEventObj);
-
- // Source events are asynchronous events and may not perfectly nest the
- // synchronous events. Skip testing them.
- if (Name == "Source")
- continue;
+ StringRef Name = TraceEventObj->getString("name").value_or("");
+ StringRef Detail = "";
+ if (json::Object *Args = TraceEventObj->getObject("args"))
+ Detail = Args->getString("detail").value_or("");
// This is a "summary" event, like "Total PerformPendingInstantiations",
// skip it
@@ -125,7 +92,7 @@ std::string buildTraceGraph(StringRef Json) {
continue;
Events.emplace_back(
- EventRecord{TimestampBegin, TimestampEnd, Name, Metadata});
+ EventRecord{TimestampBegin, TimestampEnd, Name, Detail});
}
// There can be nested events that are very fast, for example:
@@ -165,9 +132,9 @@ std::string buildTraceGraph(StringRef Json) {
Stream << "| ";
}
Stream.write(Event.Name.data(), Event.Name.size());
- if (!Event.Metadata.empty()) {
+ if (!Event.Detail.empty()) {
Stream << " (";
- Stream.write(Event.Metadata.data(), Event.Metadata.size());
+ Stream.write(Event.Detail.data(), Event.Detail.size());
Stream << ")";
}
Stream << "\n";
@@ -178,7 +145,7 @@ std::string buildTraceGraph(StringRef Json) {
} // namespace
TEST(TimeProfilerTest, ConstantEvaluationCxx20) {
- std::string Code = R"(
+ constexpr StringRef Code = R"(
void print(double value);
namespace slow_namespace {
@@ -208,7 +175,8 @@ constexpr int slow_init_list[] = {1, 1, 2, 3, 5, 8, 13, 21}; // 25th line
setupProfiler();
ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc"));
std::string Json = teardownProfiler();
- ASSERT_EQ(R"(
+ std::string TraceGraph = buildTraceGraph(Json);
+ ASSERT_TRUE(TraceGraph == R"(
Frontend
| ParseDeclarationOrFunctionDefinition (test.cc:2:1)
| ParseDeclarationOrFunctionDefinition (test.cc:6:1)
@@ -234,54 +202,14 @@ Frontend
| ParseDeclarationOrFunctionDefinition (test.cc:25:1)
| | EvaluateAsInitializer (slow_init_list)
| PerformPendingInstantiations
-)",
- buildTraceGraph(Json));
-}
-
-TEST(TimeProfilerTest, TemplateInstantiations) {
- std::string B_H = R"(
- template <typename T>
- T fooB(T t) {
- return T();
- }
+)");
- #define MacroTemp(x) template <typename T> void foo##x(T) { T(); }
- )";
-
- std::string A_H = R"(
- #include "b.h"
-
- MacroTemp(MTA)
-
- template <typename T>
- void fooA(T t) { fooB(t); fooMTA(t); }
- )";
- std::string Code = R"(
- #include "a.h"
- void user() { fooA(0); }
- )";
-
- setupProfiler();
- ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc",
- /*Headers=*/{{"a.h", A_H}, {"b.h", B_H}}));
- std::string Json = teardownProfiler();
- ASSERT_EQ(R"(
-Frontend
-| ParseFunctionDefinition (fooB)
-| ParseFunctionDefinition (fooMTA)
-| ParseFunctionDefinition (fooA)
-| ParseDeclarationOrFunctionDefinition (test.cc:3:5)
-| | ParseFunctionDefinition (user)
-| PerformPendingInstantiations
-| | InstantiateFunction (fooA<int>, a.h:7)
-| | | InstantiateFunction (fooB<int>, b.h:3)
-| | | InstantiateFunction (fooMTA<int>, a.h:4)
-)",
- buildTraceGraph(Json));
+ // NOTE: If this test is failing, run this test with
+ // `llvm::errs() << TraceGraph;` and change the assert above.
}
TEST(TimeProfilerTest, ConstantEvaluationC99) {
- std::string Code = R"(
+ constexpr StringRef Code = R"(
struct {
short quantval[4]; // 3rd line
} value;
@@ -290,12 +218,15 @@ struct {
setupProfiler();
ASSERT_TRUE(compileFromString(Code, "-std=c99", "test.c"));
std::string Json = teardownProfiler();
- ASSERT_EQ(R"(
+ std::string TraceGraph = buildTraceGraph(Json);
+ ASSERT_TRUE(TraceGraph == R"(
Frontend
| ParseDeclarationOrFunctionDefinition (test.c:2:1)
| | isIntegerConstantExpr (<test.c:3:18>)
| | EvaluateKnownConstIntCheckOverflow (<test.c:3:18>)
| PerformPendingInstantiations
-)",
- buildTraceGraph(Json));
+)");
+
+ // NOTE: If this test is failing, run this test with
+ // `llvm::errs() << TraceGraph;` and change the assert above.
}
diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h
index 6eb92930b36fd..31f7df10916db 100644
--- a/llvm/include/llvm/Support/TimeProfiler.h
+++ b/llvm/include/llvm/Support/TimeProfiler.h
@@ -83,28 +83,16 @@ namespace llvm {
class raw_pwrite_stream;
-struct TimeTraceMetadata {
- std::string Detail;
- // Source file and line number information for the event.
- std::string File;
- int Line;
-
- bool isEmpty() const { return Detail.empty() && File.empty(); }
-};
-
struct TimeTraceProfiler;
TimeTraceProfiler *getTimeTraceProfilerInstance();
-bool isTimeTraceVerbose();
-
struct TimeTraceProfilerEntry;
/// Initialize the time trace profiler.
/// This sets up the global \p TimeTraceProfilerInstance
/// variable to be the profiler instance.
void timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
- StringRef ProcName,
- bool TimeTraceVerbose = false);
+ StringRef ProcName);
/// Cleanup the time trace profiler, if it was initialized.
void timeTraceProfilerCleanup();
@@ -140,10 +128,6 @@ TimeTraceProfilerEntry *
timeTraceProfilerBegin(StringRef Name,
llvm::function_ref<std::string()> Detail);
-TimeTraceProfilerEntry *
-timeTraceProfilerBegin(StringRef Name,
- llvm::function_ref<TimeTraceMetadata()> MetaData);
-
/// Manually begin a time section, with the given \p Name and \p Detail.
/// This starts Async Events having \p Name as a category which is shown
/// separately from other traces. See
@@ -180,11 +164,6 @@ class TimeTraceScope {
if (getTimeTraceProfilerInstance() != nullptr)
Entry = timeTraceProfilerBegin(Name, Detail);
}
- TimeTraceScope(StringRef Name,
- llvm::function_ref<TimeTraceMetadata()> Metadata) {
- if (getTimeTraceProfilerInstance() != nullptr)
- Entry = timeTraceProfilerBegin(Name, Metadata);
- }
~TimeTraceScope() {
if (getTimeTraceProfilerInstance() != nullptr)
timeTraceProfilerEnd(Entry);
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index c2014028ddadc..9612db7d30f98 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -73,20 +73,12 @@ struct llvm::TimeTraceProfilerEntry {
const TimePointType Start;
TimePointType End;
const std::string Name;
- TimeTraceMetadata Metadata;
-
+ const std::string Detail;
const bool AsyncEvent = false;
TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
std::string &&Dt, bool Ae)
- : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), Metadata(),
- AsyncEvent(Ae) {
- Metadata.Detail = std::move(Dt);
- }
-
- TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
- TimeTraceMetadata &&Mt, bool Ae)
: Start(std::move(S)), End(std::move(E)), Name(std::move(N)),
- Metadata(std::move(Mt)), AsyncEvent(Ae) {}
+ Detail(std::move(Dt)), AsyncEvent(Ae) {}
// Calculate timings for FlameGraph. Cast time points to microsecond precision
// rather than casting duration. This avoids truncation issues causing inner
@@ -105,12 +97,10 @@ struct llvm::TimeTraceProfilerEntry {
};
struct llvm::TimeTraceProfiler {
- TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "",
- bool TimeTraceVerbose = false)
+ TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "")
: BeginningOfTime(system_clock::now()), StartTime(ClockType::now()),
ProcName(ProcName), Pid(sys::Process::getProcessId()),
- Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity),
- TimeTraceVerbose(TimeTraceVerbose) {
+ Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) {
llvm::get_thread_name(ThreadName);
}
@@ -123,15 +113,6 @@ struct llvm::TimeTraceProfiler {
return Stack.back().get();
}
- TimeTraceProfilerEntry *
- begin(std::string Name, llvm::function_ref<TimeTraceMetadata()> Metadata,
- bool AsyncEvent = false) {
- Stack.emplace_back(std::make_unique<TimeTraceProfilerEntry>(
- ClockType::now(), TimePointType(), std::move(Name), Metadata(),
- AsyncEvent));
- return Stack.back().get();
- }
-
void end() {
assert(!Stack.empty() && "Must call begin() first");
end(*Stack.back());
@@ -203,15 +184,8 @@ struct llvm::TimeTraceProfiler {
J.attribute("dur", DurUs);
}
J.attribute("name", E.Name);
- if (!E.Metadata.isEmpty()) {
- J.attributeObject("args", [&] {
- if (!E.Metadata.Detail.empty())
- J.attribute("detail", E.Metadata.Detail);
- if (!E.Metadata.File.empty())
- J.attribute("file", E.Metadata.File);
- if (E.Metadata.Line > 0)
- J.attribute("line", E.Metadata.Line);
- });
+ if (!E.Detail.empty()) {
+ J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
}
});
@@ -333,25 +307,14 @@ struct llvm::TimeTraceProfiler {
// Minimum time granularity (in microseconds)
const unsigned TimeTraceGranularity;
-
- // Make time trace capture verbose event details (e.g. source filenames). This
- // can increase the size of the output by 2-3 times.
- const bool TimeTraceVerbose;
};
-bool llvm::isTimeTraceVerbose() {
- return getTimeTraceProfilerInstance() &&
- getTimeTraceProfilerInstance()->TimeTraceVerbose;
-}
-
void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
- StringRef ProcName,
- bool TimeTraceVerbose) {
+ StringRef ProcName) {
assert(TimeTraceProfilerInstance == nullptr &&
"Profiler should not be initialized");
TimeTraceProfilerInstance = new TimeTraceProfiler(
- TimeTraceGranularity, llvm::sys::path::filename(ProcName),
- TimeTraceVerbose);
+ TimeTraceGranularity, llvm::sys::path::filename(ProcName));
}
// Removes all TimeTraceProfilerInstances.
@@ -418,14 +381,6 @@ llvm::timeTraceProfilerBegin(StringRef Name,
return nullptr;
}
-TimeTraceProfilerEntry *
-llvm::timeTraceProfilerBegin(StringRef Name,
- llvm::function_ref<TimeTraceMetadata()> Metadata) {
- if (TimeTraceProfilerInstance != nullptr)
- return TimeTraceProfilerInstance->begin(std::string(Name), Metadata, false);
- return nullptr;
-}
-
TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name,
StringRef Detail) {
if (TimeTraceProfilerInstance != nullptr)
>From d4dc8e4a85c4a88a9080ac7b4e478498cb5edd33 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 20 Jul 2024 11:49:03 +0900
Subject: [PATCH 31/39] Fixup for #95588, don't assume `align 8`.
This shall fix aarch64 builders.
---
.../InstrProfiling/runtime-counter-relocation.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll b/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
index 3f0dbef387d2f..e1da23e7be31c 100644
--- a/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/runtime-counter-relocation.ll
@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-NEXT: %[[PGOCOUNTINC:.+]] = add i64 %[[PGOCOUNT]], 1
; CHECK-NEXT: store i64 %[[PGOCOUNTINC]], ptr @__profc_foo
; RELOC-LABEL: define void @foo
-; RELOC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align 8, !invariant.load !0
+; RELOC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align {{[0-9]+}}, !invariant.load !0
; RELOC-NEXT: %[[PROFC_BIAS:.+]] = add i64 ptrtoint (ptr @__profc_foo to i64), %[[BIAS]]
; RELOC-NEXT: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_BIAS]] to ptr
; RELOC-NEXT: %[[PGOCOUNT:.+]] = load i64, ptr %[[PROFC_ADDR]]
@@ -28,7 +28,7 @@ target triple = "x86_64-unknown-linux-gnu"
; RELOCOPT-NEXT: %[[PGOCOUNTINC1:.+]] = add i64 %[[PGOCOUNT1]], 1
; RELOCOPT-NEXT: store i64 %[[PGOCOUNTINC1]], ptr %[[PROFC_ADDR1]]
; ATOMIC-LABEL: define void @foo
-; ATOMIC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align 8, !invariant.load !0
+; ATOMIC-NEXT: %[[BIAS:.+]] = load i64, ptr @__llvm_profile_counter_bias, align {{[0-9]+}}, !invariant.load !0
; ATOMIC-NEXT: %[[PROFC_BIAS:.+]] = add i64 ptrtoint (ptr @__profc_foo to i64), %[[BIAS]]
; ATOMIC-NEXT: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_BIAS]] to ptr
; ATOMIC-NEXT: %[[PGOCOUNTINC:.+]] = atomicrmw add ptr %[[PROFC_ADDR]], i64 1 monotonic
>From 2d756d9d4c89ac50404f942f2db2f4a402aa0e00 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Fri, 19 Jul 2024 20:12:13 -0700
Subject: [PATCH 32/39] [lld-macho,test] Adjust reproduce-thin-archive-objc.s
When `cd %t` is used, it's conventional to move it above and omit `-o /dev/null`.
We don't check the string before `warning:` since (a) the string is not
very useful and (b) downstream might customize `ctx->e.logName`
(argv[0]).
count 0 is better than `--allow-empty`. In addition, without `2>&1` the
previous test was ineffective (warnings go to stderr, so FileCheck never saw them).
---
lld/test/MachO/reproduce-thin-archive-objc.s | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/lld/test/MachO/reproduce-thin-archive-objc.s b/lld/test/MachO/reproduce-thin-archive-objc.s
index c5fe42f130526..8159f03f0f740 100644
--- a/lld/test/MachO/reproduce-thin-archive-objc.s
+++ b/lld/test/MachO/reproduce-thin-archive-objc.s
@@ -4,20 +4,19 @@
## during linking. However, we need to iterate over all members for -ObjC, check that we don't
## crash when we encounter a missing member.
-# RUN: rm -rf %t; mkdir %t
-# RUN: sed s/SYM/_main/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o %t/main.o
-# RUN: sed s/SYM/_unused/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o %t/unused.o
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: sed s/SYM/_main/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o main.o
+# RUN: sed s/SYM/_unused/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o unused.o
-# RUN: cd %t; llvm-ar rcsT unused.a unused.o; rm unused.o
+# RUN: llvm-ar rcsT unused.a unused.o; rm unused.o
## FIXME: Absolute paths don't end up relativized in the repro file.
# RUN: %no-fatal-warnings-lld %t/main.o %t/unused.a -ObjC -o /dev/null 2>&1 \
# RUN: | FileCheck %s --check-prefix=WARN
-# RUN: %lld %t/main.o %t/unused.a -ObjC --no-warn-thin-archive-missing-members -o /dev/null \
-# RUN: | FileCheck %s --implicit-check-not 'warning' --allow-empty
+# RUN: %lld main.o unused.a -ObjC --no-warn-thin-archive-missing-members 2>&1 | count 0
-# WARN: ld64.lld: warning: {{.*}}unused.a: -ObjC failed to open archive member: 'unused.o'
+# WARN: warning: {{.*}}unused.a: -ObjC failed to open archive member: 'unused.o'
.text
.globl SYM
>From 740161a9b98c9920dedf1852b5f1c94d0a683af5 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 20 Jul 2024 12:02:48 +0900
Subject: [PATCH 33/39] Revert "[LLVM][LTO] Factor out RTLib calls and allow
them to be dropped (#98512)"
This reverts commit c05126bdfc3b02daa37d11056fa43db1a6cdef69.
(llvmorg-19-init-17714-gc05126bdfc3b)
See #99610
---
lld/COFF/Driver.cpp | 7 +-
lld/ELF/Driver.cpp | 6 +-
lld/wasm/Driver.cpp | 6 +-
.../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 +-
.../include/llvm/CodeGen/RuntimeLibcallUtil.h | 96 -----
llvm/include/llvm/CodeGen/RuntimeLibcalls.h | 113 +++++
llvm/include/llvm/CodeGen/TargetLowering.h | 37 +-
llvm/include/llvm/IR/RuntimeLibcalls.h | 126 ------
llvm/include/llvm/LTO/LTO.h | 2 +-
llvm/lib/CodeGen/AtomicExpandPass.cpp | 2 +-
llvm/lib/CodeGen/DwarfEHPrepare.cpp | 2 +-
llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +-
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 2 +-
.../SelectionDAG/StatepointLowering.cpp | 2 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 387 +++++++++++++++++-
llvm/lib/IR/CMakeLists.txt | 1 -
llvm/lib/IR/RuntimeLibcalls.cpp | 379 -----------------
llvm/lib/LTO/LTO.cpp | 14 +-
llvm/lib/Object/IRSymtab.cpp | 20 +-
llvm/lib/Target/AArch64/AArch64FastISel.cpp | 2 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
llvm/lib/Target/ARM/ARMFastISel.cpp | 2 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +-
llvm/lib/Target/ARM/ARMLegalizerInfo.h | 2 +-
llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 2 +-
.../Target/Hexagon/HexagonISelLowering.cpp | 4 +-
llvm/lib/Target/Lanai/LanaiISelLowering.cpp | 2 +-
.../LoongArch/LoongArchISelLowering.cpp | 2 +-
llvm/lib/Target/Mips/MipsISelLowering.cpp | 2 +-
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 2 +-
.../WebAssemblyRuntimeLibcallSignatures.cpp | 2 +-
.../WebAssemblyRuntimeLibcallSignatures.h | 2 +-
llvm/tools/lto/lto.cpp | 2 +-
37 files changed, 568 insertions(+), 676 deletions(-)
delete mode 100644 llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
create mode 100644 llvm/include/llvm/CodeGen/RuntimeLibcalls.h
delete mode 100644 llvm/include/llvm/IR/RuntimeLibcalls.h
delete mode 100644 llvm/lib/IR/RuntimeLibcalls.cpp
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 9e28b1c50be50..cef6271e4c8f8 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -2428,12 +2428,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
// file's symbol table. If any of those library functions are defined in a
// bitcode file in an archive member, we need to arrange to use LTO to
// compile those archive members by adding them to the link beforehand.
- if (!ctx.bitcodeFileInstances.empty()) {
- llvm::Triple TT(
- ctx.bitcodeFileInstances.front()->obj->getTargetTriple());
- for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT))
+ if (!ctx.bitcodeFileInstances.empty())
+ for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
ctx.symtab.addLibcall(s);
- }
// Windows specific -- if __load_config_used can be resolved, resolve it.
if (ctx.symtab.findUnderscore("_load_config_used"))
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 40e095a133d95..3a3070aea94f5 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2884,11 +2884,9 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
// to, i.e. if the symbol's definition is in bitcode. Any other required
// libcall symbols will be added to the link after LTO when we add the LTO
// object file to the link.
- if (!ctx.bitcodeFiles.empty()) {
- llvm::Triple TT(ctx.bitcodeFiles.front()->obj->getTargetTriple());
- for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT))
+ if (!ctx.bitcodeFiles.empty())
+ for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
handleLibcall(s);
- }
// Archive members defining __wrap symbols may be extracted.
std::vector<WrappedSymbol> wrapped = addWrappedSymbols(args);
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 8c83d17db02f5..b66b988005d58 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -1320,11 +1320,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
// We only need to add libcall symbols to the link before LTO if the symbol's
// definition is in bitcode. Any other required libcall symbols will be added
// to the link after LTO when we add the LTO object file to the link.
- if (!ctx.bitcodeFiles.empty()) {
- llvm::Triple TT(ctx.bitcodeFiles.front()->obj->getTargetTriple());
- for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT))
+ if (!ctx.bitcodeFiles.empty())
+ for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
handleLibcall(s);
- }
if (errorCount())
return;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index b17bc9aa2a44e..b599b0368b9aa 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetOpcodes.h"
namespace llvm {
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
deleted file mode 100644
index ce63dcc405fd5..0000000000000
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//===-- CodeGen/RuntimeLibcallUtil.h - Runtime Library Calls ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines some helper functions for runtime library calls.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_RUNTIMELIBCALLS_H
-#define LLVM_CODEGEN_RUNTIMELIBCALLS_H
-
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/RuntimeLibcalls.h"
-#include "llvm/Support/AtomicOrdering.h"
-
-namespace llvm {
-namespace RTLIB {
-
-/// GetFPLibCall - Helper to return the right libcall for the given floating
-/// point type, or UNKNOWN_LIBCALL if there is none.
-Libcall getFPLibCall(EVT VT, Libcall Call_F32, Libcall Call_F64,
- Libcall Call_F80, Libcall Call_F128, Libcall Call_PPCF128);
-
-/// getFPEXT - Return the FPEXT_*_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getFPEXT(EVT OpVT, EVT RetVT);
-
-/// getFPROUND - Return the FPROUND_*_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getFPROUND(EVT OpVT, EVT RetVT);
-
-/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getFPTOSINT(EVT OpVT, EVT RetVT);
-
-/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getFPTOUINT(EVT OpVT, EVT RetVT);
-
-/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getSINTTOFP(EVT OpVT, EVT RetVT);
-
-/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getUINTTOFP(EVT OpVT, EVT RetVT);
-
-/// getPOWI - Return the POWI_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getPOWI(EVT RetVT);
-
-/// getLDEXP - Return the LDEXP_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getLDEXP(EVT RetVT);
-
-/// getFREXP - Return the FREXP_* value for the given types, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getFREXP(EVT RetVT);
-
-/// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
-/// UNKNOWN_LIBCALL if there is none.
-Libcall getSYNC(unsigned Opc, MVT VT);
-
-/// Return the outline atomics value for the given atomic ordering, access
-/// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there
-/// is none.
-Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4], AtomicOrdering Order,
- uint64_t MemSize);
-
-/// Return the outline atomics value for the given opcode, atomic ordering
-/// and type, or UNKNOWN_LIBCALL if there is none.
-Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT);
-
-/// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return
-/// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
-/// UNKNOW_LIBCALL if there is none.
-Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
-
-/// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return
-/// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
-/// UNKNOW_LIBCALL if there is none.
-Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
-
-/// getMEMSET_ELEMENT_UNORDERED_ATOMIC - Return
-/// MEMSET_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
-/// UNKNOW_LIBCALL if there is none.
-Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
-
-} // namespace RTLIB
-} // namespace llvm
-
-#endif
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
new file mode 100644
index 0000000000000..3a407c4a4d940
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -0,0 +1,113 @@
+//===-- CodeGen/RuntimeLibcalls.h - Runtime Library Calls -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the enum representing the list of runtime library calls
+// the backend may emit during code generation, and also some helper functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_RUNTIMELIBCALLS_H
+#define LLVM_CODEGEN_RUNTIMELIBCALLS_H
+
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/AtomicOrdering.h"
+
+namespace llvm {
+namespace RTLIB {
+ /// RTLIB::Libcall enum - This enum defines all of the runtime library calls
+ /// the backend can emit. The various long double types cannot be merged,
+ /// because 80-bit library functions use "xf" and 128-bit use "tf".
+ ///
+ /// When adding PPCF128 functions here, note that their names generally need
+ /// to be overridden for Darwin with the xxx$LDBL128 form. See
+ /// PPCISelLowering.cpp.
+ ///
+ enum Libcall {
+#define HANDLE_LIBCALL(code, name) code,
+ #include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+ };
+
+ /// GetFPLibCall - Helper to return the right libcall for the given floating
+ /// point type, or UNKNOWN_LIBCALL if there is none.
+ Libcall getFPLibCall(EVT VT,
+ Libcall Call_F32,
+ Libcall Call_F64,
+ Libcall Call_F80,
+ Libcall Call_F128,
+ Libcall Call_PPCF128);
+
+ /// getFPEXT - Return the FPEXT_*_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getFPEXT(EVT OpVT, EVT RetVT);
+
+ /// getFPROUND - Return the FPROUND_*_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getFPROUND(EVT OpVT, EVT RetVT);
+
+ /// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getFPTOSINT(EVT OpVT, EVT RetVT);
+
+ /// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getFPTOUINT(EVT OpVT, EVT RetVT);
+
+ /// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getSINTTOFP(EVT OpVT, EVT RetVT);
+
+ /// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getUINTTOFP(EVT OpVT, EVT RetVT);
+
+ /// getPOWI - Return the POWI_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getPOWI(EVT RetVT);
+
+ /// getLDEXP - Return the LDEXP_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getLDEXP(EVT RetVT);
+
+ /// getFREXP - Return the FREXP_* value for the given types, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getFREXP(EVT RetVT);
+
+ /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getSYNC(unsigned Opc, MVT VT);
+
+ /// Return the outline atomics value for the given atomic ordering, access
+ /// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there
+ /// is none.
+ Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4],
+ AtomicOrdering Order, uint64_t MemSize);
+
+ /// Return the outline atomics value for the given opcode, atomic ordering
+ /// and type, or UNKNOWN_LIBCALL if there is none.
+ Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT);
+
+ /// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return
+ /// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+ /// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return
+ /// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+ /// getMEMSET_ELEMENT_UNORDERED_ATOMIC - Return
+ /// MEMSET_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+ /// UNKNOWN_LIBCALL if there is none.
+ Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+} // namespace RTLIB
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d4a2166bf768e..64c73820add64 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -31,7 +31,7 @@
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -45,7 +45,6 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
@@ -3415,40 +3414,44 @@ class TargetLoweringBase {
return nullptr;
}
+ //===--------------------------------------------------------------------===//
+ // Runtime Library hooks
+ //
+
/// Rename the default libcall routine name for the specified libcall.
void setLibcallName(RTLIB::Libcall Call, const char *Name) {
- Libcalls.setLibcallName(Call, Name);
+ LibcallRoutineNames[Call] = Name;
}
-
void setLibcallName(ArrayRef<RTLIB::Libcall> Calls, const char *Name) {
- Libcalls.setLibcallName(Calls, Name);
+ for (auto Call : Calls)
+ setLibcallName(Call, Name);
}
/// Get the libcall routine name for the specified libcall.
const char *getLibcallName(RTLIB::Libcall Call) const {
- return Libcalls.getLibcallName(Call);
+ return LibcallRoutineNames[Call];
}
/// Override the default CondCode to be used to test the result of the
/// comparison libcall against zero.
void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
- Libcalls.setCmpLibcallCC(Call, CC);
+ CmpLibcallCCs[Call] = CC;
}
/// Get the CondCode that's to be used to test the result of the comparison
/// libcall against zero.
ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
- return Libcalls.getCmpLibcallCC(Call);
+ return CmpLibcallCCs[Call];
}
/// Set the CallingConv that should be used for the specified libcall.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
- Libcalls.setLibcallCallingConv(Call, CC);
+ LibcallCallingConvs[Call] = CC;
}
/// Get the CallingConv that should be used for the specified libcall.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
- return Libcalls.getLibcallCallingConv(Call);
+ return LibcallCallingConvs[Call];
}
/// Execute target specific actions to finalize target lowering.
@@ -3627,8 +3630,18 @@ class TargetLoweringBase {
std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
PromoteToType;
- /// The list of libcalls that the target will use.
- RTLIB::RuntimeLibcallsInfo Libcalls;
+ /// Stores the name of each libcall.
+ const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
+
+ /// The ISD::CondCode that should be used to test the result of each of the
+ /// comparison libcall against zero.
+ ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
+
+ /// Stores the CallingConv that should be used for each libcall.
+ CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
+
+ /// Set default libcall names and calling conventions.
+ void InitLibcalls(const Triple &TT);
/// The bits of IndexedModeActions used to store the legalisation actions
/// We store the data as | ML | MS | L | S | each taking 4 bits.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
deleted file mode 100644
index 3057bff397b2f..0000000000000
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ /dev/null
@@ -1,126 +0,0 @@
-//===- RuntimeLibcalls.h - Interface for runtime libcalls -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a common interface to work with library calls into a
-// runtime that may be emitted by a given backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_RUNTIME_LIBCALLS_H
-#define LLVM_IR_RUNTIME_LIBCALLS_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/TargetParser/Triple.h"
-
-namespace llvm {
-namespace RTLIB {
-
-/// RTLIB::Libcall enum - This enum defines all of the runtime library calls
-/// the backend can emit. The various long double types cannot be merged,
-/// because 80-bit library functions use "xf" and 128-bit use "tf".
-///
-/// When adding PPCF128 functions here, note that their names generally need
-/// to be overridden for Darwin with the xxx$LDBL128 form. See
-/// PPCISelLowering.cpp.
-///
-enum Libcall {
-#define HANDLE_LIBCALL(code, name) code,
-#include "llvm/IR/RuntimeLibcalls.def"
-#undef HANDLE_LIBCALL
-};
-
-/// A simple container for information about the supported runtime calls.
-struct RuntimeLibcallsInfo {
- explicit RuntimeLibcallsInfo(const Triple &TT) {
- initLibcalls(TT);
- initCmpLibcallCCs();
- }
-
- /// Rename the default libcall routine name for the specified libcall.
- void setLibcallName(RTLIB::Libcall Call, const char *Name) {
- LibcallRoutineNames[Call] = Name;
- }
-
- void setLibcallName(ArrayRef<RTLIB::Libcall> Calls, const char *Name) {
- for (auto Call : Calls)
- setLibcallName(Call, Name);
- }
-
- /// Get the libcall routine name for the specified libcall.
- const char *getLibcallName(RTLIB::Libcall Call) const {
- return LibcallRoutineNames[Call];
- }
-
- /// Override the default CondCode to be used to test the result of the
- /// comparison libcall against zero.
- void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
- CmpLibcallCCs[Call] = CC;
- }
-
- /// Get the CondCode that's to be used to test the result of the comparison
- /// libcall against zero.
- ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
- return CmpLibcallCCs[Call];
- }
-
- /// Set the CallingConv that should be used for the specified libcall.
- void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
- LibcallCallingConvs[Call] = CC;
- }
-
- /// Get the CallingConv that should be used for the specified libcall.
- CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
- return LibcallCallingConvs[Call];
- }
-
- iterator_range<const char **> getLibcallNames() {
- return llvm::make_range(LibcallRoutineNames,
- LibcallRoutineNames + RTLIB::UNKNOWN_LIBCALL);
- }
-
-private:
- /// Stores the name each libcall.
- const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
-
- /// The ISD::CondCode that should be used to test the result of each of the
- /// comparison libcall against zero.
- ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
-
- /// Stores the CallingConv that should be used for each libcall.
- CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
-
- static bool darwinHasSinCos(const Triple &TT) {
- assert(TT.isOSDarwin() && "should be called with darwin triple");
- // Don't bother with 32 bit x86.
- if (TT.getArch() == Triple::x86)
- return false;
- // Macos < 10.9 has no sincos_stret.
- if (TT.isMacOSX())
- return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit();
- // iOS < 7.0 has no sincos_stret.
- if (TT.isiOS())
- return !TT.isOSVersionLT(7, 0);
- // Any other darwin such as WatchOS/TvOS is new enough.
- return true;
- }
-
- /// Sets default libcall calling conventions.
- void initCmpLibcallCCs();
-
- /// Set default libcall names. If a target wants to opt-out of a libcall it
- /// should be placed here.
- void initLibcalls(const Triple &TT);
-};
-
-} // namespace RTLIB
-} // namespace llvm
-
-#endif // LLVM_IR_RUNTIME_LIBCALLS_H
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 30eda34cd7ba5..94996ae89e35d 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -301,7 +301,7 @@ class LTO {
/// Static method that returns a list of libcall symbols that can be generated
/// by LTO but might not be visible from bitcode symbol table.
- static SmallVector<const char *> getRuntimeLibcallSymbols(const Triple &TT);
+ static ArrayRef<const char*> getRuntimeLibcallSymbols();
private:
Config Conf;
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index ebcf76175a36b..cea09bcb45386 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -21,7 +21,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/AtomicExpandUtils.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index 324329ce989e7..09e7cfb12bdba 100644
--- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -18,7 +18,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 68a8a273a1b47..b10d51adb69ae 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -38,7 +38,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/SwitchLoweringUtils.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 640a425ffa735..b8653aff690ef 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9b2153c68ccae..b584aa5923ed6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -37,7 +37,7 @@
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7f5b46af01c62..9f515739ee048 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 02d44cd36ae53..0ccae50286345 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -36,7 +36,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 98a795edb7a03..c179c97029c7a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -44,7 +44,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/StackMaps.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 4268da8670d50..671ec84fb9416 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -26,7 +26,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/StackMaps.h"
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8040f1eeae810..f0fa6aa772e37 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -28,7 +28,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
@@ -98,6 +98,350 @@ static cl::opt<bool> DisableStrictNodeMutation("disable-strictnode-mutation",
cl::desc("Don't mutate strict-float node to a legalize node"),
cl::init(false), cl::Hidden);
+static bool darwinHasSinCos(const Triple &TT) {
+ assert(TT.isOSDarwin() && "should be called with darwin triple");
+ // Don't bother with 32 bit x86.
+ if (TT.getArch() == Triple::x86)
+ return false;
+ // Macos < 10.9 has no sincos_stret.
+ if (TT.isMacOSX())
+ return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit();
+ // iOS < 7.0 has no sincos_stret.
+ if (TT.isiOS())
+ return !TT.isOSVersionLT(7, 0);
+ // Any other darwin such as WatchOS/TvOS is new enough.
+ return true;
+}
+
+void TargetLoweringBase::InitLibcalls(const Triple &TT) {
+#define HANDLE_LIBCALL(code, name) \
+ setLibcallName(RTLIB::code, name);
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+ // Initialize calling conventions to their default.
+ for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
+ setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
+
+ // Use the f128 variants of math functions on x86_64
+ if (TT.getArch() == Triple::ArchType::x86_64 && TT.isGNUEnvironment()) {
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
+ setLibcallName(RTLIB::FMA_F128, "fmaf128");
+ setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
+ setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
+ setLibcallName(RTLIB::EXP10_F128, "exp10f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::TAN_F128, "tanf128");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
+ setLibcallName(RTLIB::ASIN_F128, "asinf128");
+ setLibcallName(RTLIB::ACOS_F128, "acosf128");
+ setLibcallName(RTLIB::ATAN_F128, "atanf128");
+ setLibcallName(RTLIB::SINH_F128, "sinhf128");
+ setLibcallName(RTLIB::COSH_F128, "coshf128");
+ setLibcallName(RTLIB::TANH_F128, "tanhf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
+ setLibcallName(RTLIB::CEIL_F128, "ceilf128");
+ setLibcallName(RTLIB::TRUNC_F128, "truncf128");
+ setLibcallName(RTLIB::RINT_F128, "rintf128");
+ setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
+ setLibcallName(RTLIB::ROUND_F128, "roundf128");
+ setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
+ setLibcallName(RTLIB::FLOOR_F128, "floorf128");
+ setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::LROUND_F128, "lroundf128");
+ setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
+ setLibcallName(RTLIB::LRINT_F128, "lrintf128");
+ setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
+ setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
+ setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+ }
+
+ // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
+ if (TT.isPPC()) {
+ setLibcallName(RTLIB::ADD_F128, "__addkf3");
+ setLibcallName(RTLIB::SUB_F128, "__subkf3");
+ setLibcallName(RTLIB::MUL_F128, "__mulkf3");
+ setLibcallName(RTLIB::DIV_F128, "__divkf3");
+ setLibcallName(RTLIB::POWI_F128, "__powikf2");
+ setLibcallName(RTLIB::FPEXT_F32_F128, "__extendsfkf2");
+ setLibcallName(RTLIB::FPEXT_F64_F128, "__extenddfkf2");
+ setLibcallName(RTLIB::FPROUND_F128_F32, "__trunckfsf2");
+ setLibcallName(RTLIB::FPROUND_F128_F64, "__trunckfdf2");
+ setLibcallName(RTLIB::FPTOSINT_F128_I32, "__fixkfsi");
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "__fixkfdi");
+ setLibcallName(RTLIB::FPTOSINT_F128_I128, "__fixkfti");
+ setLibcallName(RTLIB::FPTOUINT_F128_I32, "__fixunskfsi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "__fixunskfdi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I128, "__fixunskfti");
+ setLibcallName(RTLIB::SINTTOFP_I32_F128, "__floatsikf");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "__floatdikf");
+ setLibcallName(RTLIB::SINTTOFP_I128_F128, "__floattikf");
+ setLibcallName(RTLIB::UINTTOFP_I32_F128, "__floatunsikf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "__floatundikf");
+ setLibcallName(RTLIB::UINTTOFP_I128_F128, "__floatuntikf");
+ setLibcallName(RTLIB::OEQ_F128, "__eqkf2");
+ setLibcallName(RTLIB::UNE_F128, "__nekf2");
+ setLibcallName(RTLIB::OGE_F128, "__gekf2");
+ setLibcallName(RTLIB::OLT_F128, "__ltkf2");
+ setLibcallName(RTLIB::OLE_F128, "__lekf2");
+ setLibcallName(RTLIB::OGT_F128, "__gtkf2");
+ setLibcallName(RTLIB::UO_F128, "__unordkf2");
+ }
+
+ // A few names are different on particular architectures or environments.
+ if (TT.isOSDarwin()) {
+ // For f16/f32 conversions, Darwin uses the standard naming scheme, instead
+ // of the gnueabi-style __gnu_*_ieee.
+ // FIXME: What about other targets?
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+
+ // Some darwins have an optimized __bzero/bzero function.
+ switch (TT.getArch()) {
+ case Triple::x86:
+ case Triple::x86_64:
+ if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
+ setLibcallName(RTLIB::BZERO, "__bzero");
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_32:
+ setLibcallName(RTLIB::BZERO, "bzero");
+ break;
+ default:
+ break;
+ }
+
+ if (darwinHasSinCos(TT)) {
+ setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
+ setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
+ if (TT.isWatchABI()) {
+ setLibcallCallingConv(RTLIB::SINCOS_STRET_F32,
+ CallingConv::ARM_AAPCS_VFP);
+ setLibcallCallingConv(RTLIB::SINCOS_STRET_F64,
+ CallingConv::ARM_AAPCS_VFP);
+ }
+ }
+
+ switch (TT.getOS()) {
+ case Triple::MacOSX:
+ if (TT.isMacOSXVersionLT(10, 9)) {
+ setLibcallName(RTLIB::EXP10_F32, nullptr);
+ setLibcallName(RTLIB::EXP10_F64, nullptr);
+ } else {
+ setLibcallName(RTLIB::EXP10_F32, "__exp10f");
+ setLibcallName(RTLIB::EXP10_F64, "__exp10");
+ }
+ break;
+ case Triple::IOS:
+ if (TT.isOSVersionLT(7, 0)) {
+ setLibcallName(RTLIB::EXP10_F32, nullptr);
+ setLibcallName(RTLIB::EXP10_F64, nullptr);
+ break;
+ }
+ [[fallthrough]];
+ case Triple::TvOS:
+ case Triple::WatchOS:
+ case Triple::XROS:
+ setLibcallName(RTLIB::EXP10_F32, "__exp10f");
+ setLibcallName(RTLIB::EXP10_F64, "__exp10");
+ break;
+ default:
+ break;
+ }
+ } else {
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
+ }
+
+ if (TT.isGNUEnvironment() || TT.isOSFuchsia() ||
+ (TT.isAndroid() && !TT.isAndroidVersionLT(9))) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ setLibcallName(RTLIB::SINCOS_F80, "sincosl");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosl");
+ setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");
+ }
+
+ if (TT.isPS()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ }
+
+ if (TT.isOSOpenBSD()) {
+ setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
+ }
+
+ if (TT.isOSWindows() && !TT.isOSCygMing()) {
+ setLibcallName(RTLIB::LDEXP_F32, nullptr);
+ setLibcallName(RTLIB::LDEXP_F80, nullptr);
+ setLibcallName(RTLIB::LDEXP_F128, nullptr);
+ setLibcallName(RTLIB::LDEXP_PPCF128, nullptr);
+
+ setLibcallName(RTLIB::FREXP_F32, nullptr);
+ setLibcallName(RTLIB::FREXP_F80, nullptr);
+ setLibcallName(RTLIB::FREXP_F128, nullptr);
+ setLibcallName(RTLIB::FREXP_PPCF128, nullptr);
+ }
+
+ if (TT.isAArch64()) {
+ if (TT.isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
+ }
+
+ // Disable most libcalls on AMDGPU.
+ if (TT.isAMDGPU()) {
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+ if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+ }
+ }
+
+ // Disable most libcalls on NVPTX.
+ if (TT.isNVPTX()) {
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
+ if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+ }
+
+ if (TT.isARM() || TT.isThumb()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+
+ if (TT.isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
+ }
+
+ if (TT.getArch() == Triple::ArchType::avr) {
+ // Division rtlib functions (not supported), use divmod functions instead
+ setLibcallName(RTLIB::SDIV_I8, nullptr);
+ setLibcallName(RTLIB::SDIV_I16, nullptr);
+ setLibcallName(RTLIB::SDIV_I32, nullptr);
+ setLibcallName(RTLIB::UDIV_I8, nullptr);
+ setLibcallName(RTLIB::UDIV_I16, nullptr);
+ setLibcallName(RTLIB::UDIV_I32, nullptr);
+
+ // Modulus rtlib functions (not supported), use divmod functions instead
+ setLibcallName(RTLIB::SREM_I8, nullptr);
+ setLibcallName(RTLIB::SREM_I16, nullptr);
+ setLibcallName(RTLIB::SREM_I32, nullptr);
+ setLibcallName(RTLIB::UREM_I8, nullptr);
+ setLibcallName(RTLIB::UREM_I16, nullptr);
+ setLibcallName(RTLIB::UREM_I32, nullptr);
+ }
+
+ if (TT.getArch() == Triple::ArchType::hexagon) {
+ // These cause problems when the shift amount is non-constant.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ if (TT.isLoongArch()) {
+ if (!TT.isLoongArch64()) {
+ // Set libcalls.
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ // The MULO libcall is not part of libgcc, only compiler-rt.
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ }
+ // The MULO libcall is not part of libgcc, only compiler-rt.
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+ }
+
+ if (TT.isMIPS32()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+ }
+
+ if (TT.isPPC()) {
+ if (!TT.isPPC64()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ }
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+ }
+
+ if (TT.isRISCV32()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ }
+
+ if (TT.isSPARC()) {
+ if (!TT.isSPARC64()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+ }
+
+ if (TT.isSystemZ()) {
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ }
+
+ if (TT.isX86()) {
+ if (TT.getArch() == Triple::ArchType::x86) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, nullptr);
+ setLibcallName(RTLIB::SRL_I128, nullptr);
+ setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
+ // The MULO libcall is not part of libgcc, only compiler-rt.
+ setLibcallName(RTLIB::MULO_I64, nullptr);
+ }
+
+ // The MULO libcall is not part of libgcc, only compiler-rt.
+ setLibcallName(RTLIB::MULO_I128, nullptr);
+
+ if (TT.isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
+ }
+}
+
/// GetFPLibCall - Helper to return the right libcall for the given floating
/// point type, or UNKNOWN_LIBCALL if there is none.
RTLIB::Libcall RTLIB::getFPLibCall(EVT VT,
@@ -574,9 +918,41 @@ RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
}
}
+/// InitCmpLibcallCCs - Set default comparison libcall CC.
+static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
+ std::fill(CCs, CCs + RTLIB::UNKNOWN_LIBCALL, ISD::SETCC_INVALID);
+ CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
+ CCs[RTLIB::UNE_F32] = ISD::SETNE;
+ CCs[RTLIB::UNE_F64] = ISD::SETNE;
+ CCs[RTLIB::UNE_F128] = ISD::SETNE;
+ CCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
+ CCs[RTLIB::OGE_F32] = ISD::SETGE;
+ CCs[RTLIB::OGE_F64] = ISD::SETGE;
+ CCs[RTLIB::OGE_F128] = ISD::SETGE;
+ CCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
+ CCs[RTLIB::OLT_F32] = ISD::SETLT;
+ CCs[RTLIB::OLT_F64] = ISD::SETLT;
+ CCs[RTLIB::OLT_F128] = ISD::SETLT;
+ CCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
+ CCs[RTLIB::OLE_F32] = ISD::SETLE;
+ CCs[RTLIB::OLE_F64] = ISD::SETLE;
+ CCs[RTLIB::OLE_F128] = ISD::SETLE;
+ CCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
+ CCs[RTLIB::OGT_F32] = ISD::SETGT;
+ CCs[RTLIB::OGT_F64] = ISD::SETGT;
+ CCs[RTLIB::OGT_F128] = ISD::SETGT;
+ CCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
+ CCs[RTLIB::UO_F32] = ISD::SETNE;
+ CCs[RTLIB::UO_F64] = ISD::SETNE;
+ CCs[RTLIB::UO_F128] = ISD::SETNE;
+ CCs[RTLIB::UO_PPCF128] = ISD::SETNE;
+}
+
/// NOTE: The TargetMachine owns TLOF.
-TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
- : TM(tm), Libcalls(TM.getTargetTriple()) {
+TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
initActions();
// Perform these initializations only once.
@@ -608,6 +984,11 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
MinCmpXchgSizeInBits = 0;
SupportsUnalignedAtomics = false;
+
+ std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr);
+
+ InitLibcalls(TM.getTargetTriple());
+ InitCmpLibcallCCs(CmpLibcallCCs);
}
void TargetLoweringBase::initActions() {
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 8bf199f0f44c9..4eb08f648760d 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -73,7 +73,6 @@ add_llvm_component_library(LLVMCore
VectorBuilder.cpp
Verifier.cpp
VFABIDemangler.cpp
- RuntimeLibcalls.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/IR
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
deleted file mode 100644
index de3db557d8b50..0000000000000
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-//===- RuntimeLibcalls.cpp - Interface for runtime libcalls -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/RuntimeLibcalls.h"
-
-using namespace llvm;
-using namespace RTLIB;
-
-/// Set default libcall names. If a target wants to opt-out of a libcall it
-/// should be placed here.
-void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
- std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
- nullptr);
-
-#define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name);
-#include "llvm/IR/RuntimeLibcalls.def"
-#undef HANDLE_LIBCALL
-
- // Initialize calling conventions to their default.
- for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
- setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
-
- // Use the f128 variants of math functions on x86_64
- if (TT.getArch() == Triple::ArchType::x86_64 && TT.isGNUEnvironment()) {
- setLibcallName(RTLIB::REM_F128, "fmodf128");
- setLibcallName(RTLIB::FMA_F128, "fmaf128");
- setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
- setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
- setLibcallName(RTLIB::LOG_F128, "logf128");
- setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
- setLibcallName(RTLIB::LOG2_F128, "log2f128");
- setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
- setLibcallName(RTLIB::LOG10_F128, "log10f128");
- setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
- setLibcallName(RTLIB::EXP_F128, "expf128");
- setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
- setLibcallName(RTLIB::EXP2_F128, "exp2f128");
- setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
- setLibcallName(RTLIB::EXP10_F128, "exp10f128");
- setLibcallName(RTLIB::SIN_F128, "sinf128");
- setLibcallName(RTLIB::COS_F128, "cosf128");
- setLibcallName(RTLIB::TAN_F128, "tanf128");
- setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
- setLibcallName(RTLIB::ASIN_F128, "asinf128");
- setLibcallName(RTLIB::ACOS_F128, "acosf128");
- setLibcallName(RTLIB::ATAN_F128, "atanf128");
- setLibcallName(RTLIB::SINH_F128, "sinhf128");
- setLibcallName(RTLIB::COSH_F128, "coshf128");
- setLibcallName(RTLIB::TANH_F128, "tanhf128");
- setLibcallName(RTLIB::POW_F128, "powf128");
- setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
- setLibcallName(RTLIB::CEIL_F128, "ceilf128");
- setLibcallName(RTLIB::TRUNC_F128, "truncf128");
- setLibcallName(RTLIB::RINT_F128, "rintf128");
- setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
- setLibcallName(RTLIB::ROUND_F128, "roundf128");
- setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
- setLibcallName(RTLIB::FLOOR_F128, "floorf128");
- setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
- setLibcallName(RTLIB::FMIN_F128, "fminf128");
- setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
- setLibcallName(RTLIB::LROUND_F128, "lroundf128");
- setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
- setLibcallName(RTLIB::LRINT_F128, "lrintf128");
- setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
- setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
- setLibcallName(RTLIB::FREXP_F128, "frexpf128");
- }
-
- // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
- if (TT.isPPC()) {
- setLibcallName(RTLIB::ADD_F128, "__addkf3");
- setLibcallName(RTLIB::SUB_F128, "__subkf3");
- setLibcallName(RTLIB::MUL_F128, "__mulkf3");
- setLibcallName(RTLIB::DIV_F128, "__divkf3");
- setLibcallName(RTLIB::POWI_F128, "__powikf2");
- setLibcallName(RTLIB::FPEXT_F32_F128, "__extendsfkf2");
- setLibcallName(RTLIB::FPEXT_F64_F128, "__extenddfkf2");
- setLibcallName(RTLIB::FPROUND_F128_F32, "__trunckfsf2");
- setLibcallName(RTLIB::FPROUND_F128_F64, "__trunckfdf2");
- setLibcallName(RTLIB::FPTOSINT_F128_I32, "__fixkfsi");
- setLibcallName(RTLIB::FPTOSINT_F128_I64, "__fixkfdi");
- setLibcallName(RTLIB::FPTOSINT_F128_I128, "__fixkfti");
- setLibcallName(RTLIB::FPTOUINT_F128_I32, "__fixunskfsi");
- setLibcallName(RTLIB::FPTOUINT_F128_I64, "__fixunskfdi");
- setLibcallName(RTLIB::FPTOUINT_F128_I128, "__fixunskfti");
- setLibcallName(RTLIB::SINTTOFP_I32_F128, "__floatsikf");
- setLibcallName(RTLIB::SINTTOFP_I64_F128, "__floatdikf");
- setLibcallName(RTLIB::SINTTOFP_I128_F128, "__floattikf");
- setLibcallName(RTLIB::UINTTOFP_I32_F128, "__floatunsikf");
- setLibcallName(RTLIB::UINTTOFP_I64_F128, "__floatundikf");
- setLibcallName(RTLIB::UINTTOFP_I128_F128, "__floatuntikf");
- setLibcallName(RTLIB::OEQ_F128, "__eqkf2");
- setLibcallName(RTLIB::UNE_F128, "__nekf2");
- setLibcallName(RTLIB::OGE_F128, "__gekf2");
- setLibcallName(RTLIB::OLT_F128, "__ltkf2");
- setLibcallName(RTLIB::OLE_F128, "__lekf2");
- setLibcallName(RTLIB::OGT_F128, "__gtkf2");
- setLibcallName(RTLIB::UO_F128, "__unordkf2");
- }
-
- // A few names are different on particular architectures or environments.
- if (TT.isOSDarwin()) {
- // For f16/f32 conversions, Darwin uses the standard naming scheme,
- // instead of the gnueabi-style __gnu_*_ieee.
- // FIXME: What about other targets?
- setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
- setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
-
- // Some darwins have an optimized __bzero/bzero function.
- switch (TT.getArch()) {
- case Triple::x86:
- case Triple::x86_64:
- if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
- setLibcallName(RTLIB::BZERO, "__bzero");
- break;
- case Triple::aarch64:
- case Triple::aarch64_32:
- setLibcallName(RTLIB::BZERO, "bzero");
- break;
- default:
- break;
- }
-
- if (darwinHasSinCos(TT)) {
- setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
- setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
- if (TT.isWatchABI()) {
- setLibcallCallingConv(RTLIB::SINCOS_STRET_F32,
- CallingConv::ARM_AAPCS_VFP);
- setLibcallCallingConv(RTLIB::SINCOS_STRET_F64,
- CallingConv::ARM_AAPCS_VFP);
- }
- }
-
- switch (TT.getOS()) {
- case Triple::MacOSX:
- if (TT.isMacOSXVersionLT(10, 9)) {
- setLibcallName(RTLIB::EXP10_F32, nullptr);
- setLibcallName(RTLIB::EXP10_F64, nullptr);
- } else {
- setLibcallName(RTLIB::EXP10_F32, "__exp10f");
- setLibcallName(RTLIB::EXP10_F64, "__exp10");
- }
- break;
- case Triple::IOS:
- if (TT.isOSVersionLT(7, 0)) {
- setLibcallName(RTLIB::EXP10_F32, nullptr);
- setLibcallName(RTLIB::EXP10_F64, nullptr);
- break;
- }
- [[fallthrough]];
- case Triple::TvOS:
- case Triple::WatchOS:
- case Triple::XROS:
- setLibcallName(RTLIB::EXP10_F32, "__exp10f");
- setLibcallName(RTLIB::EXP10_F64, "__exp10");
- break;
- default:
- break;
- }
- } else {
- setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
- setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
- }
-
- if (TT.isGNUEnvironment() || TT.isOSFuchsia() ||
- (TT.isAndroid() && !TT.isAndroidVersionLT(9))) {
- setLibcallName(RTLIB::SINCOS_F32, "sincosf");
- setLibcallName(RTLIB::SINCOS_F64, "sincos");
- setLibcallName(RTLIB::SINCOS_F80, "sincosl");
- setLibcallName(RTLIB::SINCOS_F128, "sincosl");
- setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");
- }
-
- if (TT.isPS()) {
- setLibcallName(RTLIB::SINCOS_F32, "sincosf");
- setLibcallName(RTLIB::SINCOS_F64, "sincos");
- }
-
- if (TT.isOSOpenBSD()) {
- setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
- }
-
- if (TT.isOSWindows() && !TT.isOSCygMing()) {
- setLibcallName(RTLIB::LDEXP_F32, nullptr);
- setLibcallName(RTLIB::LDEXP_F80, nullptr);
- setLibcallName(RTLIB::LDEXP_F128, nullptr);
- setLibcallName(RTLIB::LDEXP_PPCF128, nullptr);
-
- setLibcallName(RTLIB::FREXP_F32, nullptr);
- setLibcallName(RTLIB::FREXP_F80, nullptr);
- setLibcallName(RTLIB::FREXP_F128, nullptr);
- setLibcallName(RTLIB::FREXP_PPCF128, nullptr);
- }
-
- if (TT.isAArch64()) {
- if (TT.isOSMSVCRT()) {
- // MSVCRT doesn't have powi; fall back to pow
- setLibcallName(RTLIB::POWI_F32, nullptr);
- setLibcallName(RTLIB::POWI_F64, nullptr);
- }
- }
-
- // Disable most libcalls on AMDGPU.
- if (TT.isAMDGPU()) {
- for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
- if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
- setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
- }
- }
-
- // Disable most libcalls on NVPTX.
- if (TT.isNVPTX()) {
- for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
- if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
- setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
- }
-
- if (TT.isARM() || TT.isThumb()) {
- // These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- setLibcallName(RTLIB::MUL_I128, nullptr);
- setLibcallName(RTLIB::MULO_I64, nullptr);
- setLibcallName(RTLIB::MULO_I128, nullptr);
-
- if (TT.isOSMSVCRT()) {
- // MSVCRT doesn't have powi; fall back to pow
- setLibcallName(RTLIB::POWI_F32, nullptr);
- setLibcallName(RTLIB::POWI_F64, nullptr);
- }
- }
-
- if (TT.getArch() == Triple::ArchType::avr) {
- // Division rtlib functions (not supported), use divmod functions instead
- setLibcallName(RTLIB::SDIV_I8, nullptr);
- setLibcallName(RTLIB::SDIV_I16, nullptr);
- setLibcallName(RTLIB::SDIV_I32, nullptr);
- setLibcallName(RTLIB::UDIV_I8, nullptr);
- setLibcallName(RTLIB::UDIV_I16, nullptr);
- setLibcallName(RTLIB::UDIV_I32, nullptr);
-
- // Modulus rtlib functions (not supported), use divmod functions instead
- setLibcallName(RTLIB::SREM_I8, nullptr);
- setLibcallName(RTLIB::SREM_I16, nullptr);
- setLibcallName(RTLIB::SREM_I32, nullptr);
- setLibcallName(RTLIB::UREM_I8, nullptr);
- setLibcallName(RTLIB::UREM_I16, nullptr);
- setLibcallName(RTLIB::UREM_I32, nullptr);
- }
-
- if (TT.getArch() == Triple::ArchType::hexagon) {
- // These cause problems when the shift amount is non-constant.
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- }
-
- if (TT.isLoongArch()) {
- if (!TT.isLoongArch64()) {
- // Set libcalls.
- setLibcallName(RTLIB::MUL_I128, nullptr);
- // The MULO libcall is not part of libgcc, only compiler-rt.
- setLibcallName(RTLIB::MULO_I64, nullptr);
- }
- // The MULO libcall is not part of libgcc, only compiler-rt.
- setLibcallName(RTLIB::MULO_I128, nullptr);
- }
-
- if (TT.isMIPS32()) {
- // These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- setLibcallName(RTLIB::MUL_I128, nullptr);
- setLibcallName(RTLIB::MULO_I64, nullptr);
- setLibcallName(RTLIB::MULO_I128, nullptr);
- }
-
- if (TT.isPPC()) {
- if (!TT.isPPC64()) {
- // These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- setLibcallName(RTLIB::MUL_I128, nullptr);
- setLibcallName(RTLIB::MULO_I64, nullptr);
- }
- setLibcallName(RTLIB::MULO_I128, nullptr);
- }
-
- if (TT.isRISCV32()) {
- // These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- setLibcallName(RTLIB::MUL_I128, nullptr);
- setLibcallName(RTLIB::MULO_I64, nullptr);
- }
-
- if (TT.isSPARC()) {
- if (!TT.isSPARC64()) {
- // These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::MULO_I64, nullptr);
- setLibcallName(RTLIB::MUL_I128, nullptr);
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- }
- setLibcallName(RTLIB::MULO_I128, nullptr);
- }
-
- if (TT.isSystemZ()) {
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- }
-
- if (TT.isX86()) {
- if (TT.getArch() == Triple::ArchType::x86) {
- // These libcalls are not available in 32-bit.
- setLibcallName(RTLIB::SHL_I128, nullptr);
- setLibcallName(RTLIB::SRL_I128, nullptr);
- setLibcallName(RTLIB::SRA_I128, nullptr);
- setLibcallName(RTLIB::MUL_I128, nullptr);
- // The MULO libcall is not part of libgcc, only compiler-rt.
- setLibcallName(RTLIB::MULO_I64, nullptr);
- }
-
- // The MULO libcall is not part of libgcc, only compiler-rt.
- setLibcallName(RTLIB::MULO_I128, nullptr);
-
- if (TT.isOSMSVCRT()) {
- // MSVCRT doesn't have powi; fall back to pow
- setLibcallName(RTLIB::POWI_F32, nullptr);
- setLibcallName(RTLIB::POWI_F64, nullptr);
- }
- }
-}
-
-void RuntimeLibcallsInfo::initCmpLibcallCCs() {
- std::fill(CmpLibcallCCs, CmpLibcallCCs + RTLIB::UNKNOWN_LIBCALL,
- ISD::SETCC_INVALID);
- CmpLibcallCCs[RTLIB::OEQ_F32] = ISD::SETEQ;
- CmpLibcallCCs[RTLIB::OEQ_F64] = ISD::SETEQ;
- CmpLibcallCCs[RTLIB::OEQ_F128] = ISD::SETEQ;
- CmpLibcallCCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
- CmpLibcallCCs[RTLIB::UNE_F32] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::UNE_F64] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::UNE_F128] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::OGE_F32] = ISD::SETGE;
- CmpLibcallCCs[RTLIB::OGE_F64] = ISD::SETGE;
- CmpLibcallCCs[RTLIB::OGE_F128] = ISD::SETGE;
- CmpLibcallCCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
- CmpLibcallCCs[RTLIB::OLT_F32] = ISD::SETLT;
- CmpLibcallCCs[RTLIB::OLT_F64] = ISD::SETLT;
- CmpLibcallCCs[RTLIB::OLT_F128] = ISD::SETLT;
- CmpLibcallCCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
- CmpLibcallCCs[RTLIB::OLE_F32] = ISD::SETLE;
- CmpLibcallCCs[RTLIB::OLE_F64] = ISD::SETLE;
- CmpLibcallCCs[RTLIB::OLE_F128] = ISD::SETLE;
- CmpLibcallCCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
- CmpLibcallCCs[RTLIB::OGT_F32] = ISD::SETGT;
- CmpLibcallCCs[RTLIB::OGT_F64] = ISD::SETGT;
- CmpLibcallCCs[RTLIB::OGT_F128] = ISD::SETGT;
- CmpLibcallCCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
- CmpLibcallCCs[RTLIB::UO_F32] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::UO_F64] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::UO_F128] = ISD::SETNE;
- CmpLibcallCCs[RTLIB::UO_PPCF128] = ISD::SETNE;
-}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index d303f228aa72c..4ffacbb9f0ffd 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -30,7 +30,6 @@
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/LTO/LTOBackend.h"
#include "llvm/LTO/SummaryBasedOptimizations.h"
#include "llvm/Linker/IRMover.h"
@@ -1358,13 +1357,14 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
-SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
- RTLIB::RuntimeLibcallsInfo Libcalls(TT);
+static const char *libcallRoutineNames[] = {
+#define HANDLE_LIBCALL(code, name) name,
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+};
- SmallVector<const char *> LibcallSymbols;
- copy_if(Libcalls.getLibcallNames(), std::back_inserter(LibcallSymbols),
- [](const char *Name) { return Name; });
- return LibcallSymbols;
+ArrayRef<const char*> LTO::getRuntimeLibcallSymbols() {
+ return ArrayRef(libcallRoutineNames);
}
/// This class defines the interface to the ThinLTO backend.
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 2a2b235461a55..7e29da4d5e948 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -22,7 +22,6 @@
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Object/SymbolicFile.h"
@@ -47,6 +46,9 @@ static cl::opt<bool> DisableBitcodeVersionUpgrade(
cl::desc("Disable automatic bitcode upgrade for version mismatch"));
static const char *PreservedSymbols[] = {
+#define HANDLE_LIBCALL(code, name) name,
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
// There are global variables, so put it here instead of in
// RuntimeLibcalls.def.
// TODO: Are there similar such variables?
@@ -213,16 +215,9 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) {
- DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
-
- RTLIB::RuntimeLibcallsInfo Libcalls(TT);
- for (const char *Name : Libcalls.getLibcallNames()) {
- if (Name)
- PreservedSymbolSet.insert(Name);
- }
- return PreservedSymbolSet;
+static DenseSet<StringRef> buildPreservedSymbolsSet() {
+ return DenseSet<StringRef>(std::begin(PreservedSymbols),
+ std::end(PreservedSymbols));
}
Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
@@ -281,8 +276,7 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
setStr(Sym.IRName, GV->getName());
static const DenseSet<StringRef> PreservedSymbolsSet =
- buildPreservedSymbolsSet(
- llvm::Triple(GV->getParent()->getTargetTriple()));
+ buildPreservedSymbolsSet();
bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
if (Used.count(GV) || IsPreservedSymbol)
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 527496f1a6374..e3c5a143b2889 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -35,7 +35,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Argument.h"
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 84de1ee8f8923..dbaf31166229b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -48,7 +48,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 1cc21da51d1e6..61d2928fe6d41 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -40,7 +40,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 75d16a42d0205..2683b5741d459 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -53,7 +53,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/llvm/lib/Target/ARM/ARMLegalizerInfo.h
index 9e10638233e6d..d6ce4eb1055b4 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -16,7 +16,7 @@
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/IR/Instructions.h"
namespace llvm {
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index 275b1c0f8dc01..ffa8b50493510 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -14,7 +14,7 @@
#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
#include "MCTargetDesc/ARMAddressingModes.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
namespace llvm {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 7aeaebc584c64..918fc127762fc 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -27,7 +27,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -39,12 +39,12 @@
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index f6763a35cc0d5..c7848356682d0 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -27,7 +27,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 6072e5e244263..d5d047b3ee1a5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -22,7 +22,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsLoongArch.h"
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 0f2047fcac640..8cb64a0e6f4bf 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -40,7 +40,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 898d1f80d0564..564c0f74d975d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -48,7 +48,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index ba3ab5164af26..53619eaa6f562 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -21,7 +21,7 @@
#include "WebAssemblyRuntimeLibcallSignatures.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
using namespace llvm;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index 966b84aa4951b..ff6515d5bf4e6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -16,7 +16,7 @@
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
namespace llvm {
diff --git a/llvm/tools/lto/lto.cpp b/llvm/tools/lto/lto.cpp
index d68cff839604f..ece6dd0f10830 100644
--- a/llvm/tools/lto/lto.cpp
+++ b/llvm/tools/lto/lto.cpp
@@ -691,7 +691,7 @@ extern const char *lto_input_get_dependent_library(lto_input_t input,
}
extern const char *const *lto_runtime_lib_symbols_list(size_t *size) {
- auto symbols = lto::LTO::getRuntimeLibcallSymbols(Triple());
+ auto symbols = lto::LTO::getRuntimeLibcallSymbols();
*size = symbols.size();
return symbols.data();
}
>From e2965fe512162baeb5bdb66f1a675a47c7fab5ca Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 20 Jul 2024 12:10:09 +0900
Subject: [PATCH 34/39] Revert "[LLVM][LTO] Add missing dependency"
This reverts commit bb604ae9b8adbc0a0852eb68545eff685902f41b.
(llvmorg-19-init-17804-gbb604ae9b8ad)
See #99610
---
llvm/lib/IR/CMakeLists.txt | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 4eb08f648760d..6b3224e22ffb6 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -81,7 +81,6 @@ add_llvm_component_library(LLVMCore
${LLVM_PTHREAD_LIB}
DEPENDS
- vt_gen
intrinsics_gen
LINK_COMPONENTS
>From 168ecd706904d6ce221dc5107da92c56aea7c8e9 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 20 Jul 2024 12:10:56 +0900
Subject: [PATCH 35/39] Revert "[bazel] Fix llvm:Core build (#99054)"
This reverts commit 5b54f36fb607d21c18f9eb56dcf481a9841dee8e.
(llvmorg-19-init-17774-g5b54f36fb607)
See #99610
---
utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index ae17746c72882..64d36c7b7f664 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -947,14 +947,7 @@ cc_library(
]) + [
# To avoid a dependency cycle.
"include/llvm/Analysis/IVDescriptors.h",
- "include/llvm/CodeGen/GenVT.inc",
- ] + glob(
- # To avoid a dependency cycle.
- [
- "include/llvm/CodeGen/**/*.h",
- "include/llvm/CodeGenTypes/**/*.h",
- ],
- ),
+ ],
hdrs = glob(
[
"include/llvm/*.h",
>From 5893b1e297f3a333b30a9d32d0909b77a9fcd31c Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sat, 20 Jul 2024 12:09:30 +0900
Subject: [PATCH 36/39] Reformat
---
llvm/include/llvm/CodeGen/RuntimeLibcalls.h | 172 +++++++++---------
llvm/include/llvm/LTO/LTO.h | 2 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 6 +-
llvm/lib/LTO/LTO.cpp | 2 +-
.../Target/Hexagon/HexagonISelLowering.cpp | 2 +-
5 files changed, 90 insertions(+), 94 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
index 3a407c4a4d940..f3170418f3364 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -19,95 +19,91 @@
namespace llvm {
namespace RTLIB {
- /// RTLIB::Libcall enum - This enum defines all of the runtime library calls
- /// the backend can emit. The various long double types cannot be merged,
- /// because 80-bit library functions use "xf" and 128-bit use "tf".
- ///
- /// When adding PPCF128 functions here, note that their names generally need
- /// to be overridden for Darwin with the xxx$LDBL128 form. See
- /// PPCISelLowering.cpp.
- ///
- enum Libcall {
+/// RTLIB::Libcall enum - This enum defines all of the runtime library calls
+/// the backend can emit. The various long double types cannot be merged,
+/// because 80-bit library functions use "xf" and 128-bit use "tf".
+///
+/// When adding PPCF128 functions here, note that their names generally need
+/// to be overridden for Darwin with the xxx$LDBL128 form. See
+/// PPCISelLowering.cpp.
+///
+enum Libcall {
#define HANDLE_LIBCALL(code, name) code,
- #include "llvm/IR/RuntimeLibcalls.def"
+#include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
- };
-
- /// GetFPLibCall - Helper to return the right libcall for the given floating
- /// point type, or UNKNOWN_LIBCALL if there is none.
- Libcall getFPLibCall(EVT VT,
- Libcall Call_F32,
- Libcall Call_F64,
- Libcall Call_F80,
- Libcall Call_F128,
- Libcall Call_PPCF128);
-
- /// getFPEXT - Return the FPEXT_*_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getFPEXT(EVT OpVT, EVT RetVT);
-
- /// getFPROUND - Return the FPROUND_*_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getFPROUND(EVT OpVT, EVT RetVT);
-
- /// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getFPTOSINT(EVT OpVT, EVT RetVT);
-
- /// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getFPTOUINT(EVT OpVT, EVT RetVT);
-
- /// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getSINTTOFP(EVT OpVT, EVT RetVT);
-
- /// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getUINTTOFP(EVT OpVT, EVT RetVT);
-
- /// getPOWI - Return the POWI_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getPOWI(EVT RetVT);
-
- /// getLDEXP - Return the LDEXP_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getLDEXP(EVT RetVT);
-
- /// getFREXP - Return the FREXP_* value for the given types, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getFREXP(EVT RetVT);
-
- /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
- /// UNKNOWN_LIBCALL if there is none.
- Libcall getSYNC(unsigned Opc, MVT VT);
-
- /// Return the outline atomics value for the given atomic ordering, access
- /// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there
- /// is none.
- Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4],
- AtomicOrdering Order, uint64_t MemSize);
-
- /// Return the outline atomics value for the given opcode, atomic ordering
- /// and type, or UNKNOWN_LIBCALL if there is none.
- Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT);
-
- /// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return
- /// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
- /// UNKNOW_LIBCALL if there is none.
- Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
-
- /// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return
- /// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
- /// UNKNOW_LIBCALL if there is none.
- Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
-
- /// getMEMSET_ELEMENT_UNORDERED_ATOMIC - Return
- /// MEMSET_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
- /// UNKNOW_LIBCALL if there is none.
- Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
-
-}
-}
+};
+
+/// GetFPLibCall - Helper to return the right libcall for the given floating
+/// point type, or UNKNOWN_LIBCALL if there is none.
+Libcall getFPLibCall(EVT VT, Libcall Call_F32, Libcall Call_F64,
+ Libcall Call_F80, Libcall Call_F128, Libcall Call_PPCF128);
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFPEXT(EVT OpVT, EVT RetVT);
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFPROUND(EVT OpVT, EVT RetVT);
+
+/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFPTOSINT(EVT OpVT, EVT RetVT);
+
+/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFPTOUINT(EVT OpVT, EVT RetVT);
+
+/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getSINTTOFP(EVT OpVT, EVT RetVT);
+
+/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getUINTTOFP(EVT OpVT, EVT RetVT);
+
+/// getPOWI - Return the POWI_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getPOWI(EVT RetVT);
+
+/// getLDEXP - Return the LDEXP_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getLDEXP(EVT RetVT);
+
+/// getFREXP - Return the FREXP_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFREXP(EVT RetVT);
+
+/// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getSYNC(unsigned Opc, MVT VT);
+
+/// Return the outline atomics value for the given atomic ordering, access
+/// size and set of libcalls for a given atomic, or UNKNOWN_LIBCALL if there
+/// is none.
+Libcall getOutlineAtomicHelper(const Libcall (&LC)[5][4], AtomicOrdering Order,
+ uint64_t MemSize);
+
+/// Return the outline atomics value for the given opcode, atomic ordering
+/// and type, or UNKNOWN_LIBCALL if there is none.
+Libcall getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT);
+
+/// getMEMCPY_ELEMENT_UNORDERED_ATOMIC - Return
+/// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+/// UNKNOW_LIBCALL if there is none.
+Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+/// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return
+/// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+/// UNKNOW_LIBCALL if there is none.
+Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+/// getMEMSET_ELEMENT_UNORDERED_ATOMIC - Return
+/// MEMSET_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+/// UNKNOW_LIBCALL if there is none.
+Libcall getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+} // namespace RTLIB
+} // namespace llvm
#endif
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 94996ae89e35d..63c8435838f27 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -301,7 +301,7 @@ class LTO {
/// Static method that returns a list of libcall symbols that can be generated
/// by LTO but might not be visible from bitcode symbol table.
- static ArrayRef<const char*> getRuntimeLibcallSymbols();
+ static ArrayRef<const char *> getRuntimeLibcallSymbols();
private:
Config Conf;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index f0fa6aa772e37..914c67abdd617 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -114,8 +114,7 @@ static bool darwinHasSinCos(const Triple &TT) {
}
void TargetLoweringBase::InitLibcalls(const Triple &TT) {
-#define HANDLE_LIBCALL(code, name) \
- setLibcallName(RTLIB::code, name);
+#define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name);
#include "llvm/IR/RuntimeLibcalls.def"
#undef HANDLE_LIBCALL
// Initialize calling conventions to their default.
@@ -985,7 +984,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
MinCmpXchgSizeInBits = 0;
SupportsUnalignedAtomics = false;
- std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr);
+ std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
+ nullptr);
InitLibcalls(TM.getTargetTriple());
InitCmpLibcallCCs(CmpLibcallCCs);
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 4ffacbb9f0ffd..94db6660e0ba5 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1363,7 +1363,7 @@ static const char *libcallRoutineNames[] = {
#undef HANDLE_LIBCALL
};
-ArrayRef<const char*> LTO::getRuntimeLibcallSymbols() {
+ArrayRef<const char *> LTO::getRuntimeLibcallSymbols() {
return ArrayRef(libcallRoutineNames);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 918fc127762fc..84defa8d7b91b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -39,12 +39,12 @@
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
>From 72d8c2737bb557af9d0c735b9fa30b1b03485627 Mon Sep 17 00:00:00 2001
From: Haowei <haowei at google.com>
Date: Fri, 19 Jul 2024 21:42:57 -0700
Subject: [PATCH 37/39] [Fuchsia] Remove linker flags from stage2 pass through
(#99722)
This patch removes CMAKE_XXX_LINKER_FLAGS from list of flags that passed
through to stage2 build.
---
clang/cmake/caches/Fuchsia.cmake | 3 ---
1 file changed, 3 deletions(-)
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 4d3af3ad3f403..2d2dcb9ae6798 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -67,9 +67,6 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
SWIG_EXECUTABLE
CMAKE_FIND_PACKAGE_PREFER_CONFIG
CMAKE_SYSROOT
- CMAKE_MODULE_LINKER_FLAGS
- CMAKE_SHARED_LINKER_FLAGS
- CMAKE_EXE_LINKER_FLAGS
LLVM_WINSYSROOT
LLVM_VFSOVERLAY
)
>From 672cc8bce0a9306b07680b340617b8afd862adf5 Mon Sep 17 00:00:00 2001
From: darkbuck <michael.hliao at gmail.com>
Date: Sat, 20 Jul 2024 02:06:38 -0400
Subject: [PATCH 38/39] [GlobalISel] Allow customizing instruction-select pass
- Allow customizing instruction-select pass to add additional analysis
dependencies.
Reviewers: aemerson, arsenm, aeubanks
Reviewed By: arsenm
Pull Request: https://github.com/llvm/llvm-project/pull/95724
---
.../llvm/CodeGen/GlobalISel/InstructionSelect.h | 4 ++--
llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp | 10 ++--------
2 files changed, 4 insertions(+), 10 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h
index cada7f30072e2..8017f09aa3c8b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelect.h
@@ -49,8 +49,8 @@ class InstructionSelect : public MachineFunctionPass {
MachineFunctionProperties::Property::Selected);
}
- InstructionSelect(CodeGenOptLevel OL);
- InstructionSelect();
+ InstructionSelect(CodeGenOptLevel OL = CodeGenOptLevel::Default,
+ char &PassID = ID);
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 4cb1d01f3e8ca..9a27728dcb4dd 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -62,14 +62,8 @@ INITIALIZE_PASS_END(InstructionSelect, DEBUG_TYPE,
"Select target instructions out of generic instructions",
false, false)
-InstructionSelect::InstructionSelect(CodeGenOptLevel OL)
- : MachineFunctionPass(ID), OptLevel(OL) {}
-
-// In order not to crash when calling getAnalysis during testing with -run-pass
-// we use the default opt level here instead of None, so that the addRequired()
-// calls are made in getAnalysisUsage().
-InstructionSelect::InstructionSelect()
- : MachineFunctionPass(ID), OptLevel(CodeGenOptLevel::Default) {}
+InstructionSelect::InstructionSelect(CodeGenOptLevel OL, char &PassID)
+ : MachineFunctionPass(PassID), OptLevel(OL) {}
void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
>From e4c570ae371d7ae2af1a5a93e1c473a215d25048 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 19 Jul 2024 23:10:25 +0400
Subject: [PATCH 39/39] SelectionDAG: Avoid using MachineFunction::getMMI
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 9 ++++++---
llvm/include/llvm/CodeGen/SelectionDAGISel.h | 1 +
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 ++-
.../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 6 +++---
.../lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 15 +++++++++------
.../unittests/CodeGen/AArch64SelectionDAGTest.cpp | 3 ++-
.../CodeGen/SelectionDAGAddressAnalysisTest.cpp | 3 ++-
.../CodeGen/SelectionDAGPatternMatchTest.cpp | 3 ++-
8 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 16ec65f2e7daa..24eab7b408675 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -245,6 +245,7 @@ class SelectionDAG {
ProfileSummaryInfo *PSI = nullptr;
BlockFrequencyInfo *BFI = nullptr;
+ MachineModuleInfo *MMI = nullptr;
/// List of non-single value types.
FoldingSet<SDVTListNode> VTListMap;
@@ -459,14 +460,15 @@ class SelectionDAG {
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
UniformityInfo *UA, ProfileSummaryInfo *PSIin,
- BlockFrequencyInfo *BFIin, FunctionVarLocs const *FnVarLocs);
+ BlockFrequencyInfo *BFIin, MachineModuleInfo &MMI,
+ FunctionVarLocs const *FnVarLocs);
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
MachineFunctionAnalysisManager &AM,
const TargetLibraryInfo *LibraryInfo, UniformityInfo *UA,
ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin,
- FunctionVarLocs const *FnVarLocs) {
- init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, FnVarLocs);
+ MachineModuleInfo &MMI, FunctionVarLocs const *FnVarLocs) {
+ init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, MMI, FnVarLocs);
MFAM = &AM;
}
@@ -500,6 +502,7 @@ class SelectionDAG {
OptimizationRemarkEmitter &getORE() const { return *ORE; }
ProfileSummaryInfo *getPSI() const { return PSI; }
BlockFrequencyInfo *getBFI() const { return BFI; }
+ MachineModuleInfo *getMMI() const { return MMI; }
FlagInserter *getFlagInserter() { return Inserter; }
void setFlagInserter(FlagInserter *FI) { Inserter = FI; }
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
index aa0efa5d9bf5d..fc0590b1a1b69 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -48,6 +48,7 @@ class SelectionDAGISel {
std::unique_ptr<FunctionLoweringInfo> FuncInfo;
SwiftErrorValueTracking *SwiftError;
MachineFunction *MF;
+ MachineModuleInfo *MMI;
MachineRegisterInfo *RegInfo;
SelectionDAG *CurDAG;
std::unique_ptr<SelectionDAGBuilder> SDB;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0ccae50286345..e4d08228ee903 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1333,7 +1333,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
OptimizationRemarkEmitter &NewORE, Pass *PassPtr,
const TargetLibraryInfo *LibraryInfo,
UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
- BlockFrequencyInfo *BFIin,
+ BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin,
FunctionVarLocs const *VarLocs) {
MF = &NewMF;
SDAGISelPass = PassPtr;
@@ -1345,6 +1345,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
UA = NewUA;
PSI = PSIin;
BFI = BFIin;
+ MMI = &MMIin;
FnVarLocs = VarLocs;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c179c97029c7a..cf133e162c65f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6707,11 +6707,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0))));
return;
case Intrinsic::eh_sjlj_callsite: {
- MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0));
- assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");
+ assert(DAG.getMMI()->getCurrentCallSite() == 0 &&
+ "Overlapping call sites!");
- MMI.setCurrentCallSite(CI->getZExtValue());
+ DAG.getMMI()->setCurrentCallSite(CI->getZExtValue());
return;
}
case Intrinsic::eh_sjlj_functioncontext: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index df3d207d85d35..2670ff488fbed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -509,7 +509,7 @@ void SelectionDAGISel::initializeAnalysisResults(
FnVarLocs = &FAM.getResult<DebugAssignmentTrackingAnalysis>(Fn);
auto *UA = FAM.getCachedResult<UniformityInfoAnalysis>(Fn);
- CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, FnVarLocs);
+ CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, *MMI, FnVarLocs);
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -562,7 +562,11 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
UniformityInfo *UA = nullptr;
if (auto *UAPass = MFP.getAnalysisIfAvailable<UniformityInfoWrapperPass>())
UA = &UAPass->getUniformityInfo();
- CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, FnVarLocs);
+
+ MachineModuleInfo &MMI =
+ MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+ CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -796,7 +800,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
}
// Determine if floating point is used for msvc
- computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI());
+ computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, *CurDAG->getMMI());
// Release function-specific state. SDB and CurDAG are already cleared
// at this point.
@@ -1443,7 +1447,6 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
// Mark and Report IPToState for each Block under IsEHa
void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) {
- MachineModuleInfo &MMI = MF->getMMI();
llvm::WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo();
if (!EHInfo)
return;
@@ -1458,8 +1461,8 @@ void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) {
continue;
// Insert EH Labels
- MCSymbol *BeginLabel = MMI.getContext().createTempSymbol();
- MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
+ MCSymbol *BeginLabel = MF->getContext().createTempSymbol();
+ MCSymbol *EndLabel = MF->getContext().createTempSymbol();
EHInfo->addIPToStateRange(State, BeginLabel, EndLabel);
BuildMI(MBB, MBBb, SDB->getCurDebugLoc(),
TII->get(TargetOpcode::EH_LABEL))
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index cc574b5887efc..877793a28b769 100644
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -68,7 +68,8 @@ class AArch64SelectionDAGTest : public testing::Test {
if (!DAG)
report_fatal_error("DAG?");
OptimizationRemarkEmitter ORE(F);
- DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+ DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+ nullptr);
}
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index 88c4dbc2eec78..ef30bd491d352 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -78,7 +78,8 @@ class SelectionDAGAddressAnalysisTest : public testing::Test {
if (!DAG)
report_fatal_error("DAG?");
OptimizationRemarkEmitter ORE(F);
- DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+ DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+ nullptr);
}
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index a3d5e5f94b610..e318f467bbf27 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -77,7 +77,8 @@ class SelectionDAGPatternMatchTest : public testing::Test {
if (!DAG)
report_fatal_error("DAG?");
OptimizationRemarkEmitter ORE(F);
- DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
+ DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
+ nullptr);
}
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
More information about the llvm-branch-commits
mailing list