[Lldb-commits] [clang] [libc] [lldb] [llvm] [mlir] [clang-repl] Add support for running custom code in Remote JIT executor (PR #157358)

Fri Sep 12 22:35:47 PDT 2025

https://github.com/kr-2003 updated https://github.com/llvm/llvm-project/pull/157358

>From 4410aeb08dc14a4f29c9ec0e8730a1bde3386665 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:19:44 +0530
Subject: [PATCH 01/39] [clang-repl] Adding custom lambda in launchExecutor

---
 clang/include/clang/Interpreter/Interpreter.h | 4 +++-
 clang/lib/Interpreter/IncrementalExecutor.cpp | 6 +++++-
 clang/lib/Interpreter/IncrementalExecutor.h   | 3 ++-
 clang/lib/Interpreter/Interpreter.cpp         | 2 +-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h
index 61af7bf762d5e..54be57684c03f 100644
--- a/clang/include/clang/Interpreter/Interpreter.h
+++ b/clang/include/clang/Interpreter/Interpreter.h
@@ -135,11 +135,13 @@ class Interpreter {
     std::string OrcRuntimePath = "";
     /// PID of the out-of-process JIT executor.
     uint32_t ExecutorPID = 0;
+    /// Callback run in the forked child just before it execs the executor.
+    std::function<void()> CustomizeFork = nullptr;
 
     JITConfig()
         : IsOutOfProcess(false), OOPExecutor(""), OOPExecutorConnect(""),
           UseSharedMemory(false), SlabAllocateSize(0), OrcRuntimePath(""),
-          ExecutorPID(0) {}
+          ExecutorPID(0), CustomizeFork(nullptr) {}
   };
 
 protected:
diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index b0eb7d0e9f072..0cf11939fefd1 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -138,7 +138,8 @@ IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
 
 Expected<std::unique_ptr<llvm::jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
-                          unsigned SlabAllocateSize) {
+                          unsigned SlabAllocateSize,
+                                    std::function<void()> CustomizeFork) {
   llvm::orc::SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Instance,
@@ -215,6 +216,9 @@ IncrementalExecutor::launchExecutor(llvm::StringRef ExecutablePath,
     close(ToExecutor[WriteEnd]);
     close(FromExecutor[ReadEnd]);
 
+    if (CustomizeFork)
+      CustomizeFork();
+
     // Execute the child process.
     std::unique_ptr<char[]> ExecutorPath, FDSpecifier;
     {
diff --git a/clang/lib/Interpreter/IncrementalExecutor.h b/clang/lib/Interpreter/IncrementalExecutor.h
index d091535166770..bb1ec33452515 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.h
+++ b/clang/lib/Interpreter/IncrementalExecutor.h
@@ -79,7 +79,8 @@ class IncrementalExecutor {
   static llvm::Expected<
       std::pair<std::unique_ptr<llvm::orc::SimpleRemoteEPC>, uint32_t>>
   launchExecutor(llvm::StringRef ExecutablePath, bool UseSharedMemory,
-                 unsigned SlabAllocateSize);
+                 unsigned SlabAllocateSize,
+                 std::function<void()> CustomizeFork = nullptr);
 
 #if LLVM_ON_UNIX && LLVM_ENABLE_THREADS
   static llvm::Expected<std::unique_ptr<llvm::orc::SimpleRemoteEPC>>
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 043e0c1e5754e..e17229a853a6f 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -355,7 +355,7 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
   if (!Config.OOPExecutor.empty()) {
     // Launch an out-of-process executor locally in a child process.
     auto ResultOrErr = IncrementalExecutor::launchExecutor(
-        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize);
+        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize, Config.CustomizeFork);
     if (!ResultOrErr)
       return ResultOrErr.takeError();
     childPid = ResultOrErr->second;

>From 0a09e011672db57c4a041a3719144dd90afdeb8d Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:20:09 +0530
Subject: [PATCH 02/39] Formatting changes

---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 2 +-
 clang/lib/Interpreter/Interpreter.cpp         | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 0cf11939fefd1..792ecb08c5f33 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -139,7 +139,7 @@ IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
 Expected<std::unique_ptr<llvm::jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
                           unsigned SlabAllocateSize,
-                                    std::function<void()> CustomizeFork) {
+                          std::function<void()> CustomizeFork) {
   llvm::orc::SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Instance,
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index e17229a853a6f..2425a628b59b9 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -355,7 +355,8 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
   if (!Config.OOPExecutor.empty()) {
     // Launch an out-of-process executor locally in a child process.
     auto ResultOrErr = IncrementalExecutor::launchExecutor(
-        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize, Config.CustomizeFork);
+        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize,
+        Config.CustomizeFork);
     if (!ResultOrErr)
       return ResultOrErr.takeError();
     childPid = ResultOrErr->second;

>From 268826a35221f15549d595226d709922bca98abc Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:35:44 +0530
Subject: [PATCH 03/39] Formatting changes & fixing bug

---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 792ecb08c5f33..5bec3b44a0dc0 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -173,7 +173,8 @@ createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
 llvm::Expected<std::pair<std::unique_ptr<llvm::orc::SimpleRemoteEPC>, uint32_t>>
 IncrementalExecutor::launchExecutor(llvm::StringRef ExecutablePath,
                                     bool UseSharedMemory,
-                                    unsigned SlabAllocateSize) {
+                                    unsigned SlabAllocateSize,
+                                    std::function<void()> CustomizeFork) {
 #ifndef LLVM_ON_UNIX
   // FIXME: Add support for Windows.
   return llvm::make_error<llvm::StringError>(

>From cf4c8766088524ad3b7ffe4d927325185c8c262a Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:49:32 +0530
Subject: [PATCH 04/39] Removing extra arg from sharedMem

---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 5bec3b44a0dc0..45620fcd358c8 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -138,8 +138,7 @@ IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
 
 Expected<std::unique_ptr<llvm::jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
-                          unsigned SlabAllocateSize,
-                          std::function<void()> CustomizeFork) {
+                          unsigned SlabAllocateSize) {
   llvm::orc::SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Instance,

>From df67ed1320ebef0997ed78a27b8275034b3b50a8 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Sat, 13 Sep 2025 03:23:53 +0530
Subject: [PATCH 05/39] ClangRepl Interpreter test for out-of-process

---
 clang/unittests/Interpreter/CMakeLists.txt    |  23 +-
 .../OutOfProcessInterpreterTests.cpp          | 203 ++++++++++++++++++
 2 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp

diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt
index db9f80d9f53fe..7b8dcfc9b0546 100644
--- a/clang/unittests/Interpreter/CMakeLists.txt
+++ b/clang/unittests/Interpreter/CMakeLists.txt
@@ -29,12 +29,25 @@ set(CLANG_LIBS_TO_LINK
   )
 endif()
 
-add_distinct_clang_unittest(ClangReplInterpreterTests
+set(CLANG_REPL_TEST_SOURCES
   IncrementalCompilerBuilderTest.cpp
   IncrementalProcessingTest.cpp
   InterpreterTest.cpp
   InterpreterExtensionsTest.cpp
   CodeCompletionTest.cpp
+)
+
+if(TARGET compiler-rt)
+  list(APPEND CLANG_REPL_TEST_SOURCES
+    OutOfProcessInterpreterTests.cpp
+  )
+  message(STATUS "Compiler-RT found, enabling out of process JIT tests")
+endif()
+
+add_distinct_clang_unittest(ClangReplInterpreterTests
+  ${CLANG_REPL_TEST_SOURCES}
+
+  PARTIAL_SOURCES_INTENDED
 
   EXPORT_SYMBOLS
 
@@ -48,6 +61,14 @@ add_distinct_clang_unittest(ClangReplInterpreterTests
   ${LLVM_COMPONENTS_TO_LINK}
   )
 
+if(TARGET compiler-rt)
+  add_dependencies(ClangReplInterpreterTests 
+    llvm-jitlink-executor 
+    compiler-rt
+  )
+  message(STATUS "Adding dependency on compiler-rt for out of process JIT tests")
+endif()
+
 if(EMSCRIPTEN)
 # Without the above you try to link to LLVMSupport twice, and end
 # up with a duplicate symbol error when creating the main module
diff --git a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
new file mode 100644
index 0000000000000..271820e4e5f25
--- /dev/null
+++ b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
@@ -0,0 +1,203 @@
+//===- unittests/Interpreter/OutOfProcessInterpreterTests.cpp --- Interpreter
+// tests when Out-of-Process ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Unit tests for Clang's Interpreter library with out-of-process execution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InterpreterTestFixture.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclGroup.h"
+#include "clang/AST/Mangle.h"
+#include "clang/Basic/Version.h"
+#include "clang/Config/config.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "clang/Interpreter/Interpreter.h"
+#include "clang/Interpreter/Value.h"
+#include "clang/Sema/Lookup.h"
+#include "clang/Sema/Sema.h"
+#include "llvm/Support/Error.h"
+#include "llvm/TargetParser/Host.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+#include <signal.h>
+#include <sstream>
+#include <unistd.h>
+
+using namespace clang;
+
+llvm::ExitOnError ExitOnError;
+
+namespace {
+
+using Args = std::vector<const char *>;
+
+struct FileDeleter {
+  void operator()(FILE *f) {
+    if (f)
+      fclose(f);
+  }
+};
+
+struct IOContext {
+  std::unique_ptr<FILE, FileDeleter> stdin_file;
+  std::unique_ptr<FILE, FileDeleter> stdout_file;
+  std::unique_ptr<FILE, FileDeleter> stderr_file;
+
+  bool initializeTempFiles() {
+    stdin_file.reset(tmpfile());
+    stdout_file.reset(tmpfile());
+    stderr_file.reset(tmpfile());
+    return stdin_file && stdout_file && stderr_file;
+  }
+
+  std::string readStdoutContent() {
+    if (!stdout_file)
+      return "";
+    rewind(stdout_file.get());
+    std::ostringstream content;
+    char buffer[1024];
+    size_t bytes_read;
+    while ((bytes_read = fread(buffer, 1, sizeof(buffer), stdout_file.get())) >
+           0) {
+      content.write(buffer, bytes_read);
+    }
+    return content.str();
+  }
+
+  std::string readStderrContent() {
+    if (!stderr_file)
+      return "";
+    rewind(stderr_file.get());
+    std::ostringstream content;
+    char buffer[1024];
+    size_t bytes_read;
+    while ((bytes_read = fread(buffer, 1, sizeof(buffer), stderr_file.get())) >
+           0) {
+      content.write(buffer, bytes_read);
+    }
+    return content.str();
+  }
+};
+
+static void removePathComponent(unsigned N, llvm::SmallString<256> &Path) {
+  for (unsigned i = 0; i < N; ++i)
+    llvm::sys::path::remove_filename(Path);
+}
+
+static std::string getExecutorPath() {
+  llvm::SmallString<256> ExecutorPath(llvm::sys::fs::getMainExecutable(
+      nullptr, reinterpret_cast<void *>(&getExecutorPath)));
+  removePathComponent(5, ExecutorPath);
+  llvm::sys::path::append(ExecutorPath, "bin", "llvm-jitlink-executor");
+  return ExecutorPath.str().str();
+}
+
+static std::string getOrcRuntimePath() {
+  llvm::SmallString<256> RuntimePath(llvm::sys::fs::getMainExecutable(
+      nullptr, reinterpret_cast<void *>(&getOrcRuntimePath)));
+  removePathComponent(5, RuntimePath);
+  llvm::sys::path::append(RuntimePath, CLANG_INSTALL_LIBDIR_BASENAME, "clang",
+                          CLANG_VERSION_MAJOR_STRING, "lib");
+
+  llvm::Triple SystemTriple(llvm::sys::getProcessTriple());
+  if (SystemTriple.isOSBinFormatMachO()) {
+    llvm::sys::path::append(RuntimePath, "darwin", "liborc_rt_osx.a");
+  } else if (SystemTriple.isOSBinFormatELF()) {
+    llvm::sys::path::append(RuntimePath, "x86_64-unknown-linux-gnu",
+                            "liborc_rt.a");
+  }
+  return RuntimePath.str().str();
+}
+
+static std::unique_ptr<Interpreter>
+createInterpreterWithRemoteExecution(std::shared_ptr<IOContext> io_ctx,
+                                     const Args &ExtraArgs = {}) {
+  Args ClangArgs = {"-Xclang", "-emit-llvm-only"};
+  llvm::append_range(ClangArgs, ExtraArgs);
+  auto CB = clang::IncrementalCompilerBuilder();
+  CB.SetCompilerArgs(ClangArgs);
+  auto CI = cantFail(CB.CreateCpp());
+
+  clang::Interpreter::JITConfig Config;
+  llvm::Triple SystemTriple(llvm::sys::getProcessTriple());
+
+  if (SystemTriple.isOSBinFormatELF() || SystemTriple.isOSBinFormatMachO()) {
+    Config.IsOutOfProcess = true;
+    Config.OOPExecutor = getExecutorPath();
+    Config.UseSharedMemory = false;
+    Config.SlabAllocateSize = 0;
+    Config.OrcRuntimePath = getOrcRuntimePath();
+
+    int stdin_fd = fileno(io_ctx->stdin_file.get());
+    int stdout_fd = fileno(io_ctx->stdout_file.get());
+    int stderr_fd = fileno(io_ctx->stderr_file.get());
+
+    Config.CustomizeFork = [=] {
+      auto redirect = [](int from, int to) {
+        if (from != to) {
+          dup2(from, to);
+          close(from);
+        }
+      };
+
+      redirect(stdin_fd, STDIN_FILENO);
+      redirect(stdout_fd, STDOUT_FILENO);
+      redirect(stderr_fd, STDERR_FILENO);
+
+      setvbuf(stdout, nullptr, _IONBF, 0);
+      setvbuf(stderr, nullptr, _IONBF, 0);
+
+      printf("CustomizeFork executed\n");
+      fflush(stdout);
+    };
+  }
+
+  return cantFail(clang::Interpreter::create(std::move(CI), Config));
+}
+
+static size_t DeclsSize(TranslationUnitDecl *PTUDecl) {
+  return std::distance(PTUDecl->decls().begin(), PTUDecl->decls().end());
+}
+
+TEST_F(InterpreterTestBase, SanityWithRemoteExecution) {
+  if (!HostSupportsJIT())
+    GTEST_SKIP();
+
+  std::string OrcRuntimePath = getOrcRuntimePath();
+  std::string ExecutorPath = getExecutorPath();
+  
+  if (!llvm::sys::fs::exists(OrcRuntimePath) ||
+      !llvm::sys::fs::exists(ExecutorPath))
+    GTEST_SKIP();
+
+  auto io_ctx = std::make_shared<IOContext>();
+  ASSERT_TRUE(io_ctx->initializeTempFiles());
+
+  std::unique_ptr<Interpreter> Interp =
+      createInterpreterWithRemoteExecution(io_ctx);
+  ASSERT_TRUE(Interp);
+
+  using PTU = PartialTranslationUnit;
+  PTU &R1(cantFail(Interp->Parse("void g(); void g() {}")));
+  EXPECT_EQ(2U, DeclsSize(R1.TUPart));
+
+  PTU &R2(cantFail(Interp->Parse("int i = 42;")));
+  EXPECT_EQ(1U, DeclsSize(R2.TUPart));
+
+  std::string captured_stdout = io_ctx->readStdoutContent();
+  std::string captured_stderr = io_ctx->readStderrContent();
+
+  EXPECT_TRUE(captured_stdout.find("CustomizeFork executed") !=
+              std::string::npos);
+}
+
+} // end anonymous namespace
\ No newline at end of file

>From f019e2368b137371d248a7ddbe37f76466c2d44d Mon Sep 17 00:00:00 2001
From: lntue <lntue at google.com>
Date: Fri, 12 Sep 2025 17:57:08 -0400
Subject: [PATCH 06/39] [libc] Change __builtin_memcpy to inline_memcpy.
 (#158345)

---
 libc/src/__support/CMakeLists.txt                     |  1 +
 libc/src/__support/arg_list.h                         |  3 ++-
 libc/src/stdio/printf_core/CMakeLists.txt             |  1 +
 .../stdio/printf_core/float_dec_converter_limited.h   |  5 +++--
 libc/src/stdlib/CMakeLists.txt                        |  1 +
 libc/src/stdlib/qsort_data.h                          | 11 ++++++-----
 libc/src/string/CMakeLists.txt                        |  1 +
 libc/src/string/stpcpy.cpp                            |  3 ++-
 libc/src/string/string_utils.h                        |  3 ++-
 libc/src/wchar/CMakeLists.txt                         |  1 +
 libc/src/wchar/wcpcpy.cpp                             |  2 +-
 libc/src/wchar/wcscpy.cpp                             |  2 +-
 libc/src/wchar/wmemcpy.cpp                            |  3 ++-
 libc/src/wchar/wmempcpy.cpp                           |  3 ++-
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel     |  4 ++++
 15 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index b6e87ac336fb2..0ef09a9b8c9d0 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -302,6 +302,7 @@ add_header_library(
   DEPENDS
     libc.hdr.stdint_proxy
     libc.src.__support.common
+    libc.src.string.memory_utils.inline_memcpy
 )
 
 add_header_library(
diff --git a/libc/src/__support/arg_list.h b/libc/src/__support/arg_list.h
index 1e26a5e8ef9c7..7b78a9c0fe619 100644
--- a/libc/src/__support/arg_list.h
+++ b/libc/src/__support/arg_list.h
@@ -12,6 +12,7 @@
 #include "hdr/stdint_proxy.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 
 #include <stdarg.h>
 #include <stddef.h>
@@ -126,7 +127,7 @@ template <bool packed> class StructArgList {
 
     // Memcpy because pointer alignment may be illegal given a packed struct.
     T val;
-    __builtin_memcpy(&val, ptr, sizeof(T));
+    inline_memcpy(&val, ptr, sizeof(T));
 
     ptr =
         reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(ptr) + sizeof(T));
diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index 76eb0a2fdaaa5..ee66145e60156 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -112,6 +112,7 @@ add_header_library(
     libc.src.__support.libc_assert
     libc.src.__support.uint128
     libc.src.__support.StringUtil.error_to_string
+    libc.src.string.memory_utils.inline_memcpy
 )
 
 add_header_library(
diff --git a/libc/src/stdio/printf_core/float_dec_converter_limited.h b/libc/src/stdio/printf_core/float_dec_converter_limited.h
index f468dbc8e2ae8..9cdc13573d320 100644
--- a/libc/src/stdio/printf_core/float_dec_converter_limited.h
+++ b/libc/src/stdio/printf_core/float_dec_converter_limited.h
@@ -53,6 +53,7 @@
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/float_inf_nan_converter.h"
 #include "src/stdio/printf_core/writer.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace printf_core {
@@ -250,7 +251,7 @@ DigitsOutput decimal_digits(DigitsInput input, int precision, bool e_mode) {
   // there's space for it in the DigitsOutput buffer).
   DigitsOutput output;
   output.ndigits = view.size();
-  __builtin_memcpy(output.digits, view.data(), output.ndigits);
+  inline_memcpy(output.digits, view.data(), output.ndigits);
 
   // Set up the output exponent, which is done differently depending on mode.
   // Also, figure out whether we have one digit too many, and if so, set the
@@ -551,7 +552,7 @@ convert_float_inner(Writer<write_mode> *writer, const FormatSection &to_conv,
     cpp::string_view expview = expcvt.view();
     expbuf[0] = internal::islower(to_conv.conv_name) ? 'e' : 'E';
     explen = expview.size() + 1;
-    __builtin_memcpy(expbuf + 1, expview.data(), expview.size());
+    inline_memcpy(expbuf + 1, expview.data(), expview.size());
   }
 
   // Now we know enough to work out the length of the unpadded output:
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index aa653c38a8c3f..c464f82dcbda7 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -292,6 +292,7 @@ add_header_library(
     libc.hdr.stdint_proxy
     libc.include.stdlib
     libc.src.__support.CPP.cstddef
+    libc.src.string.memory_utils.inline_memcpy    
 )
 
 add_entrypoint_object(
diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
index 739fce88ab75d..4f9774088fbd3 100644
--- a/libc/src/stdlib/qsort_data.h
+++ b/libc/src/stdlib/qsort_data.h
@@ -12,6 +12,7 @@
 #include "hdr/stdint_proxy.h"
 #include "src/__support/CPP/cstddef.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -54,9 +55,9 @@ class ArrayGenericSize {
     const cpp::byte *elem_i_block_end = elem_i + (elem_size - elem_size_rem);
 
     while (elem_i != elem_i_block_end) {
-      __builtin_memcpy(tmp_block, elem_i, BLOCK_SIZE);
-      __builtin_memcpy(elem_i, elem_j, BLOCK_SIZE);
-      __builtin_memcpy(elem_j, tmp_block, BLOCK_SIZE);
+      inline_memcpy(tmp_block, elem_i, BLOCK_SIZE);
+      inline_memcpy(elem_i, elem_j, BLOCK_SIZE);
+      inline_memcpy(elem_j, tmp_block, BLOCK_SIZE);
 
       elem_i += BLOCK_SIZE;
       elem_j += BLOCK_SIZE;
@@ -112,9 +113,9 @@ template <size_t ELEM_SIZE> class ArrayFixedSize {
     cpp::byte *elem_i = get_internal(i);
     cpp::byte *elem_j = get_internal(j);
 
-    __builtin_memcpy(tmp, elem_i, ELEM_SIZE);
+    inline_memcpy(tmp, elem_i, ELEM_SIZE);
     __builtin_memmove(elem_i, elem_j, ELEM_SIZE);
-    __builtin_memcpy(elem_j, tmp, ELEM_SIZE);
+    inline_memcpy(elem_j, tmp, ELEM_SIZE);
   }
 
   LIBC_INLINE size_t len() const { return array_len; }
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 5c9f622d44397..b8cdb2a7d3538 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -22,6 +22,7 @@ add_header_library(
     libc.src.__support.CPP.type_traits
     libc.src.__support.CPP.simd
     libc.src.__support.common
+    libc.src.string.memory_utils.inline_memcpy
   ${string_config_options}
 )
 
diff --git a/libc/src/string/stpcpy.cpp b/libc/src/string/stpcpy.cpp
index 48c0db950ace0..fefae81172585 100644
--- a/libc/src/string/stpcpy.cpp
+++ b/libc/src/string/stpcpy.cpp
@@ -8,6 +8,7 @@
 
 #include "src/string/stpcpy.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 #include "src/string/string_utils.h"
 
 #include "src/__support/common.h"
@@ -17,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(char *, stpcpy,
                    (char *__restrict dest, const char *__restrict src)) {
   size_t size = internal::string_length(src) + 1;
-  __builtin_memcpy(dest, src, size);
+  inline_memcpy(dest, src, size);
   char *result = dest + size;
 
   if (result != nullptr)
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 10803488b6cf5..9d636d02f4756 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -21,6 +21,7 @@
 #include "src/__support/CPP/type_traits.h" // cpp::is_same_v
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/string/memory_utils/inline_memcpy.h"
 
 #if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
 #if LIBC_HAS_VECTOR_TYPE
@@ -242,7 +243,7 @@ LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
   if (!size)
     return len;
   size_t n = len < size - 1 ? len : size - 1;
-  __builtin_memcpy(dst, src, n);
+  inline_memcpy(dst, src, n);
   dst[n] = '\0';
   return len;
 }
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 9ba0a06c57b7f..adde382bf0950 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -452,6 +452,7 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
+    libc.src.string.memory_utils.inline_memcpy
 )
 
 add_entrypoint_object(
diff --git a/libc/src/wchar/wcpcpy.cpp b/libc/src/wchar/wcpcpy.cpp
index 9e2b12f09eb05..b6d80d4d671d9 100644
--- a/libc/src/wchar/wcpcpy.cpp
+++ b/libc/src/wchar/wcpcpy.cpp
@@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(wchar_t *, wcpcpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2)) {
   size_t size = internal::string_length(s2);
-  __builtin_memcpy(s1, s2, (size + 1) * sizeof(wchar_t));
+  inline_memcpy(s1, s2, (size + 1) * sizeof(wchar_t));
   wchar_t *result = s1 + size;
   return result;
 }
diff --git a/libc/src/wchar/wcscpy.cpp b/libc/src/wchar/wcscpy.cpp
index 01ba994cecbb2..703706e6a7be8 100644
--- a/libc/src/wchar/wcscpy.cpp
+++ b/libc/src/wchar/wcscpy.cpp
@@ -19,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(wchar_t *, wcscpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2)) {
   size_t size = internal::string_length(s2) + 1;
-  __builtin_memcpy(s1, s2, size * sizeof(wchar_t));
+  inline_memcpy(s1, s2, size * sizeof(wchar_t));
   return s1;
 }
 
diff --git a/libc/src/wchar/wmemcpy.cpp b/libc/src/wchar/wmemcpy.cpp
index bf92309b20944..56708d6cee496 100644
--- a/libc/src/wchar/wmemcpy.cpp
+++ b/libc/src/wchar/wmemcpy.cpp
@@ -12,13 +12,14 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(wchar_t *, wmemcpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2,
                     size_t n)) {
-  __builtin_memcpy(s1, s2, n * sizeof(wchar_t));
+  inline_memcpy(s1, s2, n * sizeof(wchar_t));
   return s1;
 }
 
diff --git a/libc/src/wchar/wmempcpy.cpp b/libc/src/wchar/wmempcpy.cpp
index 21e16210a757a..d8b89c0a88d05 100644
--- a/libc/src/wchar/wmempcpy.cpp
+++ b/libc/src/wchar/wmempcpy.cpp
@@ -11,13 +11,14 @@
 #include "hdr/types/size_t.h"
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(wchar_t *, wmempcpy,
                    (wchar_t *__restrict to, const wchar_t *__restrict from,
                     size_t size)) {
-  __builtin_memcpy(to, from, size * sizeof(wchar_t));
+  inline_memcpy(to, from, size * sizeof(wchar_t));
   return reinterpret_cast<wchar_t *>(to) + size;
 }
 
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index d9b1bb5635aaf..a955f7f4916ac 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -818,6 +818,7 @@ libc_support_library(
     hdrs = ["src/__support/arg_list.h"],
     deps = [
         ":__support_common",
+        ":string_memory_utils",
     ],
 )
 
@@ -5003,6 +5004,7 @@ libc_support_library(
         ":__support_cpp_bit",
         ":__support_cpp_cstddef",
         ":__support_macros_attributes",
+        ":string_memory_utils",
     ],
 )
 
@@ -6945,6 +6947,7 @@ libc_function(
     deps = [
         ":__support_common",
         ":__support_macros_config",
+        ":string_memory_utils",
         ":types_size_t",
         ":types_wchar_t",
     ],
@@ -6968,6 +6971,7 @@ libc_function(
     hdrs = ["src/wchar/wmempcpy.h"],
     deps = [
         ":__support_common",
+        ":string_memory_utils",
         ":types_size_t",
         ":types_wchar_t",
     ],

>From 9e33997242800d49964bfbd056288cbb0cf073ed Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Fri, 12 Sep 2025 15:04:38 -0700
Subject: [PATCH 07/39] [IR] Add `MD_prof` to the `Keep` list of
 `dropUBImplyingAttrsAndMetadata` (#154635)

`MD_prof` is safe to keep when e.g. hoisting instructions.

Issue #147390
---
 llvm/lib/IR/Instruction.cpp                   |  7 ++++---
 llvm/lib/Transforms/Scalar/LICM.cpp           |  5 +----
 .../Transforms/SimplifyCFG/PhiBlockMerge.ll   | 21 ++++++++++++-------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 5e87b5ff941ad..c1fafd759b5ab 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -553,16 +553,17 @@ void Instruction::dropUBImplyingAttrsAndUnknownMetadata(
 }
 
 void Instruction::dropUBImplyingAttrsAndMetadata(ArrayRef<unsigned> Keep) {
-  // !annotation metadata does not impact semantics.
+  // !annotation and !prof metadata do not impact semantics.
   // !range, !nonnull and !align produce poison, so they are safe to speculate.
   // !noundef and various AA metadata must be dropped, as it generally produces
   // immediate undefined behavior.
   static const unsigned KnownIDs[] = {
       LLVMContext::MD_annotation, LLVMContext::MD_range,
-      LLVMContext::MD_nonnull, LLVMContext::MD_align};
+      LLVMContext::MD_nonnull, LLVMContext::MD_align, LLVMContext::MD_prof};
   SmallVector<unsigned> KeepIDs;
   KeepIDs.reserve(Keep.size() + std::size(KnownIDs));
-  append_range(KeepIDs, KnownIDs);
+  append_range(KeepIDs, (!ProfcheckDisableMetadataFixes ? KnownIDs
+                                                        : drop_end(KnownIDs)));
   append_range(KeepIDs, Keep);
   dropUBImplyingAttrsAndUnknownMetadata(KeepIDs);
 }
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 40104e8fb4249..092a0fb264c28 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1705,10 +1705,7 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
       // time in isGuaranteedToExecute if we don't actually have anything to
       // drop.  It is a compile time optimization, not required for correctness.
       !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) {
-    if (ProfcheckDisableMetadataFixes)
-      I.dropUBImplyingAttrsAndMetadata();
-    else
-      I.dropUBImplyingAttrsAndMetadata({LLVMContext::MD_prof});
+    I.dropUBImplyingAttrsAndMetadata();
   }
 
   if (isa<PHINode>(I))
diff --git a/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll b/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
index 2c5889a981db2..08397b5755a3f 100644
--- a/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
+++ b/llvm/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
@@ -1,20 +1,21 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; Test merging of blocks that only have PHI nodes in them
 ;
 ; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
 ;
 
 define i32 @test(i1 %a, i1 %b) {
-; CHECK-LABEL: @test(
-; CHECK:       M:
-; CHECK-NEXT:    [[DOT:%.*]] = select i1 %b, i32 0, i32 1
-; CHECK-NEXT:    [[W:%.*]] = select i1 %a, i32 2, i32 [[DOT]]
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i1 [[A:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:  [[M:.*:]]
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[B]], i32 0, i32 1, !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT:    [[W:%.*]] = select i1 [[A]], i32 2, i32 [[SPEC_SELECT]], !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[R:%.*]] = add i32 [[W]], 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
-  br i1 %a, label %M, label %O
+  br i1 %a, label %M, label %O, !prof !0
 O:              ; preds = %0
-  br i1 %b, label %N, label %Q
+  br i1 %b, label %N, label %Q, !prof !1
 Q:              ; preds = %O
   br label %N
 N:              ; preds = %Q, %O
@@ -27,3 +28,9 @@ M:              ; preds = %N, %0
   ret i32 %R
 }
 
+!0 = !{!"branch_weights", i32 11, i32 7}
+!1 = !{!"branch_weights", i32 3, i32 5}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 3, i32 5}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 11, i32 7}
+;.

>From 8f25ea2d73d9a4a64e7ab26e6b1d7a8f73605713 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Fri, 12 Sep 2025 15:05:16 -0700
Subject: [PATCH 08/39] [NFC] Leave a comment in `Local.cpp` about debug info &
 sample profiling (#155296)

Issue #152767
---
 llvm/lib/Transforms/Utils/Local.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 2cfd70a1746c8..57dc1b38b8ec3 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3342,8 +3342,11 @@ void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
   // retain their original debug locations (DILocations) and debug intrinsic
   // instructions.
   //
-  // Doing so would degrade the debugging experience and adversely affect the
-  // accuracy of profiling information.
+  // Doing so would degrade the debugging experience.
+  //
+  // FIXME: Issue #152767: debug info should also be the same as the
+  // original branch, **if** the user explicitly indicated that (for sampling
+  // PGO)
   //
   // Currently, when hoisting the instructions, we take the following actions:
   // - Remove their debug intrinsic instructions.

>From 0d4a615998a7d5a6ad1f2866e9f3276acfc70fc0 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Fri, 12 Sep 2025 15:07:25 -0700
Subject: [PATCH 09/39] [InstCombine] Make test resilient to metadata presence
 (#157607)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modernized it to use `update_test_checks`, which addresses an ambiguity
in the previous test formulation, where a profile metadata of value `i32
1` would have (incorrectly) matched.
---
 .../InstCombine/2004-09-20-BadLoadCombine2.ll | 38 ++++++++++++-------
 llvm/utils/profcheck-xfail.txt                |  1 -
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll b/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
index f558e35ebe015..1d89dd6195032 100644
--- a/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
+++ b/llvm/test/Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
@@ -1,25 +1,35 @@
-; RUN: opt < %s -passes=instcombine,mem2reg,simplifycfg -simplifycfg-require-and-preserve-domtree=1 | \
-; RUN:   llvm-dis | grep -v store | not grep "i32 1"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt %s -passes=instcombine,mem2reg,simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -o - | FileCheck %s
 
 ; Test to make sure that instcombine does not accidentally propagate the load
 ; into the PHI, which would break the program.
 
 define i32 @test(i1 %C) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[X2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, ptr [[X]], align 4
+; CHECK-NEXT:    store i32 2, ptr [[X2]], align 4
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C]], ptr [[X]], ptr [[X2]]
+; CHECK-NEXT:    store i32 3, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP_3:%.*]] = load i32, ptr [[SPEC_SELECT]], align 4
+; CHECK-NEXT:    ret i32 [[TMP_3]]
+;
 entry:
-        %X = alloca i32         ; <ptr> [#uses=3]
-        %X2 = alloca i32                ; <ptr> [#uses=2]
-        store i32 1, ptr %X
-        store i32 2, ptr %X2
-        br i1 %C, label %cond_true.i, label %cond_continue.i
+  %X = alloca i32         ; <ptr> [#uses=3]
+  %X2 = alloca i32                ; <ptr> [#uses=2]
+  store i32 1, ptr %X
+  store i32 2, ptr %X2
+  br i1 %C, label %cond_true.i, label %cond_continue.i
 
 cond_true.i:            ; preds = %entry
-        br label %cond_continue.i
+  br label %cond_continue.i
 
 cond_continue.i:                ; preds = %cond_true.i, %entry
-        %mem_tmp.i.0 = phi ptr [ %X, %cond_true.i ], [ %X2, %entry ]           ; <ptr> [#uses=1]
-        store i32 3, ptr %X
-        %tmp.3 = load i32, ptr %mem_tmp.i.0         ; <i32> [#uses=1]
-        ret i32 %tmp.3
+  %mem_tmp.i.0 = phi ptr [ %X, %cond_true.i ], [ %X2, %entry ]           ; <ptr> [#uses=1]
+  store i32 3, ptr %X
+  %tmp.3 = load i32, ptr %mem_tmp.i.0         ; <i32> [#uses=1]
+  ret i32 %tmp.3
 }
-
-
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index e1ee7c3664a51..9d170b392b6c7 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -830,7 +830,6 @@ Transforms/IndVarSimplify/invalidate-modified-lcssa-phi.ll
 Transforms/IndVarSimplify/pr45835.ll
 Transforms/IndVarSimplify/preserving-debugloc-rem-div.ll
 Transforms/Inline/optimization-remarks-hotness-threshold.ll
-Transforms/InstCombine/2004-09-20-BadLoadCombine2.ll
 Transforms/InstCombine/2004-09-20-BadLoadCombine.ll
 Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll
 Transforms/InstCombine/2011-02-14-InfLoop.ll

>From 8ee31ab00b95fc58110956f8945b0232045e8d86 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa <rniwa at webkit.org>
Date: Fri, 12 Sep 2025 15:08:23 -0700
Subject: [PATCH 10/39] [WebKit checkers] Treat function pointers with
 "Singleton" suffix as singleton. (#158012)

---
 .../Checkers/WebKit/ASTUtils.cpp              |  5 ++++
 .../Checkers/WebKit/PtrTypesSemantics.cpp     |  2 +-
 .../Checkers/WebKit/PtrTypesSemantics.h       |  3 +-
 .../Checkers/WebKit/unretained-call-args.mm   | 28 +++++++++++++++++++
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index 6f13d552b4b44..b629de3254ed3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -160,6 +160,11 @@ bool tryToFindPtrOrigin(
         if (Name == "__builtin___CFStringMakeConstantString" ||
             Name == "NSClassFromString")
           return callback(E, true);
+      } else if (auto *CalleeE = call->getCallee()) {
+        if (auto *E = dyn_cast<DeclRefExpr>(CalleeE->IgnoreParenCasts())) {
+          if (isSingleton(E->getFoundDecl()))
+            return callback(E, true);
+        }
       }
 
       // Sometimes, canonical type erroneously turns Ref<T> into T.
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 56747d72136e3..90b2343b4be77 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -479,7 +479,7 @@ bool isTrivialBuiltinFunction(const FunctionDecl *F) {
          Name.starts_with("os_log") || Name.starts_with("_os_log");
 }
 
-bool isSingleton(const FunctionDecl *F) {
+bool isSingleton(const NamedDecl *F) {
   assert(F);
   // FIXME: check # of params == 1
   if (auto *MethodDecl = dyn_cast<CXXMethodDecl>(F)) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 3c9560cb8059b..d2095d07e1434 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -21,6 +21,7 @@ class CXXMethodDecl;
 class CXXRecordDecl;
 class Decl;
 class FunctionDecl;
+class NamedDecl;
 class QualType;
 class RecordType;
 class Stmt;
@@ -156,7 +157,7 @@ bool isPtrConversion(const FunctionDecl *F);
 bool isTrivialBuiltinFunction(const FunctionDecl *F);
 
 /// \returns true if \p F is a static singleton function.
-bool isSingleton(const FunctionDecl *F);
+bool isSingleton(const NamedDecl *F);
 
 /// An inter-procedural analysis facility that detects functions with "trivial"
 /// behavior with respect to reference counting, such as simple field getters.
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
index f39822ee2a8c6..75eead070fdf9 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
@@ -438,6 +438,34 @@ void use_const_local() {
 
 } // namespace const_global
 
+namespace var_decl_ref_singleton {
+
+static Class initSomeObject() { return nil; }
+static Class (*getSomeObjectClassSingleton)() = initSomeObject;
+
+bool foo(NSString *obj) {
+  return [obj isKindOfClass:getSomeObjectClassSingleton()];
+}
+
+class Bar {
+public:
+  Class someObject();
+  static Class staticSomeObject();
+};
+typedef Class (Bar::*SomeObjectSingleton)();
+
+bool bar(NSObject *obj, Bar *bar, SomeObjectSingleton someObjSingleton) {
+  return [obj isKindOfClass:(bar->*someObjSingleton)()];
+  // expected-warning at -1{{Call argument for parameter 'aClass' is unretained and unsafe}}
+}
+
+bool baz(NSObject *obj) {
+  Class (*someObjectSingleton)() = Bar::staticSomeObject;
+  return [obj isKindOfClass:someObjectSingleton()];
+}
+
+} // namespace var_decl_ref_singleton
+
 namespace ns_retained_return_value {
 
 NSString *provideNS() NS_RETURNS_RETAINED;

>From ba3bce0779fa195867aa804146c2ec24cfaf9976 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Fri, 12 Sep 2025 15:25:28 -0700
Subject: [PATCH 11/39] [Github] Switch back to tj-actions/changed-files
 (#158335)

We were using the step security fork after the tj-actions/changed-files
supply chain attack given Github disabled the repo and all our actions
were failing during that time. Switch away from the fork back to the
main repository to avoid an extra level of indirection until we can
probably just stop using this action/roll our own.
---
 .github/workflows/docs.yml           | 2 +-
 .github/workflows/pr-code-format.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index b627803f61b27..8cdd39c164cca 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -60,7 +60,7 @@ jobs:
           fetch-depth: 2
       - name: Get subprojects that have doc changes
         id: docs-changed-subprojects
-        uses: step-security/changed-files at 3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
+        uses: tj-actions/changed-files at ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
         with:
           skip_initial_fetch: true
           base_sha: 'HEAD~1'
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index 9341eaf3ce7c2..9396bf019e1ac 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -25,7 +25,7 @@ jobs:
 
       - name: Get changed files
         id: changed-files
-        uses: step-security/changed-files at 3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
+        uses: tj-actions/changed-files at ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
         with:
           separator: ","
           skip_initial_fetch: true

>From 13eecf7f9f42dfded46d8feaa01bc77962d10845 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 15:29:28 -0700
Subject: [PATCH 12/39] [RISCV] Use hasBEXTILike in useInversedSetcc and
 shouldFoldSelectWithSingleBitTest. (#158366)

Add hasVendorXTHeadCondMov to shouldFoldSelectWithSingleBitTest.

The optimizations in these functions are equally applicable to these extensions.

I changed the RUN line for xtheadcondmove in condops.ll to use XTHeadBs
to get coverage of the hasBEXTILike changes. I didn't think it was
worth an additional RUN line and check prefix.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  6 +++---
 llvm/test/CodeGen/RISCV/condops.ll          | 19 ++++++++-----------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 523b857f9e6cd..c3071ad5cd697 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18950,7 +18950,7 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG,
 
   // Replace (setcc eq (and x, C)) with (setcc ne (and x, C))) to generate
   // BEXTI, where C is power of 2.
-  if (Subtarget.hasStdExtZbs() && VT.isScalarInteger() &&
+  if (Subtarget.hasBEXTILike() && VT.isScalarInteger() &&
       (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov())) {
     SDValue LHS = Cond.getOperand(0);
     SDValue RHS = Cond.getOperand(1);
@@ -24939,8 +24939,8 @@ RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
 
 bool RISCVTargetLowering::shouldFoldSelectWithSingleBitTest(
     EVT VT, const APInt &AndMask) const {
-  if (Subtarget.hasCZEROLike())
-    return !Subtarget.hasStdExtZbs() && AndMask.ugt(1024);
+  if (Subtarget.hasCZEROLike() || Subtarget.hasVendorXTHeadCondMov())
+    return !Subtarget.hasBEXTILike() && AndMask.ugt(1024);
   return TargetLowering::shouldFoldSelectWithSingleBitTest(VT, AndMask);
 }
 
diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index 4fb3dff88017c..9d95f1f5c9615 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV64I
 ; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV32XVENTANACONDOPS
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS
-; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV
+; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+xtheadbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV
 ; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND
 ; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND
 
@@ -126,7 +126,7 @@ define i64 @zero_singlebit1(i64 %rs1, i64 %rs2) {
 ;
 ; RV64XTHEADCONDMOV-LABEL: zero_singlebit1:
 ; RV64XTHEADCONDMOV:       # %bb.0:
-; RV64XTHEADCONDMOV-NEXT:    bexti a1, a1, 12
+; RV64XTHEADCONDMOV-NEXT:    th.tst a1, a1, 12
 ; RV64XTHEADCONDMOV-NEXT:    th.mvnez a0, zero, a1
 ; RV64XTHEADCONDMOV-NEXT:    ret
 ;
@@ -179,9 +179,8 @@ define i64 @zero_singlebit2(i64 %rs1, i64 %rs2) {
 ;
 ; RV64XTHEADCONDMOV-LABEL: zero_singlebit2:
 ; RV64XTHEADCONDMOV:       # %bb.0:
-; RV64XTHEADCONDMOV-NEXT:    slli a1, a1, 51
-; RV64XTHEADCONDMOV-NEXT:    srai a1, a1, 63
-; RV64XTHEADCONDMOV-NEXT:    and a0, a1, a0
+; RV64XTHEADCONDMOV-NEXT:    th.tst a1, a1, 12
+; RV64XTHEADCONDMOV-NEXT:    th.mveqz a0, zero, a1
 ; RV64XTHEADCONDMOV-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: zero_singlebit2:
@@ -4297,9 +4296,8 @@ define i64 @single_bit(i64 %x) {
 ;
 ; RV64XTHEADCONDMOV-LABEL: single_bit:
 ; RV64XTHEADCONDMOV:       # %bb.0: # %entry
-; RV64XTHEADCONDMOV-NEXT:    slli a1, a0, 53
-; RV64XTHEADCONDMOV-NEXT:    srai a1, a1, 63
-; RV64XTHEADCONDMOV-NEXT:    and a0, a1, a0
+; RV64XTHEADCONDMOV-NEXT:    andi a1, a0, 1024
+; RV64XTHEADCONDMOV-NEXT:    th.mveqz a0, zero, a1
 ; RV64XTHEADCONDMOV-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: single_bit:
@@ -4353,9 +4351,8 @@ define i64 @single_bit2(i64 %x) {
 ;
 ; RV64XTHEADCONDMOV-LABEL: single_bit2:
 ; RV64XTHEADCONDMOV:       # %bb.0: # %entry
-; RV64XTHEADCONDMOV-NEXT:    slli a1, a0, 52
-; RV64XTHEADCONDMOV-NEXT:    srai a1, a1, 63
-; RV64XTHEADCONDMOV-NEXT:    and a0, a1, a0
+; RV64XTHEADCONDMOV-NEXT:    th.tst a1, a0, 11
+; RV64XTHEADCONDMOV-NEXT:    th.mveqz a0, zero, a1
 ; RV64XTHEADCONDMOV-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: single_bit2:

>From 1131e44ed3f5fadb2d22ff155d4e47f69757d02f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 15:29:44 -0700
Subject: [PATCH 13/39] [RISCV] Use hasCPOPLike in isCtpopFast and
 getPopcntSupport (#158371)

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  3 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  4 +-
 llvm/test/CodeGen/RISCV/xcvbitmanip.ll        | 47 +++++++++++++++++++
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c3071ad5cd697..f9b484b98739f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24844,8 +24844,7 @@ bool RISCVTargetLowering::isCtpopFast(EVT VT) const {
     return isTypeLegal(VT) && Subtarget.hasStdExtZvbb();
   if (VT.isFixedLengthVector() && Subtarget.hasStdExtZvbb())
     return true;
-  // FIXME: Should use hasCPOPLike here.
-  return Subtarget.hasStdExtZbb() &&
+  return Subtarget.hasCPOPLike() &&
          (VT == MVT::i32 || VT == MVT::i64 || VT.isFixedLengthVector());
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 1ca513214f67c..a06faa414a2ef 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -289,9 +289,7 @@ bool RISCVTTIImpl::hasActiveVectorLength() const {
 TargetTransformInfo::PopcntSupportKind
 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
-  return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
-             ? TTI::PSK_FastHardware
-             : TTI::PSK_Software;
+  return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
 }
 
 InstructionCost RISCVTTIImpl::getPartialReductionCost(
diff --git a/llvm/test/CodeGen/RISCV/xcvbitmanip.ll b/llvm/test/CodeGen/RISCV/xcvbitmanip.ll
index d25ff28475c4b..b2cebabb7df8b 100644
--- a/llvm/test/CodeGen/RISCV/xcvbitmanip.ll
+++ b/llvm/test/CodeGen/RISCV/xcvbitmanip.ll
@@ -229,3 +229,50 @@ define i32 @test.llvm.bitrev(i32 %a) {
   %1 = call i32 @llvm.bitreverse(i32 %a)
   ret i32 %1
 }
+
+define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
+; CHECK-LABEL: ctpop_i32_ult_two:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cv.cnt a0, a0
+; CHECK-NEXT:    sltiu a0, a0, 2
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.ctpop.i32(i32 %a)
+  %2 = icmp ult i32 %1, 2
+  ret i1 %2
+}
+
+define i1 @ctpop_i32_ugt_one(i32 signext %a) nounwind {
+; CHECK-LABEL: ctpop_i32_ugt_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cv.cnt a0, a0
+; CHECK-NEXT:    sltiu a0, a0, 2
+; CHECK-NEXT:    xori a0, a0, 1
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.ctpop.i32(i32 %a)
+  %2 = icmp ugt i32 %1, 1
+  ret i1 %2
+}
+
+define i1 @ctpop_i32_eq_one(i32 signext %a) nounwind {
+; CHECK-LABEL: ctpop_i32_eq_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cv.cnt a0, a0
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    seqz a0, a0
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.ctpop.i32(i32 %a)
+  %2 = icmp eq i32 %1, 1
+  ret i1 %2
+}
+
+define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind {
+; CHECK-LABEL: ctpop_i32_ne_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cv.cnt a0, a0
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    snez a0, a0
+; CHECK-NEXT:    ret
+  %1 = call i32 @llvm.ctpop.i32(i32 %a)
+  %2 = icmp ne i32 %1, 1
+  ret i1 %2
+}

>From 52c583b3f95a0e666ab837e39a5db900b66adf15 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl at google.com>
Date: Fri, 12 Sep 2025 15:58:16 -0700
Subject: [PATCH 14/39] [SampleFDO][TypeProf]Support vtable type profiling for
 ext-binary and text format (#148002)

This change extends SampleFDO ext-binary and text format to record the
vtable symbols and their counts for virtual calls inside a function. The
vtable profiles will allow the compiler to annotate vtable types on IR
instructions and perform vtable-based indirect call promotion. An RFC is
in
https://discourse.llvm.org/t/rfc-vtable-type-profiling-for-samplefdo/87283

Given a function below, the before vs after of a function's profile is
illustrated in text format in the table:

```
__attribute__((noinline)) int loop_func(int i, int a, int b) {
    Base *ptr = createType(i);

    int sum = ptr->func(a, b);

    delete ptr;

    return sum;
}
```

| before | after |
| --- | --- |
| Samples collected in the function's body { <br> 0: 636241 <br> 1:
681458, calls: _Z10createTypei:681458 <br> 3: 543499, calls:
_ZN12_GLOBAL__N_18Derived24funcEii:410621 _ZN8Derived14funcEii:132878
<br> 5.1: 602201, calls: _ZN12_GLOBAL__N_18Derived2D0Ev:454635
_ZN8Derived1D0Ev:147566 <br> 7: 511057 <br> } | Samples collected in the
function's body { <br> 0: 636241 <br> 1: 681458, calls:
_Z10createTypei:681458 <br> 3: 543499, calls:
_ZN12_GLOBAL__N_18Derived24funcEii:410621 _ZN8Derived14funcEii:132878
<br> 3: vtables: _ZTV8Derived1:1377 _ZTVN12_GLOBAL__N_18Derived2E:4250
<br> 5.1: 602201, calls: _ZN12_GLOBAL__N_18Derived2D0Ev:454635
_ZN8Derived1D0Ev:147566 <br> 5.1: vtables: _ZTV8Derived1:227
_ZTVN12_GLOBAL__N_18Derived2E:765 <br> 7: 511057 <br> } |

Key points for this change:
1. In-memory representation of vtable profiles
* A field of type `map<LineLocation, map<FunctionId, uint64_t>>` is
introduced in a function's in-memory representation
[FunctionSamples](https://github.com/llvm/llvm-project/blob/ccc416312ed72e92a885425d9cb9c01f9afa58eb/llvm/include/llvm/ProfileData/SampleProf.h#L749-L754).
2. The vtable counters for one LineLocation represent the relative
frequency among vtables for this LineLocation. They are not required to
be comparable across LineLocations.
3. For backward compatibility of ext-binary format, we take one bit from
ProfSummaryFlag as illustrated in the enum class `SecProfSummaryFlags`.
The ext-binary profile reader parses the integer type flag and reads
this bit. If it's set, the profile reader will parse vtable profiles.
4. The vtable profiles are optional in ext-binary format and are not
serialized out by default. We introduce an LLVM boolean option (named
`-extbinary-write-vtable-type-prof`). The ext-binary profile writer
reads the boolean option and decides whether to set the section flag bit
and serialize the in-memory class members corresponding to vtables.
5. This change doesn't implement `llvm-profdata overlap --sample` for
the vtable profiles. A subsequent change will do it to keep this one
focused on the profile format change.

We don't plan to add the vtable support to non-extensible format mainly
because of the maintenance cost to keep backward compatibility for prior
versions of profile data.
* Currently, the [non-extensible binary
format](https://github.com/llvm/llvm-project/blob/5c28af409978c19a35021855a29dcaa65e95da00/llvm/lib/ProfileData/SampleProfWriter.cpp#L899-L900)
does not have feature parity with the extensible binary format; for
instance, the former doesn't support [profile symbol
list](https://github.com/llvm/llvm-project/blob/41e22aa31b1905aa3e9d83c0343a96ec0d5187ec/llvm/include/llvm/ProfileData/SampleProf.h#L1518-L1522)
or context-sensitive PGO, both of which give measurable performance
boost. Presumably the non-extensible format is not in wide use.

---------

Co-authored-by: Paschalis Mpeis <paschalis.mpeis at arm.com>
---
 llvm/include/llvm/ProfileData/SampleProf.h    | 101 ++++++++++++++-
 .../llvm/ProfileData/SampleProfReader.h       |  12 ++
 .../llvm/ProfileData/SampleProfWriter.h       |  14 +-
 llvm/lib/ProfileData/SampleProf.cpp           |  40 ++++++
 llvm/lib/ProfileData/SampleProfReader.cpp     | 120 +++++++++++++++++-
 llvm/lib/ProfileData/SampleProfWriter.cpp     |  80 +++++++++++-
 .../Inputs/profile-symbol-list-ext.expected   |  44 +++++++
 .../Inputs/sample-profile-ext.proftext        |  18 +++
 .../profile-symbol-list-compress.test         |   9 ++
 .../llvm-profdata/profile-symbol-list.test    |   9 ++
 llvm/test/tools/llvm-profdata/roundtrip.test  |   6 +
 11 files changed, 436 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected
 create mode 100644 llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext

diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index a626071d23915..c0e5d2d79cea2 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -62,7 +62,7 @@ enum class sampleprof_error {
   uncompress_failed,
   zlib_unavailable,
   hash_mismatch,
-  illegal_line_offset
+  illegal_line_offset,
 };
 
 inline std::error_code make_error_code(sampleprof_error E) {
@@ -91,6 +91,8 @@ struct is_error_code_enum<llvm::sampleprof_error> : std::true_type {};
 namespace llvm {
 namespace sampleprof {
 
+constexpr char kVTableProfPrefix[] = "vtables ";
+
 enum SampleProfileFormat {
   SPF_None = 0,
   SPF_Text = 0x1,
@@ -204,6 +206,9 @@ enum class SecProfSummaryFlags : uint32_t {
   /// SecFlagIsPreInlined means this profile contains ShouldBeInlined
   /// contexts thus this is CS preinliner computed.
   SecFlagIsPreInlined = (1 << 4),
+
+  /// SecFlagHasVTableTypeProf means this profile contains vtable type profiles.
+  SecFlagHasVTableTypeProf = (1 << 5),
 };
 
 enum class SecFuncMetadataFlags : uint32_t {
@@ -303,7 +308,7 @@ struct LineLocation {
   }
 
   uint64_t getHashCode() const {
-    return ((uint64_t) Discriminator << 32) | LineOffset;
+    return ((uint64_t)Discriminator << 32) | LineOffset;
   }
 
   uint32_t LineOffset;
@@ -318,16 +323,30 @@ struct LineLocationHash {
 
 LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const LineLocation &Loc);
 
+/// Key represents a C++ polymorphic class type by its vtable, and value
+/// represents its counter.
+/// TODO: The class name FunctionId should be renamed to SymbolId in a refactor
+/// change.
+using TypeCountMap = std::map<FunctionId, uint64_t>;
+
+/// Write \p Map to the output stream. Keys are linearized using \p NameTable
+/// and written as ULEB128. Values are written as ULEB128 as well.
+std::error_code
+serializeTypeMap(const TypeCountMap &Map,
+                 const MapVector<FunctionId, uint32_t> &NameTable,
+                 raw_ostream &OS);
+
 /// Representation of a single sample record.
 ///
 /// A sample record is represented by a positive integer value, which
 /// indicates how frequently was the associated line location executed.
 ///
 /// Additionally, if the associated location contains a function call,
-/// the record will hold a list of all the possible called targets. For
-/// direct calls, this will be the exact function being invoked. For
-/// indirect calls (function pointers, virtual table dispatch), this
-/// will be a list of one or more functions.
+/// the record will hold a list of all the possible called targets and the types
+/// for virtual table dispatches. For direct calls, this will be the exact
+/// function being invoked. For indirect calls (function pointers, virtual table
+/// dispatch), this will be a list of one or more functions. For virtual table
+/// dispatches, this record will also hold the type of the object.
 class SampleRecord {
 public:
   using CallTarget = std::pair<FunctionId, uint64_t>;
@@ -746,6 +765,7 @@ using BodySampleMap = std::map<LineLocation, SampleRecord>;
 // memory, which is *very* significant for large profiles.
 using FunctionSamplesMap = std::map<FunctionId, FunctionSamples>;
 using CallsiteSampleMap = std::map<LineLocation, FunctionSamplesMap>;
+using CallsiteTypeMap = std::map<LineLocation, TypeCountMap>;
 using LocToLocMap =
     std::unordered_map<LineLocation, LineLocation, LineLocationHash>;
 
@@ -939,6 +959,14 @@ class FunctionSamples {
     return &Iter->second;
   }
 
+  /// Returns the TypeCountMap for inlined callsites at the given \p Loc.
+  const TypeCountMap *findCallsiteTypeSamplesAt(const LineLocation &Loc) const {
+    auto Iter = VirtualCallsiteTypeCounts.find(mapIRLocToProfileLoc(Loc));
+    if (Iter == VirtualCallsiteTypeCounts.end())
+      return nullptr;
+    return &Iter->second;
+  }
+
   /// Returns a pointer to FunctionSamples at the given callsite location
   /// \p Loc with callee \p CalleeName. If no callsite can be found, relax
   /// the restriction to return the FunctionSamples at callsite location
@@ -1000,6 +1028,46 @@ class FunctionSamples {
     return CallsiteSamples;
   }
 
+  /// Returns vtable access samples for the C++ types collected in this
+  /// function.
+  const CallsiteTypeMap &getCallsiteTypeCounts() const {
+    return VirtualCallsiteTypeCounts;
+  }
+
+  /// Returns the vtable access samples for the C++ types for \p Loc.
+  /// Under the hood, the caller-specified \p Loc will be un-drifted before the
+  /// type sample lookup if possible.
+  TypeCountMap &getTypeSamplesAt(const LineLocation &Loc) {
+    return VirtualCallsiteTypeCounts[mapIRLocToProfileLoc(Loc)];
+  }
+
+  /// Scale \p Other sample counts by \p Weight and add the scaled result to the
+  /// type samples for \p Loc. Under the hood, the caller-provided \p Loc will
+  /// be un-drifted before the type sample lookup if possible.
+  /// typename T is either a std::map or a DenseMap.
+  template <typename T>
+  sampleprof_error addCallsiteVTableTypeProfAt(const LineLocation &Loc,
+                                               const T &Other,
+                                               uint64_t Weight = 1) {
+    static_assert((std::is_same_v<typename T::key_type, StringRef> ||
+                   std::is_same_v<typename T::key_type, FunctionId>) &&
+                      std::is_same_v<typename T::mapped_type, uint64_t>,
+                  "T must be a map with StringRef or FunctionId as key and "
+                  "uint64_t as value");
+    TypeCountMap &TypeCounts = getTypeSamplesAt(Loc);
+    bool Overflowed = false;
+
+    for (const auto [Type, Count] : Other) {
+      FunctionId TypeId(Type);
+      bool RowOverflow = false;
+      TypeCounts[TypeId] = SaturatingMultiplyAdd(
+          Count, Weight, TypeCounts[TypeId], &RowOverflow);
+      Overflowed |= RowOverflow;
+    }
+    return Overflowed ? sampleprof_error::counter_overflow
+                      : sampleprof_error::success;
+  }
+
   /// Return the maximum of sample counts in a function body. When SkipCallSite
   /// is false, which is the default, the return count includes samples in the
   /// inlined functions. When SkipCallSite is true, the return count only
@@ -1054,6 +1122,10 @@ class FunctionSamples {
         mergeSampleProfErrors(Result,
                               FSMap[Rec.first].merge(Rec.second, Weight));
     }
+    for (const auto &[Loc, OtherTypeMap] : Other.getCallsiteTypeCounts())
+      mergeSampleProfErrors(
+          Result, addCallsiteVTableTypeProfAt(Loc, OtherTypeMap, Weight));
+
     return Result;
   }
 
@@ -1297,6 +1369,23 @@ class FunctionSamples {
   /// collected in the call to baz() at line offset 8.
   CallsiteSampleMap CallsiteSamples;
 
+  /// Map a virtual callsite to the list of accessed vtables and vtable counts.
+  /// The callsite is referenced by its source location.
+  ///
+  /// For example, given:
+  ///
+  ///     void foo() {
+  ///       ...
+  ///  5    inlined_vcall_bar();
+  ///       ...
+  ///  5    inlined_vcall_baz();
+  ///       ...
+  ///  200  inlined_vcall_qux();
+  ///     }
+  /// This map will contain two entries. One with two types for line offset 5
+  /// and one with one type for line offset 200.
+  CallsiteTypeMap VirtualCallsiteTypeCounts;
+
   /// IR to profile location map generated by stale profile matching.
   ///
   /// Each entry is a mapping from the location on current build to the matched
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index bfe079fbe536f..799938ab901c1 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -589,6 +589,10 @@ class SampleProfileReader {
   /// Whether the function profiles use FS discriminators.
   bool ProfileIsFS = false;
 
+  /// If true, the profile has vtable profiles and reader should decode them
+  /// to parse profiles correctly.
+  bool ReadVTableProf = false;
+
   /// \brief The format of sample.
   SampleProfileFormat Format = SPF_None;
 
@@ -703,6 +707,14 @@ class LLVM_ABI SampleProfileReaderBinary : public SampleProfileReader {
   /// otherwise same as readStringFromTable, also return its hash value.
   ErrorOr<std::pair<SampleContext, uint64_t>> readSampleContextFromTable();
 
+  /// Read all virtual functions' vtable access counts for \p FProfile.
+  std::error_code readCallsiteVTableProf(FunctionSamples &FProfile);
+
+  /// Read bytes from the input buffer pointed to by `Data` and decode them into
+  /// \p M. `Data` will be advanced to the end of the read bytes when this
+  /// function returns. Returns error if any.
+  std::error_code readVTableTypeCountMap(TypeCountMap &M);
+
   /// Points to the current location in the buffer.
   const uint8_t *Data = nullptr;
 
diff --git a/llvm/include/llvm/ProfileData/SampleProfWriter.h b/llvm/include/llvm/ProfileData/SampleProfWriter.h
index e84b2095efd7b..9dbeaf56509b0 100644
--- a/llvm/include/llvm/ProfileData/SampleProfWriter.h
+++ b/llvm/include/llvm/ProfileData/SampleProfWriter.h
@@ -217,13 +217,20 @@ class LLVM_ABI SampleProfileWriterBinary : public SampleProfileWriter {
   std::error_code writeBody(const FunctionSamples &S);
   inline void stablizeNameTable(MapVector<FunctionId, uint32_t> &NameTable,
                                 std::set<FunctionId> &V);
-  
+
   MapVector<FunctionId, uint32_t> NameTable;
-  
+
   void addName(FunctionId FName);
   virtual void addContext(const SampleContext &Context);
   void addNames(const FunctionSamples &S);
 
+  /// Write \p CallsiteTypeMap to the output stream \p OS.
+  std::error_code
+  writeCallsiteVTableProf(const CallsiteTypeMap &CallsiteTypeMap,
+                          raw_ostream &OS);
+
+  bool WriteVTableProf = false;
+
 private:
   LLVM_ABI friend ErrorOr<std::unique_ptr<SampleProfileWriter>>
   SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
@@ -412,8 +419,7 @@ class LLVM_ABI SampleProfileWriterExtBinaryBase
 class LLVM_ABI SampleProfileWriterExtBinary
     : public SampleProfileWriterExtBinaryBase {
 public:
-  SampleProfileWriterExtBinary(std::unique_ptr<raw_ostream> &OS)
-      : SampleProfileWriterExtBinaryBase(OS) {}
+  SampleProfileWriterExtBinary(std::unique_ptr<raw_ostream> &OS);
 
 private:
   std::error_code writeDefaultLayout(const SampleProfileMap &ProfileMap);
diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp
index 60c1393616713..ac7513ef2cb49 100644
--- a/llvm/lib/ProfileData/SampleProf.cpp
+++ b/llvm/lib/ProfileData/SampleProf.cpp
@@ -47,6 +47,24 @@ bool FunctionSamples::ProfileIsPreInlined = false;
 bool FunctionSamples::UseMD5 = false;
 bool FunctionSamples::HasUniqSuffix = true;
 bool FunctionSamples::ProfileIsFS = false;
+
+std::error_code
+serializeTypeMap(const TypeCountMap &Map,
+                 const MapVector<FunctionId, uint32_t> &NameTable,
+                 raw_ostream &OS) {
+  encodeULEB128(Map.size(), OS);
+  for (const auto &[TypeName, SampleCount] : Map) {
+    if (auto NameIndexIter = NameTable.find(TypeName);
+        NameIndexIter != NameTable.end()) {
+      encodeULEB128(NameIndexIter->second, OS);
+    } else {
+      // If the type is not in the name table, we cannot serialize it.
+      return sampleprof_error::truncated_name_table;
+    }
+    encodeULEB128(SampleCount, OS);
+  }
+  return sampleprof_error::success;
+}
 } // namespace sampleprof
 } // namespace llvm
 
@@ -178,6 +196,17 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
   return OS;
 }
 
+static void printTypeCountMap(raw_ostream &OS, LineLocation Loc,
+                              const TypeCountMap &TypeCountMap) {
+  if (TypeCountMap.empty()) {
+    return;
+  }
+  OS << Loc << ": vtables: ";
+  for (const auto &[Type, Count] : TypeCountMap)
+    OS << Type << ":" << Count << " ";
+  OS << "\n";
+}
+
 /// Print the samples collected for a function on stream \p OS.
 void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
   if (getFunctionHash())
@@ -192,7 +221,13 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
     SampleSorter<LineLocation, SampleRecord> SortedBodySamples(BodySamples);
     for (const auto &SI : SortedBodySamples.get()) {
       OS.indent(Indent + 2);
+      const auto &Loc = SI->first;
       OS << SI->first << ": " << SI->second;
+      if (const TypeCountMap *TypeCountMap =
+              this->findCallsiteTypeSamplesAt(Loc)) {
+        OS.indent(Indent + 2);
+        printTypeCountMap(OS, Loc, *TypeCountMap);
+      }
     }
     OS.indent(Indent);
     OS << "}\n";
@@ -214,6 +249,11 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
         OS << Loc << ": inlined callee: " << FuncSample.getFunction() << ": ";
         FuncSample.print(OS, Indent + 4);
       }
+      auto TypeSamplesIter = VirtualCallsiteTypeCounts.find(Loc);
+      if (TypeSamplesIter != VirtualCallsiteTypeCounts.end()) {
+        OS.indent(Indent + 2);
+        printTypeCountMap(OS, Loc, TypeSamplesIter->second);
+      }
     }
     OS.indent(Indent);
     OS << "}\n";
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 12769a391286c..81ae792e70b99 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -197,8 +197,37 @@ enum class LineType {
   CallSiteProfile,
   BodyProfile,
   Metadata,
+  VirtualCallTypeProfile,
 };
 
+// Parse `Input` as a white-space separated list of `vtable:count` pairs. An
+// example input line is `_ZTVbar:1471 _ZTVfoo:630`.
+static bool parseTypeCountMap(StringRef Input,
+                              DenseMap<StringRef, uint64_t> &TypeCountMap) {
+  for (size_t Index = Input.find_first_not_of(' '); Index != StringRef::npos;) {
+    size_t ColonIndex = Input.find(':', Index);
+    if (ColonIndex == StringRef::npos)
+      return false; // No colon found, invalid format.
+    StringRef TypeName = Input.substr(Index, ColonIndex - Index);
+    // CountStartIndex is the start index of the count.
+    size_t CountStartIndex = ColonIndex + 1;
+    // NextIndex is the first space after the 'vtable:count' pair (or npos).
+    size_t NextIndex = Input.find_first_of(' ', CountStartIndex);
+    uint64_t Count;
+    if (Input.substr(CountStartIndex, NextIndex - CountStartIndex)
+            .getAsInteger(10, Count))
+      return false; // Invalid count.
+    // Error on duplicated type names in one line of input.
+    auto [Iter, Inserted] = TypeCountMap.insert({TypeName, Count});
+    if (!Inserted)
+      return false;
+    Index = (NextIndex == StringRef::npos)
+                ? StringRef::npos
+                : Input.find_first_not_of(' ', NextIndex);
+  }
+  return true;
+}
+
 /// Parse \p Input as line sample.
 ///
 /// \param Input input line.
@@ -215,6 +244,7 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth,
                       uint64_t &NumSamples, uint32_t &LineOffset,
                       uint32_t &Discriminator, StringRef &CalleeName,
                       DenseMap<StringRef, uint64_t> &TargetCountMap,
+                      DenseMap<StringRef, uint64_t> &TypeCountMap,
                       uint64_t &FunctionHash, uint32_t &Attributes,
                       bool &IsFlat) {
   for (Depth = 0; Input[Depth] == ' '; Depth++)
@@ -306,6 +336,10 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth,
       // Change n3 to the next blank space after colon + integer pair.
       n3 = n4;
     }
+  } else if (Rest.starts_with(kVTableProfPrefix)) {
+    LineTy = LineType::VirtualCallTypeProfile;
+    return parseTypeCountMap(Rest.substr(strlen(kVTableProfPrefix)),
+                             TypeCountMap);
   } else {
     LineTy = LineType::CallSiteProfile;
     size_t n3 = Rest.find_last_of(':');
@@ -374,19 +408,27 @@ std::error_code SampleProfileReaderText::readImpl() {
       uint64_t NumSamples;
       StringRef FName;
       DenseMap<StringRef, uint64_t> TargetCountMap;
+      DenseMap<StringRef, uint64_t> TypeCountMap;
       uint32_t Depth, LineOffset, Discriminator;
       LineType LineTy = LineType::BodyProfile;
       uint64_t FunctionHash = 0;
       uint32_t Attributes = 0;
       bool IsFlat = false;
+      // TODO: Update ParseLine to return an error code instead of a bool and
+      // report it.
       if (!ParseLine(*LineIt, LineTy, Depth, NumSamples, LineOffset,
-                     Discriminator, FName, TargetCountMap, FunctionHash,
-                     Attributes, IsFlat)) {
+                     Discriminator, FName, TargetCountMap, TypeCountMap,
+                     FunctionHash, Attributes, IsFlat)) {
         switch (LineTy) {
         case LineType::Metadata:
           reportError(LineIt.line_number(),
                       "Cannot parse metadata: " + *LineIt);
           break;
+        case LineType::VirtualCallTypeProfile:
+          reportError(LineIt.line_number(),
+                      "Expected 'vtables [mangled_vtable:NUM]+', found " +
+                          *LineIt);
+          break;
         default:
           reportError(LineIt.line_number(),
                       "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " +
@@ -417,6 +459,14 @@ std::error_code SampleProfileReaderText::readImpl() {
         DepthMetadata = 0;
         break;
       }
+
+      case LineType::VirtualCallTypeProfile: {
+        mergeSampleProfErrors(
+            Result, InlineStack.back()->addCallsiteVTableTypeProfAt(
+                        LineLocation(LineOffset, Discriminator), TypeCountMap));
+        break;
+      }
+
       case LineType::BodyProfile: {
         FunctionSamples &FProfile = *InlineStack.back();
         for (const auto &name_count : TargetCountMap) {
@@ -598,6 +648,67 @@ SampleProfileReaderBinary::readSampleContextFromTable() {
   return std::make_pair(Context, Hash);
 }
 
+std::error_code
+SampleProfileReaderBinary::readVTableTypeCountMap(TypeCountMap &M) {
+  auto NumVTableTypes = readNumber<uint32_t>();
+  if (std::error_code EC = NumVTableTypes.getError())
+    return EC;
+
+  for (uint32_t I = 0; I < *NumVTableTypes; ++I) {
+    auto VTableType(readStringFromTable());
+    if (std::error_code EC = VTableType.getError())
+      return EC;
+
+    auto VTableSamples = readNumber<uint64_t>();
+    if (std::error_code EC = VTableSamples.getError())
+      return EC;
+    // The source profile should not have duplicate vtable records at the same
+    // location. In case duplicate vtables are found, reader can emit a warning
+    // but continue processing the profile.
+    if (!M.insert(std::make_pair(*VTableType, *VTableSamples)).second) {
+      Ctx.diagnose(DiagnosticInfoSampleProfile(
+          Buffer->getBufferIdentifier(), 0,
+          "Duplicate vtable type " + VTableType->str() +
+              " at the same location. Additional counters will be ignored.",
+          DS_Warning));
+      continue;
+    }
+  }
+  return sampleprof_error::success;
+}
+
+std::error_code
+SampleProfileReaderBinary::readCallsiteVTableProf(FunctionSamples &FProfile) {
+  assert(ReadVTableProf &&
+         "Cannot read vtable profiles if ReadVTableProf is false");
+
+  // Read the vtable type profile for the callsite.
+  auto NumCallsites = readNumber<uint32_t>();
+  if (std::error_code EC = NumCallsites.getError())
+    return EC;
+
+  for (uint32_t I = 0; I < *NumCallsites; ++I) {
+    auto LineOffset = readNumber<uint64_t>();
+    if (std::error_code EC = LineOffset.getError())
+      return EC;
+
+    if (!isOffsetLegal(*LineOffset))
+      return sampleprof_error::illegal_line_offset;
+
+    auto Discriminator = readNumber<uint64_t>();
+    if (std::error_code EC = Discriminator.getError())
+      return EC;
+
+    // Here we handle FS discriminators:
+    const uint32_t DiscriminatorVal = (*Discriminator) & getDiscriminatorMask();
+
+    if (std::error_code EC = readVTableTypeCountMap(FProfile.getTypeSamplesAt(
+            LineLocation(*LineOffset, DiscriminatorVal))))
+      return EC;
+  }
+  return sampleprof_error::success;
+}
+
 std::error_code
 SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
   auto NumSamples = readNumber<uint64_t>();
@@ -678,6 +789,9 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
       return EC;
   }
 
+  if (ReadVTableProf)
+    return readCallsiteVTableProf(FProfile);
+
   return sampleprof_error::success;
 }
 
@@ -740,6 +854,8 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
       FunctionSamples::ProfileIsPreInlined = ProfileIsPreInlined = true;
     if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
       FunctionSamples::ProfileIsFS = ProfileIsFS = true;
+    if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagHasVTableTypeProf))
+      ReadVTableProf = true;
     break;
   case SecNameTable: {
     bool FixedLengthMD5 =
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 9173a0f94f69d..e5f31348578b8 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -41,6 +41,11 @@
 using namespace llvm;
 using namespace sampleprof;
 
+// To begin with, make this option off by default.
+static cl::opt<bool> ExtBinaryWriteVTableTypeProf(
+    "extbinary-write-vtable-type-prof", cl::init(false), cl::Hidden,
+    cl::desc("Write vtable type profile in ext-binary sample profile writer"));
+
 namespace llvm {
 namespace support {
 namespace endian {
@@ -435,6 +440,9 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
     addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagIsPreInlined);
   if (Type == SecProfSummary && FunctionSamples::ProfileIsFS)
     addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator);
+  if (Type == SecProfSummary && ExtBinaryWriteVTableTypeProf)
+    addSectionFlag(SecProfSummary,
+                   SecProfSummaryFlags::SecFlagHasVTableTypeProf);
 
   uint64_t SectionStart = markSectionStart(Type, LayoutIdx);
   switch (Type) {
@@ -478,6 +486,12 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
   return sampleprof_error::success;
 }
 
+SampleProfileWriterExtBinary::SampleProfileWriterExtBinary(
+    std::unique_ptr<raw_ostream> &OS)
+    : SampleProfileWriterExtBinaryBase(OS) {
+  WriteVTableProf = ExtBinaryWriteVTableTypeProf;
+}
+
 std::error_code SampleProfileWriterExtBinary::writeDefaultLayout(
     const SampleProfileMap &ProfileMap) {
   // The const indices passed to writeOneSection below are specifying the
@@ -587,6 +601,19 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) {
       OS << " " << J.first << ":" << J.second;
     OS << "\n";
     LineCount++;
+
+    if (const TypeCountMap *Map = S.findCallsiteTypeSamplesAt(Loc);
+        Map && !Map->empty()) {
+      OS.indent(Indent + 1);
+      Loc.print(OS);
+      OS << ": ";
+      OS << kVTableProfPrefix;
+      for (const auto [TypeName, Count] : *Map) {
+        OS << TypeName << ":" << Count << " ";
+      }
+      OS << "\n";
+      LineCount++;
+    }
   }
 
   SampleSorter<LineLocation, FunctionSamplesMap> SortedCallsiteSamples(
@@ -603,7 +630,21 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) {
       if (std::error_code EC = writeSample(CalleeSamples))
         return EC;
     }
+
+    if (const TypeCountMap *Map = S.findCallsiteTypeSamplesAt(Loc);
+        Map && !Map->empty()) {
+      OS.indent(Indent);
+      Loc.print(OS);
+      OS << ": ";
+      OS << kVTableProfPrefix;
+      for (const auto [TypeId, Count] : *Map) {
+        OS << TypeId << ":" << Count << " ";
+      }
+      OS << "\n";
+      LineCount++;
+    }
   }
+
   Indent -= 1;
 
   if (FunctionSamples::ProfileIsProbeBased) {
@@ -663,6 +704,17 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
       addName(CalleeSamples.getFunction());
       addNames(CalleeSamples);
     }
+
+  if (!WriteVTableProf)
+    return;
+  // Add all the vtable names to NameTable.
+  for (const auto &VTableAccessCountMap :
+       llvm::make_second_range(S.getCallsiteTypeCounts())) {
+    // Add type name to NameTable.
+    for (const auto Type : llvm::make_first_range(VTableAccessCountMap)) {
+      addName(Type);
+    }
+  }
 }
 
 void SampleProfileWriterExtBinaryBase::addContext(
@@ -801,6 +853,22 @@ std::error_code SampleProfileWriterExtBinaryBase::writeHeader(
   return sampleprof_error::success;
 }
 
+std::error_code SampleProfileWriterBinary::writeCallsiteVTableProf(
+    const CallsiteTypeMap &CallsiteTypeMap, raw_ostream &OS) {
+  assert(WriteVTableProf &&
+         "writeCallsiteVTableProf should not be called if WriteVTableProf is "
+         "false");
+
+  encodeULEB128(CallsiteTypeMap.size(), OS);
+  for (const auto &[Loc, TypeMap] : CallsiteTypeMap) {
+    Loc.serialize(OS);
+    if (std::error_code EC = serializeTypeMap(TypeMap, getNameTable(), OS))
+      return EC;
+  }
+
+  return sampleprof_error::success;
+}
+
 std::error_code SampleProfileWriterBinary::writeSummary() {
   auto &OS = *OutputStream;
   encodeULEB128(Summary->getTotalCount(), OS);
@@ -838,14 +906,16 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
   for (const auto &J : S.getCallsiteSamples())
     NumCallsites += J.second.size();
   encodeULEB128(NumCallsites, OS);
-  for (const auto &[Loc, CalleeFunctionSampleMap] : S.getCallsiteSamples())
-    for (const auto &FunctionSample :
-         llvm::make_second_range(CalleeFunctionSampleMap)) {
-      Loc.serialize(OS);
-      if (std::error_code EC = writeBody(FunctionSample))
+  for (const auto &J : S.getCallsiteSamples())
+    for (const auto &FS : J.second) {
+      J.first.serialize(OS);
+      if (std::error_code EC = writeBody(FS.second))
         return EC;
     }
 
+  if (WriteVTableProf)
+    return writeCallsiteVTableProf(S.getCallsiteTypeCounts(), OS);
+
   return sampleprof_error::success;
 }
 
diff --git a/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected b/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected
new file mode 100644
index 0000000000000..f7e7499a2c781
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list-ext.expected
@@ -0,0 +1,44 @@
+Function: main: 368038, 0, 7 sampled lines
+Samples collected in the function's body {
+  4: 1068
+  4.2: 1068
+  5: 2150
+  5.1: 2150
+  6: 4160
+  7: 1068
+  9: 4128, calls: _Z3bari:2942 _Z3fooi:1262
+  9: vtables: _ZTVbar:2942 _ZTVfoo:1260
+}
+Samples collected in inlined callsites {
+  10: inlined callee: inline1: 2000, 0, 1 sampled lines
+    Samples collected in the function's body {
+      1: 2000
+    }
+    No inlined callsites in this function
+  10: inlined callee: inline2: 4000, 0, 1 sampled lines
+    Samples collected in the function's body {
+      1: 4000
+    }
+    No inlined callsites in this function
+  10: vtables: _ZTVinline1:2000 _ZTVinline2:4000
+}
+Function: _Z3bari: 40602, 2874, 1 sampled lines
+Samples collected in the function's body {
+  1: 2874
+}
+No inlined callsites in this function
+Function: _Z3fooi: 15422, 1220, 1 sampled lines
+Samples collected in the function's body {
+  1: 1220
+}
+No inlined callsites in this function
+======== Dump profile symbol list ========
+_Z3goov
+_Z3sumii
+__libc_csu_fini
+__libc_csu_init
+_dl_relocate_static_pie
+_fini
+_init
+_start
+main
diff --git a/llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext b/llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext
new file mode 100644
index 0000000000000..100133fa17ccb
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/Inputs/sample-profile-ext.proftext
@@ -0,0 +1,18 @@
+main:184019:0
+ 4: 534
+ 4.2: 534
+ 5: 1075
+ 5.1: 1075
+ 6: 2080
+ 7: 534
+ 9: 2064 _Z3bari:1471 _Z3fooi:631
+ 9: vtables _ZTVbar:1471 _ZTVfoo:630
+ 10: inline1:1000
+  1: 1000
+ 10: inline2:2000
+  1: 2000
+ 10: vtables _ZTVinline1:1000 _ZTVinline2:2000
+_Z3bari:20301:1437
+ 1: 1437
+_Z3fooi:7711:610
+ 1: 610
diff --git a/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test b/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test
index b445695c8e8e4..8383bcc1a2fbe 100644
--- a/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test
+++ b/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test
@@ -4,3 +4,12 @@ REQUIRES: zlib
 ; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections %t.1.output %t.2.output -o %t.3.output
 ; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output
 ; RUN: diff -b %S/Inputs/profile-symbol-list.expected %t.4.output
+
+;; Generate two SampleFDO binary profiles and merge them.
+;; Tests that the vtable counters in the merged profile are the aggregated
+;; result from both sources.
+; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-1.text %S/Inputs/sample-profile-ext.proftext -o %t.1.output
+; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-2.text %S/Inputs/sample-profile-ext.proftext -o %t.2.output
+; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -extbinary-write-vtable-type-prof %t.1.output %t.2.output -o %t.3.output
+; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output
+; RUN: diff -b %S/Inputs/profile-symbol-list-ext.expected %t.4.output
diff --git a/llvm/test/tools/llvm-profdata/profile-symbol-list.test b/llvm/test/tools/llvm-profdata/profile-symbol-list.test
index 39dcd11ec1db7..6845531066c76 100644
--- a/llvm/test/tools/llvm-profdata/profile-symbol-list.test
+++ b/llvm/test/tools/llvm-profdata/profile-symbol-list.test
@@ -7,3 +7,12 @@
 ; RUN: llvm-profdata show -sample -show-sec-info-only %t.5.output  | FileCheck %s -check-prefix=NOSYMLIST
 
 ; NOSYMLIST: ProfileSymbolListSection {{.*}} Size: 0
+
+;; Generate two SampleFDO binary profiles and merge them.
+;; Tests that the vtable counters in the merged profile are the aggregated
+;; result from both sources.
+; RUN: llvm-profdata merge -sample -extbinary -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-1.text %S/Inputs/sample-profile-ext.proftext -o %t.1.output
+; RUN: llvm-profdata merge -sample -extbinary -extbinary-write-vtable-type-prof -prof-sym-list=%S/Inputs/profile-symbol-list-2.text %S/Inputs/sample-profile-ext.proftext -o %t.2.output
+; RUN: llvm-profdata merge -sample -extbinary -extbinary-write-vtable-type-prof %t.1.output %t.2.output -o %t.3.output
+; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output
+; RUN: diff -b %S/Inputs/profile-symbol-list-ext.expected %t.4.output
diff --git a/llvm/test/tools/llvm-profdata/roundtrip.test b/llvm/test/tools/llvm-profdata/roundtrip.test
index 7af76e0a58224..eb55534763877 100644
--- a/llvm/test/tools/llvm-profdata/roundtrip.test
+++ b/llvm/test/tools/llvm-profdata/roundtrip.test
@@ -16,3 +16,9 @@ RUN: llvm-profdata merge --sample --binary -output=%t.4.profdata %S/Inputs/sampl
 RUN: llvm-profdata merge --sample --extbinary -output=%t.5.profdata %t.4.profdata
 RUN: llvm-profdata merge --sample --text -output=%t.4.proftext %t.5.profdata
 RUN: diff -b %t.4.proftext %S/Inputs/sample-profile.proftext
+# Round trip from text --> extbinary --> text.
+# The vtable profile is supported by ext-binary profile but not raw binary profile format,
+# so we don't use raw binary profile format in this roundtrip.
+RUN: llvm-profdata merge --sample --extbinary -extbinary-write-vtable-type-prof --output=%t.5.profdata %S/Inputs/sample-profile-ext.proftext
+RUN: llvm-profdata merge --sample --text --output=%t.5.proftext %t.5.profdata
+RUN: diff -b %t.5.proftext %S/Inputs/sample-profile-ext.proftext

>From f32874f77b5a6065a705ffc35b48bff1545cd6cd Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 16:09:39 -0700
Subject: [PATCH 15/39] [LegalizeIntegerTypes] Use getShiftAmountConstant.

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 9e85f08abb766..87570e6f44a6f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5254,9 +5254,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
     SDValue MulLo, MulHi;
     TLI.forceExpandWideMUL(DAG, dl, /*Signed=*/true, N->getOperand(0),
                            N->getOperand(1), MulLo, MulHi);
-    SDValue SRA =
-        DAG.getNode(ISD::SRA, dl, VT, MulLo,
-                    DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+    SDValue SRA = DAG.getNode(
+        ISD::SRA, dl, VT, MulLo,
+        DAG.getShiftAmountConstant(VT.getScalarSizeInBits() - 1, VT, dl));
     SDValue Overflow =
         DAG.getSetCC(dl, N->getValueType(1), MulHi, SRA, ISD::SETNE);
     SplitInteger(MulLo, Lo, Hi);

>From bac9e463b1f77b7354fe68c87d58be67e3294806 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 12 Sep 2025 16:15:31 -0700
Subject: [PATCH 16/39] [NFC][CodeGen][CFI] Extract
 CreateMetadataIdentifierForFnType (#158189)

For #158193
---
 clang/lib/CodeGen/CGExpr.cpp        | 7 ++-----
 clang/lib/CodeGen/CodeGenModule.cpp | 7 +++++++
 clang/lib/CodeGen/CodeGenModule.h   | 3 +++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index e8456a44f8367..e6e4947882544 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6496,11 +6496,8 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
     SanitizerDebugLocation SanScope(this, {CheckOrdinal}, CheckHandler);
     EmitSanitizerStatReport(llvm::SanStat_CFI_ICall);
 
-    llvm::Metadata *MD;
-    if (CGM.getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
-      MD = CGM.CreateMetadataIdentifierGeneralized(QualType(FnType, 0));
-    else
-      MD = CGM.CreateMetadataIdentifierForType(QualType(FnType, 0));
+    llvm::Metadata *MD =
+        CGM.CreateMetadataIdentifierForFnType(QualType(FnType, 0));
 
     llvm::Value *TypeId = llvm::MetadataAsValue::get(getLLVMContext(), MD);
 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index a16dfb52f4d90..d45fb823d4c35 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -7934,6 +7934,13 @@ CodeGenModule::CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
   return InternalId;
 }
 
+llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForFnType(QualType T) {
+  assert(isa<FunctionType>(T));
+  if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
+    return CreateMetadataIdentifierGeneralized(T);
+  return CreateMetadataIdentifierForType(T);
+}
+
 llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForType(QualType T) {
   return CreateMetadataIdentifierImpl(T, MetadataIdMap, "");
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index f62350fd8d378..8b1ac2d976c5e 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1623,6 +1623,9 @@ class CodeGenModule : public CodeGenTypeCache {
   /// Generate a KCFI type identifier for T.
   llvm::ConstantInt *CreateKCFITypeId(QualType T, StringRef Salt);
 
+  /// Create a metadata identifier for the given function type.
+  llvm::Metadata *CreateMetadataIdentifierForFnType(QualType T);
+
   /// Create a metadata identifier for the given type. This may either be an
   /// MDString (for external identifiers) or a distinct unnamed MDNode (for
   /// internal identifiers).

>From 8ac67aa8a9ef0012a619e1395a23a04cbea3abe9 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 12 Sep 2025 16:38:21 -0700
Subject: [PATCH 17/39] [NFC][CFI][CodeGen] Move GeneralizeFunctionType out of
 CreateMetadataIdentifierGeneralized (#158190)

For #158193
---
 clang/lib/CodeGen/CodeGenModule.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d45fb823d4c35..a650f27f977c9 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3041,9 +3041,12 @@ void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD,
   if (isa<CXXMethodDecl>(FD) && !cast<CXXMethodDecl>(FD)->isStatic())
     return;
 
-  llvm::Metadata *MD = CreateMetadataIdentifierForType(FD->getType());
+  QualType FnType = FD->getType();
+  llvm::Metadata *MD = CreateMetadataIdentifierForType(FnType);
   F->addTypeMetadata(0, MD);
-  F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(FD->getType()));
+
+  QualType GenPtrFnType = GeneralizeFunctionType(getContext(), FD->getType());
+  F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(GenPtrFnType));
 
   // Emit a hash-based bit set entry for cross-DSO calls.
   if (CodeGenOpts.SanitizeCfiCrossDso)
@@ -7936,8 +7939,10 @@ CodeGenModule::CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
 
 llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForFnType(QualType T) {
   assert(isa<FunctionType>(T));
-  if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
+  if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers) {
+    T = GeneralizeFunctionType(getContext(), T);
     return CreateMetadataIdentifierGeneralized(T);
+  }
   return CreateMetadataIdentifierForType(T);
 }
 
@@ -7951,8 +7956,8 @@ CodeGenModule::CreateMetadataIdentifierForVirtualMemPtrType(QualType T) {
 }
 
 llvm::Metadata *CodeGenModule::CreateMetadataIdentifierGeneralized(QualType T) {
-  return CreateMetadataIdentifierImpl(GeneralizeFunctionType(getContext(), T),
-                                      GeneralizedMetadataIdMap, ".generalized");
+  return CreateMetadataIdentifierImpl(T, GeneralizedMetadataIdMap,
+                                      ".generalized");
 }
 
 /// Returns whether this module needs the "all-vtables" type identifier.

>From 120d7475d35fc16b25c9d7c9b05e0ba44cca6449 Mon Sep 17 00:00:00 2001
From: Alex Langford <alangford at apple.com>
Date: Fri, 12 Sep 2025 16:38:29 -0700
Subject: [PATCH 18/39] [lldb] Change directory creation logic in
 framework-header-fix (#158355)

It's possible for this logic to fail if the build system runs this
script in parallel. One instance could create the directory in between
another instance's checking of its existence and attempt at creation.

Instead, always try to create it and ignore any FileExistsErrors.

rdar://160120161
---
 lldb/scripts/framework-header-fix.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
index 36c5c67c59d36..3447dfc29a761 100755
--- a/lldb/scripts/framework-header-fix.py
+++ b/lldb/scripts/framework-header-fix.py
@@ -115,8 +115,10 @@ def main():
         unifdef_guards = ["-U" + guard for guard in args.unifdef_guards]
 
     # Create the framework's header dir if it doesn't already exist
-    if not os.path.exists(os.path.dirname(output_file_path)):
+    try:
         os.makedirs(os.path.dirname(output_file_path))
+    except FileExistsError:
+        pass
 
     if framework_version == "lldb_main":
         modify_main_includes(input_file_path, output_file_path)

>From 9ac1f3420db82d7446449b8dd1e4ff07f93e7176 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <96096277+nirvedhmeshram at users.noreply.github.com>
Date: Fri, 12 Sep 2025 18:59:58 -0500
Subject: [PATCH 19/39] [Linalg] Fix bug in control function logic of push down
 extract pattern (#158348)

The current logic bails out if the first extract producer fails the
control function. This PR fixes that by checking every extract producer
and using the first one that passes the control function.

Signed-off-by: Nirvedh Meshram <nirvedh at gmail.com>
---
 .../Transforms/DataLayoutPropagation.cpp      | 36 ++++++++++++-------
 .../Linalg/data-layout-propagation.mlir       | 30 ++++++++++++++++
 .../Linalg/TestDataLayoutPropagation.cpp      |  9 +++--
 3 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
index ed2efd6fea5f7..6c17c3c2d0cab 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp
@@ -1245,21 +1245,21 @@ struct SliceDimInfo {
   OpFoldResult outputSize;
 };
 
-/// Return the first input extract slice operand, if present, for the current
+/// Return all extract slice operands, if present, for the current
 /// generic op.
-static FailureOr<OpOperand *> getSliceOperand(GenericOp genericOp) {
-  OpOperand *sliceOperand = nullptr;
+static FailureOr<SmallVector<OpOperand *>>
+getSliceOperands(GenericOp genericOp) {
+  SmallVector<OpOperand *> sliceOperands;
   for (auto operand : genericOp.getDpsInputOperands()) {
     auto extractOp = operand->get().getDefiningOp<tensor::ExtractSliceOp>();
     if (!extractOp)
       continue;
-    sliceOperand = operand;
-    break;
+    sliceOperands.push_back(operand);
   }
-  if (!sliceOperand) {
+  if (sliceOperands.empty()) {
     return failure();
   }
-  return sliceOperand;
+  return sliceOperands;
 }
 
 // Return a map of dims that have partial slices on them so that other operands
@@ -1336,14 +1336,24 @@ pushDownExtractSliceOpThroughGenericOp(RewriterBase &rewriter,
         genericOp,
         "propagation through generic with gather semantics is unsupported.");
   // Collect the sliced operand, if present.
-  auto maybeSliceOperand = getSliceOperand(genericOp);
-  if (failed(maybeSliceOperand))
+  auto maybeSliceOperands = getSliceOperands(genericOp);
+  if (failed(maybeSliceOperands))
     return failure();
-  OpOperand *sliceOperand = *maybeSliceOperand;
-  unsigned OperandIndex = sliceOperand->getOperandNumber();
-
-  if (!controlFn(sliceOperand))
+  SmallVector<OpOperand *> sliceOperands = *maybeSliceOperands;
+  OpOperand *sliceOperand;
+
+  bool foundValidOperand = false;
+  for (auto currSliceOperand : sliceOperands) {
+    if (controlFn(currSliceOperand)) {
+      sliceOperand = currSliceOperand;
+      foundValidOperand = true;
+      break;
+    }
+  }
+  if (!foundValidOperand) {
     return failure();
+  }
+  unsigned OperandIndex = sliceOperand->getOperandNumber();
 
   tensor::ExtractSliceOp producerSliceOp =
       sliceOperand->get().getDefiningOp<tensor::ExtractSliceOp>();
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
index fb16e1e7dcda4..a5f8d63a3e912 100644
--- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
+++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -1577,3 +1577,33 @@ func.func @push_extract_through_generic_rank0_operand(%arg0: tensor<128x128xf32>
 // CHECK:         %[[GENERIC:.+]] = linalg.generic
 // CHECK:         %[[EXTRACT:.+]] = tensor.extract_slice %[[GENERIC]]         
 // CHECK:         return %[[EXTRACT]]
+
+// -----
+// Test that if one extract doesn't pass the control function, which in this case is set to
+// only allow extracts from the same block, then an extract from a later operand can still be pushed
+// down.
+func.func @push_extract_through_generic_secondextract(%arg0: tensor<128x128xf32>, %arg1: tensor<?x?xbf16>, %arg2: index) -> tensor<?x?xbf16> {
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %extracted_slice1 = tensor.extract_slice %arg0[%arg2, %arg2] [%arg2, %arg2] [1, 1] : tensor<128x128xf32> to tensor<?x?xf32>
+  %for = scf.for %arg3 = %c0 to %c32 step %arg2 iter_args(%arg4 = %arg1) -> tensor<?x?xbf16> {
+    %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg2] [%arg2, %arg2] [1, 1] : tensor<128x128xf32> to tensor<?x?xf32>
+    %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d0, d1)> ,affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice1, %extracted_slice : tensor<?x?xf32>,  tensor<?x?xf32>) outs(%arg1 : tensor<?x?xbf16>) {
+    ^bb0(%in: f32, %in_1 : f32, %out: bf16):
+      %1 = arith.truncf %in : f32 to bf16
+      linalg.yield %1 : bf16
+    } -> tensor<?x?xbf16>
+    scf.yield %0 : tensor<?x?xbf16>
+  }
+ return %for : tensor<?x?xbf16>
+}
+
+// CHECK-LABEL: func.func @push_extract_through_generic_secondextract
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK:         %[[EXTRACT:.+]] = tensor.extract_slice
+// CHECK:         %[[FOR:.+]] = scf.for
+// CHECK:           %[[PAD:.+]] = tensor.pad %[[EXTRACT]]
+// CHECK:           %[[GENERIC:.+]] = linalg.generic
+// CHECK-SAME:        ins(%[[PAD]], %[[ARG0]]
+// CHECK:           %[[EXTRACT2:.+]] =  tensor.extract_slice %[[GENERIC]]
+// CHECK:           scf.yield %[[EXTRACT2]]
diff --git a/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp b/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp
index 2cf25d8fc8c19..d332270468ea8 100644
--- a/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp
@@ -34,8 +34,13 @@ struct TestDataLayoutPropagationPass
     RewritePatternSet patterns(context);
     linalg::populateDataLayoutPropagationPatterns(
         patterns, [](OpOperand *opOperand) { return true; });
-    linalg::populateExtractSliceSinkingPatterns(
-        patterns, [](OpOperand *opOperand) { return true; });
+    linalg::ControlPropagationFn controlExtract =
+        [](OpOperand *opOperand) -> bool {
+      Operation *producer = opOperand->get().getDefiningOp();
+      Operation *consumer = opOperand->getOwner();
+      return consumer->getBlock() == producer->getBlock();
+    };
+    linalg::populateExtractSliceSinkingPatterns(patterns, controlExtract);
     if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
       return signalPassFailure();
   }

>From 1cbdb7370fd62b17762d1dfe19a471a70ae8b137 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Fri, 12 Sep 2025 14:32:12 -0700
Subject: [PATCH 20/39] Reapply "[lit] Implement ulimit builtin"

This reverts commit 330068a74bfb6333f9016e3c4053eeaf4989d601.

This was causing some test failures on macOS that are now fixed in the reland.
These failures were related to calling ulimit -v despite XNU not having support
for that option. This patch simply disables the test on non-Linux platforms for
now until we can have a Linux-specific test for ulimit -v.
---
 llvm/utils/lit/lit/TestRunner.py              | 38 ++++++++++++++++++-
 .../builtin_commands/_launch_with_limit.py    | 25 ++++++++++++
 .../lit/tests/Inputs/shtest-ulimit/lit.cfg    |  8 ++++
 .../Inputs/shtest-ulimit/print_limits.py      |  4 ++
 .../Inputs/shtest-ulimit/ulimit-bad-arg.txt   |  1 +
 .../Inputs/shtest-ulimit/ulimit_okay.txt      |  5 +++
 llvm/utils/lit/tests/shtest-ulimit.py         | 24 ++++++++++++
 7 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt
 create mode 100644 llvm/utils/lit/tests/shtest-ulimit.py

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index a769919558a47..90c2c6479b004 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -92,11 +92,12 @@ class ShellEnvironment(object):
     we maintain a dir stack for pushd/popd.
     """
 
-    def __init__(self, cwd, env, umask=-1):
+    def __init__(self, cwd, env, umask=-1, ulimit={}):
         self.cwd = cwd
         self.env = dict(env)
         self.umask = umask
         self.dirStack = []
+        self.ulimit = ulimit
 
     def change_dir(self, newdir):
         if os.path.isabs(newdir):
@@ -595,6 +596,27 @@ def executeBuiltinUmask(cmd, shenv):
     return ShellCommandResult(cmd, "", "", 0, False)
 
 
+def executeBuiltinUlimit(cmd, shenv):
+    """executeBuiltinUlimit - Change the current limits."""
+    if os.name != "posix":
+        raise InternalShellError(cmd, "'ulimit' not supported on this system")
+    if len(cmd.args) != 3:
+        raise InternalShellError(cmd, "'ulimit' requires two arguments")
+    try:
+        new_limit = int(cmd.args[2])
+    except ValueError as err:
+        raise InternalShellError(cmd, "Error: 'ulimit': %s" % str(err))
+    if cmd.args[1] == "-v":
+        shenv.ulimit["RLIMIT_AS"] = new_limit * 1024
+    elif cmd.args[1] == "-n":
+        shenv.ulimit["RLIMIT_NOFILE"] = new_limit
+    else:
+        raise InternalShellError(
+            cmd, "'ulimit' does not support option: %s" % cmd.args[1]
+        )
+    return ShellCommandResult(cmd, "", "", 0, False)
+
+
 def executeBuiltinColon(cmd, cmd_shenv):
     """executeBuiltinColon - Discard arguments and exit with status 0."""
     return ShellCommandResult(cmd, "", "", 0, False)
@@ -749,6 +771,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
         "popd": executeBuiltinPopd,
         "pushd": executeBuiltinPushd,
         "rm": executeBuiltinRm,
+        "ulimit": executeBuiltinUlimit,
         "umask": executeBuiltinUmask,
         ":": executeBuiltinColon,
     }
@@ -914,6 +937,19 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
         if kIsWindows:
             args = quote_windows_command(args)
 
+        # Handle any resource limits. We do this by launching the command with
+        # a wrapper that sets the necessary limits. We use a wrapper rather than
+        # setting the limits in process as we cannot reraise the limits back to
+        # their defaults without elevated permissions.
+        if cmd_shenv.ulimit:
+            executable = sys.executable
+            args.insert(0, sys.executable)
+            args.insert(1, os.path.join(builtin_commands_dir, "_launch_with_limit.py"))
+            for limit in cmd_shenv.ulimit:
+                cmd_shenv.env["LIT_INTERNAL_ULIMIT_" + limit] = str(
+                    cmd_shenv.ulimit[limit]
+                )
+
         try:
             # TODO(boomanaiden154): We currently wrap the subprocess.Popen with
             # os.umask as the umask argument in subprocess.Popen is not
diff --git a/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py
new file mode 100644
index 0000000000000..33d2d59ff0dbe
--- /dev/null
+++ b/llvm/utils/lit/lit/builtin_commands/_launch_with_limit.py
@@ -0,0 +1,25 @@
+import sys
+import subprocess
+import resource
+import os
+
+ULIMIT_ENV_VAR_PREFIX = "LIT_INTERNAL_ULIMIT_"
+
+
+def main(argv):
+    command_args = argv[1:]
+    for env_var in os.environ:
+        if env_var.startswith(ULIMIT_ENV_VAR_PREFIX):
+            limit_str = env_var[len(ULIMIT_ENV_VAR_PREFIX) :]
+            limit_value = int(os.environ[env_var])
+            limit = (limit_value, limit_value)
+            if limit_str == "RLIMIT_AS":
+                resource.setrlimit(resource.RLIMIT_AS, limit)
+            elif limit_str == "RLIMIT_NOFILE":
+                resource.setrlimit(resource.RLIMIT_NOFILE, limit)
+    process_output = subprocess.run(command_args)
+    sys.exit(process_output.returncode)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg
new file mode 100644
index 0000000000000..c7bdc7e7b6bc0
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/lit.cfg
@@ -0,0 +1,8 @@
+import lit.formats
+
+config.name = "shtest-ulimit"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest(execute_external=False)
+config.test_source_root = None
+config.test_exec_root = None
+config.substitutions.append(("%{python}", '"%s"' % (sys.executable)))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py
new file mode 100644
index 0000000000000..632f954fa8fde
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/print_limits.py
@@ -0,0 +1,4 @@
+import resource
+
+print("RLIMIT_AS=" + str(resource.getrlimit(resource.RLIMIT_AS)[0]))
+print("RLIMIT_NOFILE=" + str(resource.getrlimit(resource.RLIMIT_NOFILE)[0]))
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt
new file mode 100644
index 0000000000000..efa22881047e9
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit-bad-arg.txt
@@ -0,0 +1 @@
+# RUN: ulimit -n
diff --git a/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt
new file mode 100644
index 0000000000000..ad353b5d7c459
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/shtest-ulimit/ulimit_okay.txt
@@ -0,0 +1,5 @@
+# RUN: ulimit -v 1048576
+# RUN: ulimit -n 50
+# RUN: %{python} %S/print_limits.py
+# Fail the test so that we can assert on the output.
+# RUN: not echo return
diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py
new file mode 100644
index 0000000000000..b86578a21f661
--- /dev/null
+++ b/llvm/utils/lit/tests/shtest-ulimit.py
@@ -0,0 +1,24 @@
+# Check the ulimit command
+
+# ulimit does not work on non-POSIX platforms.
+# UNSUPPORTED: system-windows
+
+# TODO(boomanaiden154): The test fails on some non-Linux POSIX
+# platforms (like MacOS) due to the underlying system not supporting
+# ulimit -v. This test needs to be carved up so we keep full test
+# coverage on Linux and as much as possible on other platforms.
+# REQUIRES: system-linux
+
+# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit | FileCheck %s
+
+# CHECK: -- Testing: 2 tests{{.*}}
+
+# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit-bad-arg.txt ({{[^)]*}})
+# CHECK: ulimit -n
+# CHECK: 'ulimit' requires two arguments
+
+# CHECK-LABEL: FAIL: shtest-ulimit :: ulimit_okay.txt ({{[^)]*}})
+# CHECK: ulimit -v 1048576
+# CHECK: ulimit -n 50
+# CHECK: RLIMIT_AS=1073741824
+# CHECK: RLIMIT_NOFILE=50

>From a5bff94ffd1b81a3562f02f05980ee87cc4164df Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 12 Sep 2025 17:13:35 -0700
Subject: [PATCH 21/39] [NFC][CodeGen][CFI] Add GeneralizePointers parameter to
 GeneralizeFunctionType (#158191)

For #158193

---------

Co-authored-by: Alex Langford <alangford at apple.com>
---
 clang/lib/CodeGen/CodeGenModule.cpp | 44 +++++++++++++++++------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index a650f27f977c9..d25ce3165bd79 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2339,12 +2339,15 @@ llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) {
   return llvm::ConstantInt::get(Int64Ty, llvm::MD5Hash(MDS->getString()));
 }
 
-// Generalize pointer types to a void pointer with the qualifiers of the
-// originally pointed-to type, e.g. 'const char *' and 'char * const *'
-// generalize to 'const void *' while 'char *' and 'const char **' generalize to
-// 'void *'.
-static QualType GeneralizeType(ASTContext &Ctx, QualType Ty) {
-  if (!Ty->isPointerType())
+// If `GeneralizePointers` is true, generalizes pointer types to a void pointer
+// with the qualifiers of the originally pointed-to type, e.g. 'const char *'
+// and 'char * const *' generalize to 'const void *' while 'char *' and
+// 'const char **' generalize to 'void *'.
+static QualType GeneralizeType(ASTContext &Ctx, QualType Ty,
+                               bool GeneralizePointers) {
+  // TODO: Add other generalizations.
+
+  if (!GeneralizePointers || !Ty->isPointerType())
     return Ty;
 
   return Ctx.getPointerType(
@@ -2353,26 +2356,29 @@ static QualType GeneralizeType(ASTContext &Ctx, QualType Ty) {
 }
 
 // Apply type generalization to a FunctionType's return and argument types
-static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty) {
+static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty,
+                                       bool GeneralizePointers) {
   if (auto *FnType = Ty->getAs<FunctionProtoType>()) {
     SmallVector<QualType, 8> GeneralizedParams;
     for (auto &Param : FnType->param_types())
-      GeneralizedParams.push_back(GeneralizeType(Ctx, Param));
+      GeneralizedParams.push_back(
+          GeneralizeType(Ctx, Param, GeneralizePointers));
 
-    return Ctx.getFunctionType(GeneralizeType(Ctx, FnType->getReturnType()),
-                               GeneralizedParams, FnType->getExtProtoInfo());
+    return Ctx.getFunctionType(
+        GeneralizeType(Ctx, FnType->getReturnType(), GeneralizePointers),
+        GeneralizedParams, FnType->getExtProtoInfo());
   }
 
   if (auto *FnType = Ty->getAs<FunctionNoProtoType>())
     return Ctx.getFunctionNoProtoType(
-        GeneralizeType(Ctx, FnType->getReturnType()));
+        GeneralizeType(Ctx, FnType->getReturnType(), GeneralizePointers));
 
   llvm_unreachable("Encountered unknown FunctionType");
 }
 
 llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T, StringRef Salt) {
-  if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
-    T = GeneralizeFunctionType(getContext(), T);
+  T = GeneralizeFunctionType(
+      getContext(), T, getCodeGenOpts().SanitizeCfiICallGeneralizePointers);
   if (auto *FnType = T->getAs<FunctionProtoType>())
     T = getContext().getFunctionType(
         FnType->getReturnType(), FnType->getParamTypes(),
@@ -3041,11 +3047,13 @@ void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD,
   if (isa<CXXMethodDecl>(FD) && !cast<CXXMethodDecl>(FD)->isStatic())
     return;
 
-  QualType FnType = FD->getType();
+  QualType FnType = GeneralizeFunctionType(getContext(), FD->getType(),
+                                           /*GeneralizePointers=*/false);
   llvm::Metadata *MD = CreateMetadataIdentifierForType(FnType);
   F->addTypeMetadata(0, MD);
 
-  QualType GenPtrFnType = GeneralizeFunctionType(getContext(), FD->getType());
+  QualType GenPtrFnType = GeneralizeFunctionType(getContext(), FD->getType(),
+                                                 /*GeneralizePointers=*/true);
   F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(GenPtrFnType));
 
   // Emit a hash-based bit set entry for cross-DSO calls.
@@ -7939,10 +7947,10 @@ CodeGenModule::CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
 
 llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForFnType(QualType T) {
   assert(isa<FunctionType>(T));
-  if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers) {
-    T = GeneralizeFunctionType(getContext(), T);
+  T = GeneralizeFunctionType(
+      getContext(), T, getCodeGenOpts().SanitizeCfiICallGeneralizePointers);
+  if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
     return CreateMetadataIdentifierGeneralized(T);
-  }
   return CreateMetadataIdentifierForType(T);
 }
 

>From ba3b3e3ac812ae30f12f92ee8c4a1c668cd9817e Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 12 Sep 2025 17:37:50 -0700
Subject: [PATCH 22/39] [NFC][CodeGen][CFI] Pre-commit transparent_union tests
 (#158192)

For #158193
---
 clang/test/CodeGen/cfi-icall-generalize.c | 16 ++++++++++++++++
 clang/test/CodeGen/cfi-icall-normalize2.c | 14 ++++++++++++++
 clang/test/CodeGen/kcfi-generalize.c      | 16 ++++++++++++++++
 clang/test/CodeGen/kcfi-normalize.c       | 14 ++++++++++++++
 4 files changed, 60 insertions(+)

diff --git a/clang/test/CodeGen/cfi-icall-generalize.c b/clang/test/CodeGen/cfi-icall-generalize.c
index 0af17e5760cc6..46d38511ba6b6 100644
--- a/clang/test/CodeGen/cfi-icall-generalize.c
+++ b/clang/test/CodeGen/cfi-icall-generalize.c
@@ -15,5 +15,21 @@ void g(int** (*fp)(const char *, const char **)) {
   fp(0, 0);
 }
 
+union Union {
+  char *c;
+  long *n;
+} __attribute__((transparent_union));
+
+// CHECK: define{{.*}} void @uni({{.*}} !type [[TYPE2:![0-9]+]] !type [[TYPE2_GENERALIZED:![0-9]+]]
+void uni(void (*fn)(union Union), union Union arg1) {
+  // UNGENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFv5UnionE")
+  // GENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFv5UnionE.generalized")
+    fn(arg1);
+}
+
 // CHECK: [[TYPE]] = !{i64 0, !"_ZTSFPPiPKcPS2_E"}
 // CHECK: [[TYPE_GENERALIZED]] = !{i64 0, !"_ZTSFPvPKvS_E.generalized"}
+
+// CHECK: [[TYPE2]] = !{i64 0, !"_ZTSFvPFv5UnionES_E"}
+// CHECK: [[TYPE2_GENERALIZED]] = !{i64 0, !"_ZTSFvPv5UnionE.generalized"}
+
diff --git a/clang/test/CodeGen/cfi-icall-normalize2.c b/clang/test/CodeGen/cfi-icall-normalize2.c
index 93893065cf903..5e457dc97f0a2 100644
--- a/clang/test/CodeGen/cfi-icall-normalize2.c
+++ b/clang/test/CodeGen/cfi-icall-normalize2.c
@@ -24,6 +24,20 @@ void baz(void (*fn)(int, int, int), int arg1, int arg2, int arg3) {
     fn(arg1, arg2, arg3);
 }
 
+union Union {
+  char *c;
+  long *n;
+} __attribute__((transparent_union));
+
+void uni(void (*fn)(union Union), union Union arg1) {
+    // CHECK-LABEL: define{{.*}}uni
+    // CHECK-SAME: {{.*}}!type ![[TYPE4:[0-9]+]] !type !{{[0-9]+}}
+    // CHECK: call i1 @llvm.type.test({{i8\*|ptr}} {{%f|%0}}, metadata !"_ZTSFv5UnionE.normalized")
+    fn(arg1);
+}
+
 // CHECK: ![[TYPE1]] = !{i64 0, !"_ZTSFvPFvu3i32ES_E.normalized"}
 // CHECK: ![[TYPE2]] = !{i64 0, !"_ZTSFvPFvu3i32S_ES_S_E.normalized"}
 // CHECK: ![[TYPE3]] = !{i64 0, !"_ZTSFvPFvu3i32S_S_ES_S_S_E.normalized"}
+// CHECK: ![[TYPE4]] = !{i64 0, !"_ZTSFvPFv5UnionES_E.normalized"}
+
diff --git a/clang/test/CodeGen/kcfi-generalize.c b/clang/test/CodeGen/kcfi-generalize.c
index 4e32f4f35057c..864cdb8c2e092 100644
--- a/clang/test/CodeGen/kcfi-generalize.c
+++ b/clang/test/CodeGen/kcfi-generalize.c
@@ -26,8 +26,24 @@ void g(int** (*fp)(const char *, const char **)) {
   fp(0, 0);
 }
 
+union Union {
+  char *c;
+  long *n;
+} __attribute__((transparent_union));
+
+// CHECK: define{{.*}} void @uni({{.*}} !kcfi_type [[TYPE4:![0-9]+]]
+void uni(void (*fn)(union Union), union Union arg1) {
+  // UNGENERALIZED: call {{.*}} [ "kcfi"(i32 -1037059548) ]
+  // GENERALIZED: call {{.*}} [ "kcfi"(i32 422130955) ]
+    fn(arg1);
+}
+
 // UNGENERALIZED: [[TYPE]] = !{i32 1296635908}
 // GENERALIZED: [[TYPE]] = !{i32 -49168686}
 
 // UNGENERALIZED: [[TYPE3]] = !{i32 874141567}
 // GENERALIZED: [[TYPE3]] = !{i32 954385378}
+
+// UNGENERALIZED: [[TYPE4]] = !{i32 981319178}
+// GENERALIZED: [[TYPE4]] = !{i32 -1599950473}
+
diff --git a/clang/test/CodeGen/kcfi-normalize.c b/clang/test/CodeGen/kcfi-normalize.c
index b9150e88f6ab5..9291ff8529b31 100644
--- a/clang/test/CodeGen/kcfi-normalize.c
+++ b/clang/test/CodeGen/kcfi-normalize.c
@@ -28,7 +28,21 @@ void baz(void (*fn)(int, int, int), int arg1, int arg2, int arg3) {
     fn(arg1, arg2, arg3);
 }
 
+union Union {
+  char *c;
+  long *n;
+} __attribute__((transparent_union));
+
+void uni(void (*fn)(union Union), union Union arg1) {
+    // CHECK-LABEL: define{{.*}}uni
+    // CHECK-SAME: {{.*}}!kcfi_type ![[TYPE4:[0-9]+]]
+    // CHECK: call void %0(ptr %1) [ "kcfi"(i32 -1430221633) ]
+    fn(arg1);
+}
+
 // CHECK: ![[#]] = !{i32 4, !"cfi-normalize-integers", i32 1}
 // CHECK: ![[TYPE1]] = !{i32 -1143117868}
 // CHECK: ![[TYPE2]] = !{i32 -460921415}
 // CHECK: ![[TYPE3]] = !{i32 -333839615}
+// CHECK: ![[TYPE4]] = !{i32 1766237188}
+

>From 9af4a854602804430dc04766ce1be311259707d6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 13 Sep 2025 10:10:59 +0900
Subject: [PATCH 23/39] AMDGPU: Add test which shows unnecessary register
 alignment (#158168)

The b96 tr loads are a special case that does not require
even-aligned VGPRs.
---
 .../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll   | 66 +++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll | 54 +++++++++++++++
 2 files changed, 120 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index f504f2caa8632..3e96dfe40f745 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -158,3 +158,69 @@ entry:
   store <4 x bfloat> %val, ptr addrspace(1) %use
   ret void
 }
+
+; This is a special case that does not require aligned VGPRs. Make
+; sure no copies are required for the unaligned ABI return value.
+define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace(3) %ptr) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v4
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v1, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}
+
+define void @ds_read_b96_tr_b6_no_align2_requirement_agpr(ptr addrspace(3) %ptr) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, v1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, v2
+; GFX950-SDAG-NEXT:    ;;#ASMSTART
+; GFX950-SDAG-NEXT:    ; use a1 a2 a3
+; GFX950-SDAG-NEXT:    ;;#ASMEND
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX950-GISEL-NEXT:    v_accvgpr_write_b32 a2, v1
+; GFX950-GISEL-NEXT:    v_accvgpr_write_b32 a3, v2
+; GFX950-GISEL-NEXT:    ;;#ASMSTART
+; GFX950-GISEL-NEXT:    ; use a1 a2 a3
+; GFX950-GISEL-NEXT:    ;;#ASMEND
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+  %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+  %val0 = extractelement <3 x i32> %val, i32 0
+  %val1 = extractelement <3 x i32> %val, i32 1
+  %val2 = extractelement <3 x i32> %val, i32 2
+  call void asm sideeffect "; use $0 $1 $2", "{a1},{a2},{a3}"(i32 %val0, i32 %val1, i32 %val2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
index d91b03ca4461d..d9f2fc55709a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
@@ -320,3 +320,57 @@ entry:
   store <8 x bfloat> %val, ptr addrspace(1) %use
   ret void
 }
+
+; This is a special case that does not require aligned VGPRs. Make
+; sure no copies are required for the unaligned ABI return value.
+define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_vaddr_no_align2_requirement:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_tr6_b96 v[2:4], v[0:1], off offset:32
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}
+
+define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}
+
+define { i32, <3 x i32> } @ds_load_tr6_b96_no_align2_requirement(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: ds_load_tr6_b96_no_align2_requirement:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    ds_load_tr6_b96 v[2:4], v0 offset:32
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+  %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}

>From 1180c2ced008e33b0a4b2b91b3cb24724f06147c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Fri, 12 Sep 2025 21:11:17 -0400
Subject: [PATCH 24/39] [AMDGPU] Support lowering of cluster related
 intrinsics (#157978)

Since much of the code is interconnected, this also changes how the
workgroup ID is lowered.

Co-authored-by: Jay Foad <jay.foad at amd.com>
Co-authored-by: Ivan Kosarev <ivan.kosarev at amd.com>
---
 llvm/docs/AMDGPUUsage.rst                     |    7 +
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp |    8 +
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.h   |   19 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  221 ++-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |    8 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  211 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |    3 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |    2 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |    5 +
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |   19 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   48 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   44 +
 .../llvm.amdgcn.cluster.workgroup.id.ll       | 1258 +++++++++++++++++
 ...vm.amdgcn.cluster.workgroup.max.flat.id.ll |  194 +++
 .../llvm.amdgcn.cluster.workgroup.max.id.ll   | 1077 ++++++++++++++
 .../lower-work-group-id-intrinsics-hsa.ll     |    2 +-
 .../lower-work-group-id-intrinsics-opt.ll     |  390 +++++
 .../AMDGPU/lower-work-group-id-intrinsics.ll  |  376 +++++
 .../AMDGPU/reassoc-mul-add-1-to-mad.ll        |   26 +-
 .../AMDGPU/workgroup-id-in-arch-sgprs.ll      |  216 ++-
 21 files changed, 4100 insertions(+), 43 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 37563203f2f83..cef87e077cc5c 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1812,6 +1812,13 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                                       offset by one less than the number of dynamic VGPR blocks required
                                                       by the function encoded in bits 5..3.
 
+     "amdgpu-cluster-dims"="x,y,z"                    Specify the cluster workgroup dimensions. A value of "0,0,0" indicates that
+                                                      clusters are disabled. A value of "1024,1024,1024" indicates that clusters are enabled,
+                                                      but the dimensions cannot be determined at compile time. Any other value explicitly
+                                                      specifies the cluster dimensions.
+
+                                                      This is only relevant on targets with cluster support.
+
      ================================================ ==========================================================
 
 Calling Conventions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d158f0f58d711..dda8033f47398 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -107,6 +107,14 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
   case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
     return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
                       &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
+  case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
+    return std::tuple(nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
   case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
     return std::tuple(LDSKernelId ? &LDSKernelId : nullptr,
                       &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index e07d47381ecca..1064e57b9da9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -111,18 +111,25 @@ struct AMDGPUFunctionArgInfo {
     DISPATCH_ID         =  4,
     FLAT_SCRATCH_INIT   =  5,
     LDS_KERNEL_ID       =  6, // LLVM internal, not part of the ABI
-    WORKGROUP_ID_X      = 10,
-    WORKGROUP_ID_Y      = 11,
-    WORKGROUP_ID_Z      = 12,
+    WORKGROUP_ID_X      = 10, // Also used for cluster ID X.
+    WORKGROUP_ID_Y      = 11, // Also used for cluster ID Y.
+    WORKGROUP_ID_Z      = 12, // Also used for cluster ID Z.
     PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
     IMPLICIT_BUFFER_PTR = 15,
     IMPLICIT_ARG_PTR = 16,
     PRIVATE_SEGMENT_SIZE = 17,
+    CLUSTER_WORKGROUP_ID_X = 21,
+    CLUSTER_WORKGROUP_ID_Y = 22,
+    CLUSTER_WORKGROUP_ID_Z = 23,
+    CLUSTER_WORKGROUP_MAX_ID_X = 24,
+    CLUSTER_WORKGROUP_MAX_ID_Y = 25,
+    CLUSTER_WORKGROUP_MAX_ID_Z = 26,
+    CLUSTER_WORKGROUP_MAX_FLAT_ID = 27,
 
     // VGPRS:
-    WORKITEM_ID_X       = 18,
-    WORKITEM_ID_Y       = 19,
-    WORKITEM_ID_Z       = 20,
+    WORKITEM_ID_X       = 28,
+    WORKITEM_ID_Y       = 29,
+    WORKITEM_ID_Z       = 30,
     FIRST_VGPR_VALUE    = WORKITEM_ID_X
   };
   // clang-format on
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f18536cd4ab93..d8c4cbbc4fa33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4452,6 +4452,74 @@ void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
   }
 }
 
+bool AMDGPULegalizerInfo::legalizeWorkGroupId(
+    MachineInstr &MI, MachineIRBuilder &B,
+    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
+    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
+  Register DstReg = MI.getOperand(0).getReg();
+  if (!ST.hasClusters()) {
+    if (!loadInputValue(DstReg, B, WorkGroupIdPV))
+      return false;
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Clusters are supported. Return the global position in the grid. If clusters
+  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
+
+  // WorkGroupIdXYZ = ClusterId == 0 ?
+  //   ClusterIdXYZ :
+  //   ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
+  MachineRegisterInfo &MRI = *B.getMRI();
+  const LLT S32 = LLT::scalar(32);
+  Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
+  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
+  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
+  if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
+      !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
+      !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
+    return false;
+
+  auto One = B.buildConstant(S32, 1);
+  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
+  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
+                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
+
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+
+  switch (MFI->getClusterDims().getKind()) {
+  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
+  case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
+    B.buildCopy(DstReg, GlobalIdXYZ);
+    MI.eraseFromParent();
+    return true;
+  }
+  case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
+    B.buildCopy(DstReg, ClusterIdXYZ);
+    MI.eraseFromParent();
+    return true;
+  }
+  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
+    using namespace AMDGPU::Hwreg;
+    unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
+    Register ClusterId = MRI.createGenericVirtualRegister(S32);
+    MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
+    B.buildInstr(AMDGPU::S_GETREG_B32_const)
+        .addDef(ClusterId)
+        .addImm(ClusterIdField);
+    auto Zero = B.buildConstant(S32, 0);
+    auto NoClusters =
+        B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
+    B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
+    MI.eraseFromParent();
+    return true;
+  }
+  }
+
+  llvm_unreachable("nothing should reach here");
+}
+
 bool AMDGPULegalizerInfo::loadInputValue(
     Register DstReg, MachineIRBuilder &B,
     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
@@ -4471,8 +4539,31 @@ bool AMDGPULegalizerInfo::loadInputValue(
       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
   const ArgDescriptor WorkGroupIDZ =
       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+  const ArgDescriptor ClusterWorkGroupIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
+  const ArgDescriptor ClusterWorkGroupIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
+  const ArgDescriptor ClusterWorkGroupIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
+  const ArgDescriptor ClusterWorkGroupMaxIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
+  const ArgDescriptor ClusterWorkGroupMaxIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
+  const ArgDescriptor ClusterWorkGroupMaxIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
+  const ArgDescriptor ClusterWorkGroupMaxFlatID =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
+
+  auto LoadConstant = [&](unsigned N) {
+    B.buildConstant(DstReg, N);
+    return true;
+  };
+
   if (ST.hasArchitectedSGPRs() &&
       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+    AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
+    bool HasFixedDims = ClusterDims.isFixedDims();
+
     switch (ArgType) {
     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
       Arg = &WorkGroupIDX;
@@ -4489,6 +4580,53 @@ bool AMDGPULegalizerInfo::loadInputValue(
       ArgRC = &AMDGPU::SReg_32RegClass;
       ArgTy = LLT::scalar(32);
       break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
+      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
+        return LoadConstant(0);
+      Arg = &ClusterWorkGroupIDX;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
+      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
+        return LoadConstant(0);
+      Arg = &ClusterWorkGroupIDY;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
+      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
+        return LoadConstant(0);
+      Arg = &ClusterWorkGroupIDZ;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[0] - 1);
+      Arg = &ClusterWorkGroupMaxIDX;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[1] - 1);
+      Arg = &ClusterWorkGroupMaxIDY;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[2] - 1);
+      Arg = &ClusterWorkGroupMaxIDZ;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
+      Arg = &ClusterWorkGroupMaxFlatID;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
     default:
       break;
     }
@@ -4499,10 +4637,9 @@ bool AMDGPULegalizerInfo::loadInputValue(
 
   if (!Arg) {
     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
-      // The intrinsic may appear when we have a 0 sized kernarg segment, in which
-      // case the pointer argument may be missing and we use null.
-      B.buildConstant(DstReg, 0);
-      return true;
+      // The intrinsic may appear when we have a 0 sized kernarg segment, in
+      // which case the pointer argument may be missing and we use null.
+      return LoadConstant(0);
     }
 
     // It's undefined behavior if a function marked with the amdgpu-no-*
@@ -7415,6 +7552,22 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
+                                                 MachineIRBuilder &B,
+                                                 AMDGPU::Hwreg::Id HwReg,
+                                                 unsigned LowBit,
+                                                 unsigned Width) const {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  Register DstReg = MI.getOperand(0).getReg();
+  if (!MRI.getRegClassOrNull(DstReg))
+    MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
+  B.buildInstr(AMDGPU::S_GETREG_B32_const)
+      .addDef(DstReg)
+      .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
+  MI.eraseFromParent();
+  return true;
+}
+
 static constexpr unsigned FPEnvModeBitField =
     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
 
@@ -7577,14 +7730,64 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
   case Intrinsic::amdgcn_workgroup_id_x:
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+    return legalizeWorkGroupId(
+        MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
   case Intrinsic::amdgcn_workgroup_id_y:
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+    return legalizeWorkGroupId(
+        MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
   case Intrinsic::amdgcn_workgroup_id_z:
-    return legalizePreloadedArgIntrin(MI, MRI, B,
+    return legalizeWorkGroupId(
+        MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_cluster_id_x:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+  case Intrinsic::amdgcn_cluster_id_y:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+  case Intrinsic::amdgcn_cluster_id_z:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_cluster_workgroup_id_x:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
+  case Intrinsic::amdgcn_cluster_workgroup_id_y:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
+  case Intrinsic::amdgcn_cluster_workgroup_id_z:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
+    return ST.hasClusters() &&
+           legalizeConstHwRegRead(MI, B, AMDGPU::Hwreg::ID_IB_STS2, 21, 4);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
+  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
+    return ST.hasClusters() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B,
+               AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
   case Intrinsic::amdgcn_wave_id:
     return legalizeWaveID(MI, B);
   case Intrinsic::amdgcn_lds_kernel_id:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1f4e02b0d600a..cd44a9ba0807c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -114,6 +114,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   void buildLoadInputValue(Register DstReg, MachineIRBuilder &B,
                            const ArgDescriptor *Arg,
                            const TargetRegisterClass *ArgRC, LLT ArgTy) const;
+  bool legalizeWorkGroupId(
+      MachineInstr &MI, MachineIRBuilder &B,
+      AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
+      AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+      AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
 
@@ -218,6 +223,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
 
   bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
+  bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B,
+                              AMDGPU::Hwreg::Id HwReg, unsigned LowBit,
+                              unsigned Width) const;
 
   bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4927d2be67590..3332723b038f5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2443,6 +2443,53 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
   return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
 }
 
+SDValue SITargetLowering::lowerWorkGroupId(
+    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
+    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
+    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
+  if (!Subtarget->hasClusters())
+    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
+
+  // Clusters are supported. Return the global position in the grid. If clusters
+  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
+
+  // WorkGroupIdXYZ = ClusterId == 0 ?
+  //   ClusterIdXYZ :
+  //   ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
+  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
+  SDLoc SL(ClusterIdXYZ);
+  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
+  SDValue One = DAG.getConstant(1, SL, VT);
+  SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
+  SDValue ClusterWorkGroupIdXYZ =
+      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
+  SDValue GlobalIdXYZ =
+      DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
+                  DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
+
+  switch (MFI.getClusterDims().getKind()) {
+  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
+  case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
+    return GlobalIdXYZ;
+  case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
+    return ClusterIdXYZ;
+  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
+    using namespace AMDGPU::Hwreg;
+    SDValue ClusterIdField =
+        DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
+    SDNode *GetReg =
+        DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
+    SDValue ClusterId(GetReg, 0);
+    SDValue Zero = DAG.getConstant(0, SL, VT);
+    return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
+                       GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
+  }
+  }
+
+  llvm_unreachable("nothing should reach here");
+}
+
 SDValue SITargetLowering::getPreloadedValue(
     SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
     AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
@@ -2461,9 +2508,30 @@ SDValue SITargetLowering::getPreloadedValue(
       AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
   const ArgDescriptor WorkGroupIDZ =
       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+  const ArgDescriptor ClusterWorkGroupIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
+  const ArgDescriptor ClusterWorkGroupIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
+  const ArgDescriptor ClusterWorkGroupIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
+  const ArgDescriptor ClusterWorkGroupMaxIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
+  const ArgDescriptor ClusterWorkGroupMaxIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
+  const ArgDescriptor ClusterWorkGroupMaxIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
+  const ArgDescriptor ClusterWorkGroupMaxFlatID =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
+
+  auto LoadConstant = [&](unsigned N) {
+    return DAG.getConstant(N, SDLoc(), VT);
+  };
+
   if (Subtarget->hasArchitectedSGPRs() &&
-      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx ||
-       CC == CallingConv::AMDGPU_Gfx_WholeWave)) {
+      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+    AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
+    bool HasFixedDims = ClusterDims.isFixedDims();
+
     switch (PVID) {
     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
       Reg = &WorkGroupIDX;
@@ -2480,6 +2548,53 @@ SDValue SITargetLowering::getPreloadedValue(
       RC = &AMDGPU::SReg_32RegClass;
       Ty = LLT::scalar(32);
       break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
+      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
+        return LoadConstant(0);
+      Reg = &ClusterWorkGroupIDX;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
+      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
+        return LoadConstant(0);
+      Reg = &ClusterWorkGroupIDY;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
+      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
+        return LoadConstant(0);
+      Reg = &ClusterWorkGroupIDZ;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[0] - 1);
+      Reg = &ClusterWorkGroupMaxIDX;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[1] - 1);
+      Reg = &ClusterWorkGroupMaxIDY;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[2] - 1);
+      Reg = &ClusterWorkGroupMaxIDZ;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
+      Reg = &ClusterWorkGroupMaxFlatID;
+      RC = &AMDGPU::SReg_32RegClass;
+      Ty = LLT::scalar(32);
+      break;
     default:
       break;
     }
@@ -9539,6 +9654,19 @@ SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
                      DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
 }
 
+SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
+                                              AMDGPU::Hwreg::Id HwReg,
+                                              unsigned LowBit,
+                                              unsigned Width) const {
+  SDLoc SL(Op);
+  using namespace AMDGPU::Hwreg;
+  return {DAG.getMachineNode(
+              AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
+              DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
+                                    SL, MVT::i32)),
+          0};
+}
+
 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
                                           unsigned Dim,
                                           const ArgDescriptor &Arg) const {
@@ -9685,14 +9813,81 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
   case Intrinsic::amdgcn_workgroup_id_x:
-    return getPreloadedValue(DAG, *MFI, VT,
-                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+    return lowerWorkGroupId(DAG, *MFI, VT,
+                            AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+                            AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
+                            AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
   case Intrinsic::amdgcn_workgroup_id_y:
-    return getPreloadedValue(DAG, *MFI, VT,
-                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+    return lowerWorkGroupId(DAG, *MFI, VT,
+                            AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+                            AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
+                            AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
   case Intrinsic::amdgcn_workgroup_id_z:
-    return getPreloadedValue(DAG, *MFI, VT,
-                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+    return lowerWorkGroupId(DAG, *MFI, VT,
+                            AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+                            AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
+                            AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_cluster_id_x:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(DAG, *MFI, VT,
+                                   AMDGPUFunctionArgInfo::WORKGROUP_ID_X)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_id_y:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(DAG, *MFI, VT,
+                                   AMDGPUFunctionArgInfo::WORKGROUP_ID_Y)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_id_z:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(DAG, *MFI, VT,
+                                   AMDGPUFunctionArgInfo::WORKGROUP_ID_Z)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_id_x:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_id_y:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_id_z:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
+    return Subtarget->hasClusters()
+               ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
+               : SDValue();
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
+               : DAG.getPOISON(VT);
+  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
+    return Subtarget->hasClusters()
+               ? getPreloadedValue(
+                     DAG, *MFI, VT,
+                     AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
+               : DAG.getPOISON(VT);
   case Intrinsic::amdgcn_wave_id:
     return lowerWaveID(DAG, Op);
   case Intrinsic::amdgcn_lds_kernel_id: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 9c26cfa44a83e..ba408a8f64540 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -16,6 +16,7 @@
 
 #include "AMDGPUArgumentUsageInfo.h"
 #include "AMDGPUISelLowering.h"
+#include "SIDefines.h"
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
@@ -64,6 +65,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                               const SDLoc &SL, SDValue Chain,
                               const ISD::InputArg &Arg) const;
+  SDValue lowerWorkGroupId(
+      SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
+      AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
+      AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+      AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
   SDValue getPreloadedValue(SelectionDAG &DAG,
                             const SIMachineFunctionInfo &MFI,
                             EVT VT,
@@ -84,6 +90,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                         unsigned NewOpcode) const;
 
   SDValue lowerWaveID(SelectionDAG &DAG, SDValue Op) const;
+  SDValue lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
+                              AMDGPU::Hwreg::Id HwReg, unsigned LowBit,
+                              unsigned Width) const;
   SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim,
                           const ArgDescriptor &ArgDesc) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 24a20cc9dcf82..dffb3d7459e64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -928,7 +928,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return Opcode == AMDGPU::S_CMPK_EQ_U32 || Opcode == AMDGPU::S_CMPK_LG_U32 ||
            Opcode == AMDGPU::S_CMPK_GT_U32 || Opcode == AMDGPU::S_CMPK_GE_U32 ||
            Opcode == AMDGPU::S_CMPK_LT_U32 || Opcode == AMDGPU::S_CMPK_LE_U32 ||
-           Opcode == AMDGPU::S_GETREG_B32;
+           Opcode == AMDGPU::S_GETREG_B32 ||
+           Opcode == AMDGPU::S_GETREG_B32_const;
   }
 
   /// \returns true if this is an s_store_dword* instruction. This is more
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 54426d33d3473..1f11be475e9f8 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -195,6 +195,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
     VGPRForAGPRCopy =
         AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
   }
+
+  ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
 }
 
 MachineFunctionInfo *SIMachineFunctionInfo::clone(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ca8f8033a2d54..45606153db58e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -465,6 +465,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // Default/requested number of work groups for the function.
   SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
 
+  // Requested cluster dimensions.
+  AMDGPU::ClusterDimsAttr ClusterDims;
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1207,6 +1210,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
   unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
+
+  AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index fe94887cdff98..296ce5a46287c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1127,19 +1127,26 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
   "$sdst, $simm16"
 >;
 
-// This is hasSideEffects to allow its use in readcyclecounter selection.
 // FIXME: Need to truncate immediate to 16-bits.
-// FIXME: Should have separate pseudos for known may read MODE and
-// only read MODE.
-def S_GETREG_B32 : SOPK_Pseudo <
+class S_GETREG_B32_Pseudo<list<dag> pattern=[]> : SOPK_Pseudo <
   "s_getreg_b32",
   (outs SReg_32:$sdst), (ins hwreg:$simm16),
-  "$sdst, $simm16",
-  [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
+  "$sdst, $simm16", pattern>;
+
+// This is hasSideEffects to allow its use in readcyclecounter selection.
+// FIXME: Should have separate pseudos for known may read MODE and
+// only read MODE.
+def S_GETREG_B32 : S_GETREG_B32_Pseudo<
+    [(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
   let hasSideEffects = 1;
   let Uses = [MODE];
 }
 
+// A version of the pseudo for reading hardware register fields that are
+// known to remain the same during the course of the run. Has no side
+// effects and doesn't read MODE.
+def S_GETREG_B32_const : S_GETREG_B32_Pseudo;
+
 let Defs = [MODE], Uses = [MODE] in {
 
 // FIXME: Need to truncate immediate to 16-bits.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 40da4f96aefdb..faae1fee342af 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -3533,6 +3533,54 @@ bool isPackedFP32Inst(unsigned Opc) {
   }
 }
 
+const std::array<unsigned, 3> &ClusterDimsAttr::getDims() const {
+  assert(getKind() == Kind::FixedDims && "expect kind to be FixedDims");
+  return Dims; // Only meaningful for the FixedDims kind asserted above.
+}
+
+/// Renders the attribute as comma-separated "x,y,z" text. The Unknown kind
+/// yields an empty string.
+std::string ClusterDimsAttr::to_string() const {
+  // Shared formatter so every kind that prints goes through the same path.
+  auto Join = [](unsigned X, unsigned Y, unsigned Z) -> std::string {
+    SmallString<10> Text;
+    raw_svector_ostream OS(Text);
+    OS << X << ',' << Y << ',' << Z;
+    return Text.c_str();
+  };
+
+  switch (getKind()) {
+  case Kind::Unknown:
+    return "";
+  case Kind::NoCluster:
+    return Join(EncoNoCluster, EncoNoCluster, EncoNoCluster);
+  case Kind::VariableDims:
+    return Join(EncoVariableDims, EncoVariableDims, EncoVariableDims);
+  case Kind::FixedDims:
+    return Join(Dims[0], Dims[1], Dims[2]);
+  }
+  llvm_unreachable("Unknown ClusterDimsAttr kind");
+}
+
+/// Derives the attribute from the "amdgpu-cluster-dims" value read off \p F.
+ClusterDimsAttr ClusterDimsAttr::get(const Function &F) {
+  std::optional<SmallVector<unsigned>> Parsed =
+      getIntegerVecAttribute(F, "amdgpu-cluster-dims", /*Size=*/3);
+  if (!Parsed)
+    return ClusterDimsAttr(Kind::Unknown);
+  // Values that all equal one sentinel select a kind that stores no dims.
+  const auto AllEqualTo = [&Parsed](unsigned Sentinel) {
+    return all_of(*Parsed, [=](unsigned V) { return V == Sentinel; });
+  };
+  if (AllEqualTo(EncoNoCluster))
+    return ClusterDimsAttr(Kind::NoCluster);
+  if (AllEqualTo(EncoVariableDims))
+    return ClusterDimsAttr(Kind::VariableDims);
+  ClusterDimsAttr Fixed(Kind::FixedDims);
+  Fixed.Dims = {(*Parsed)[0], (*Parsed)[1], (*Parsed)[2]};
+  return Fixed;
+}
+
 } // namespace AMDGPU
 
 raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 3fcd16f9290b1..3f8d43db5a48c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1813,6 +1813,50 @@ bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode);
 /// must be defined in terms of bytes.
 unsigned getLdsDwGranularity(const MCSubtargetInfo &ST);
 
+/// The cluster dimensions requested for a function. The kind is Unknown,
+/// NoCluster, VariableDims, or FixedDims; only FixedDims carries usable dims.
+class ClusterDimsAttr {
+public:
+  enum class Kind { Unknown, NoCluster, VariableDims, FixedDims };
+
+  ClusterDimsAttr() = default;
+
+  Kind getKind() const { return AttrKind; }
+
+  bool isUnknown() const { return getKind() == Kind::Unknown; }
+  bool isNoCluster() const { return getKind() == Kind::NoCluster; }
+  bool isFixedDims() const { return getKind() == Kind::FixedDims; }
+  bool isVariableDims() const { return getKind() == Kind::VariableDims; }
+  /// Misspelled alias of isVariableDims(), kept for existing callers.
+  bool isVariableedDims() const { return isVariableDims(); }
+
+  // Each setter replaces the whole object, so the stored dims reset to zero.
+  void setUnknown() { *this = ClusterDimsAttr(Kind::Unknown); }
+  void setNoCluster() { *this = ClusterDimsAttr(Kind::NoCluster); }
+  void setVariableDims() { *this = ClusterDimsAttr(Kind::VariableDims); }
+
+  /// \returns the dims stored. Note that this function can only be called if
+  /// the kind is Kind::FixedDims.
+  const std::array<unsigned, 3> &getDims() const;
+
+  bool operator==(const ClusterDimsAttr &RHS) const {
+    return AttrKind == RHS.AttrKind && Dims == RHS.Dims;
+  }
+
+  std::string to_string() const;
+
+  static ClusterDimsAttr get(const Function &F);
+
+private:
+  enum Encoding { EncoNoCluster = 0, EncoVariableDims = 1024 };
+
+  ClusterDimsAttr(Kind AttrKind) : AttrKind(AttrKind) {}
+
+  std::array<unsigned, 3> Dims = {0, 0, 0};
+
+  Kind AttrKind = Kind::Unknown;
+};
+
 } // end namespace AMDGPU
 
 raw_ostream &operator<<(raw_ostream &OS,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
new file mode 100644
index 0000000000000..aa3b7b3606fd8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
@@ -0,0 +1,1258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-MESA3D %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s
+
+declare i32 @llvm.amdgcn.cluster.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.cluster.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.cluster.workgroup.id.z() #0
+
+define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_id_x:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_and_b32 s2, ttmp6, 15
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_id_x:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_and_b32 s2, ttmp6, 15
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_and_b32 s2, ttmp6, 15
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_id_x:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_and_b32 s2, ttmp6, 15
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1,2,2" {
+; CHECK-UNKNOWN-LABEL: test_workgroup_id_x_optimized:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_id_x_optimized:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 1
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x_optimized:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_id_x_optimized:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 1
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_id_y:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_id_y:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_id_y:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" {
+; CHECK-UNKNOWN-LABEL: test_workgroup_id_y_optimized:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_id_y_optimized:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 1
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y_optimized:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_id_y_optimized:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 1
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_id_z:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_id_z:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_id_z:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
+; CHECK-UNKNOWN-LABEL: test_workgroup_flat_id:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_flat_id:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_flat_id:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_flat_id:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.flat.id()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,2,1" {
+; CHECK-UNKNOWN-LABEL: test_workgroup_id_z_optimized:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_id_z_optimized:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 1
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z_optimized:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_id_z_optimized:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 1
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v0, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
new file mode 100644
index 0000000000000..afe37e371fbc3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-MESA3D %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s
+
+declare i32 @llvm.amdgcn.cluster.workgroup.max.flat.id() #0
+
+define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_flat_id:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_flat_id:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_flat_id:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_flat_id:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.flat.id()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
new file mode 100644
index 0000000000000..7ea4fa5373e57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
@@ -0,0 +1,1077 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-UNKNOWN %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-MESA3D %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 %s -o - | FileCheck --check-prefixes=CHECK-G-UNKNOWN %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=CHECK-G-MESA3D %s
+
+declare i32 @llvm.amdgcn.cluster.workgroup.max.id.x() #0
+declare i32 @llvm.amdgcn.cluster.workgroup.max.id.y() #0
+declare i32 @llvm.amdgcn.cluster.workgroup.max.id.z() #0
+
+define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_id_x:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_x:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_id_x_optimized:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_x_optimized:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_id_y:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_y:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_id_y_optimized:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_y_optimized:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
+; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_id_z:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
+; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
+; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_z:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
+; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" {
+; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized:
+; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6
+; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-UNKNOWN-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-MESA3D-LABEL: test_workgroup_max_id_z_optimized:
+; CHECK-MESA3D:         .amd_kernel_code_t
+; CHECK-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     priority = 0
+; CHECK-MESA3D-NEXT:     float_mode = 240
+; CHECK-MESA3D-NEXT:     priv = 0
+; CHECK-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-MESA3D-NEXT:     debug_mode = 0
+; CHECK-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-MESA3D-NEXT:     enable_exception = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-MESA3D-NEXT:     private_element_size = 1
+; CHECK-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-MESA3D-NEXT:     call_convention = -1
+; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6
+; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-MESA3D-NEXT:    global_store_b32 v0, v1, s[0:1]
+; CHECK-MESA3D-NEXT:    s_endpgm
+;
+; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized:
+; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0
+; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-UNKNOWN-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-UNKNOWN-NEXT:    s_endpgm
+;
+; CHECK-G-MESA3D-LABEL: test_workgroup_max_id_z_optimized:
+; CHECK-G-MESA3D:         .amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:     amd_code_version_major = 1
+; CHECK-G-MESA3D-NEXT:     amd_code_version_minor = 2
+; CHECK-G-MESA3D-NEXT:     amd_machine_kind = 1
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_major = 12
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_minor = 5
+; CHECK-G-MESA3D-NEXT:     amd_machine_version_stepping = 0
+; CHECK-G-MESA3D-NEXT:     kernel_code_entry_byte_offset = 256
+; CHECK-G-MESA3D-NEXT:     kernel_code_prefetch_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     granulated_workitem_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     granulated_wavefront_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     priority = 0
+; CHECK-G-MESA3D-NEXT:     float_mode = 240
+; CHECK-G-MESA3D-NEXT:     priv = 0
+; CHECK-G-MESA3D-NEXT:     enable_dx10_clamp = 0
+; CHECK-G-MESA3D-NEXT:     debug_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_ieee_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_wgp_mode = 0
+; CHECK-G-MESA3D-NEXT:     enable_mem_ordered = 1
+; CHECK-G-MESA3D-NEXT:     enable_fwd_progress = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; CHECK-G-MESA3D-NEXT:     user_sgpr_count = 8
+; CHECK-G-MESA3D-NEXT:     enable_trap_handler = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_x = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_y = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_id_z = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_workgroup_info = 0
+; CHECK-G-MESA3D-NEXT:     enable_vgpr_workitem_id = 2
+; CHECK-G-MESA3D-NEXT:     enable_exception_msb = 0
+; CHECK-G-MESA3D-NEXT:     granulated_lds_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_exception = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_buffer = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_queue_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_dispatch_id = 1
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_flat_scratch_init = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_private_segment_size = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; CHECK-G-MESA3D-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; CHECK-G-MESA3D-NEXT:     enable_wavefront_size32 = 1
+; CHECK-G-MESA3D-NEXT:     enable_ordered_append_gds = 0
+; CHECK-G-MESA3D-NEXT:     private_element_size = 1
+; CHECK-G-MESA3D-NEXT:     is_ptr64 = 1
+; CHECK-G-MESA3D-NEXT:     is_dynamic_callstack = 0
+; CHECK-G-MESA3D-NEXT:     is_debug_enabled = 0
+; CHECK-G-MESA3D-NEXT:     is_xnack_enabled = 0
+; CHECK-G-MESA3D-NEXT:     workitem_private_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     workgroup_group_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     gds_segment_byte_size = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_byte_size = 24
+; CHECK-G-MESA3D-NEXT:     workgroup_fbarrier_count = 0
+; CHECK-G-MESA3D-NEXT:     wavefront_sgpr_count = 6
+; CHECK-G-MESA3D-NEXT:     workitem_vgpr_count = 2
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_vgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_first = 0
+; CHECK-G-MESA3D-NEXT:     reserved_sgpr_count = 0
+; CHECK-G-MESA3D-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     debug_private_segment_buffer_sgpr = 0
+; CHECK-G-MESA3D-NEXT:     kernarg_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     group_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     private_segment_alignment = 4
+; CHECK-G-MESA3D-NEXT:     wavefront_size = 5
+; CHECK-G-MESA3D-NEXT:     call_convention = -1
+; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
+; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
+; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0
+; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
+; CHECK-G-MESA3D-NEXT:    global_store_b32 v1, v0, s[0:1]
+; CHECK-G-MESA3D-NEXT:    s_endpgm
+  %id = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 2554d99def57f..169a84ff1f86b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -297,6 +297,6 @@ declare i32 @llvm.amdgcn.workgroup.id.y()
 declare i32 @llvm.amdgcn.workgroup.id.z()
 declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
 
-attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" }
+attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX9ARCH: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
new file mode 100644
index 0000000000000..69439d49e588f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GFX1250-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel %s -o - | FileCheck -check-prefix=GFX1250-GISEL %s
+
+define void @test_workgroup_id_x_non_kernel(ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, ttmp9, s1
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s0, ttmp9, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_x_non_kernel_optimized_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1024,1024,1024" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel_optimized_used:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel_optimized_used:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_x_non_kernel_optimized_not_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="0,0,0" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel_optimized_not_used:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, ttmp9
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel_optimized_not_used:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, ttmp9
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_x_non_kernel_optimized_fixed:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_lshl_b32 s0, ttmp9, 1
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_x_non_kernel_optimized_fixed:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_and_b32 s0, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_lshl1_add_u32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.x()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_y_non_kernel(ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40010
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s2, s2, s0
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, s1, s2
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x40010
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, s1, s0
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s2, s2, s0
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s0, s1, s2
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_y_non_kernel_optimized_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1024,1024,1024" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel_optimized_used:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40010
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_mul_i32 s1, s1, s0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40004
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel_optimized_used:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x40010
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT:    s_mul_i32 s1, s1, s0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s2, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_y_non_kernel_optimized_not_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="0,0,0" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel_optimized_not_used:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel_optimized_not_used:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_y_non_kernel_optimized_fixed(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_y_non_kernel_optimized_fixed:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_y_non_kernel_optimized_fixed:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.y()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_z_non_kernel(ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40014
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s2, s2, s0
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, s1, s2
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x40014
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, s1, s0
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s2, s2, s0
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s0, s1, s2
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_z_non_kernel_optimized_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1024,1024,1024" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel_optimized_used:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40014
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_mul_i32 s1, s1, s0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel_optimized_used:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x40014
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
+; GFX1250-GISEL-NEXT:    s_mul_i32 s1, s1, s0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s2, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_z_non_kernel_optimized_not_used(ptr addrspace(1) %out) "amdgpu-cluster-dims"="0,0,0" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel_optimized_not_used:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel_optimized_not_used:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_workgroup_id_z_non_kernel_optimized_fixed(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" {
+; GFX1250-SDAG-LABEL: test_workgroup_id_z_non_kernel_optimized_fixed:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 15
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s1, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT:    s_and_b32 s0, s0, 0x1fffe
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-SDAG-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: test_workgroup_id_z_non_kernel_optimized_fixed:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s1, ttmp6, 0x40008
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_lshl1_add_u32 s0, s0, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workgroup.id.z()
+  store i32 %id, ptr addrspace(1) %out
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
new file mode 100644
index 0000000000000..497241cff392d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
@@ -0,0 +1,376 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs < %s | FileCheck -check-prefix=GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel < %s | FileCheck -check-prefix=GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel < %s | FileCheck -check-prefix=GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -global-isel < %s | FileCheck -check-prefix=GFX1250-GISEL %s
+
+define amdgpu_cs void @_amdgpu_cs_main() {
+; GFX9-SDAG-LABEL: _amdgpu_cs_main:
+; GFX9-SDAG:       ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: _amdgpu_cs_main:
+; GFX9-GISEL:       ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX9-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: _amdgpu_cs_main:
+; GFX12-SDAG:       ; %bb.0: ; %.entry
+; GFX12-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX12-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: _amdgpu_cs_main:
+; GFX12-GISEL:       ; %bb.0: ; %.entry
+; GFX12-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX12-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX12-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: _amdgpu_cs_main:
+; GFX1250-SDAG:       ; %bb.0: ; %.entry
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-SDAG-NEXT:    s_and_b32 s3, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s2, s2, 1
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s4, ttmp6, 0x40014
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, s3, s2
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s5, ttmp7, 16
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s4, s4, 1
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s2, s2, s0
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, s5, s4
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s4, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s4, s4, s0
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s0, s5, s4
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s1, ttmp9, s1
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s2
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: _amdgpu_cs_main:
+; GFX1250-GISEL:       ; %bb.0: ; %.entry
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s0, ttmp9, s1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s1, ttmp6, 0x40010
+; GFX1250-GISEL-NEXT:    s_and_b32 s3, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s1, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s4, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT:    s_mul_i32 s1, s3, s1
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s4, s4, s1
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s1, s3, s4
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s3, ttmp6, 0x40014
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s4, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s3, s3, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s5, ttmp6, 0x40008
+; GFX1250-GISEL-NEXT:    s_mul_i32 s3, s4, s3
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s5, s5, s3
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s2, s4, s5
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX1250-GISEL-NEXT:    s_endpgm
+.entry:
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+  %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+  %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+  %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_cs void @workgroup_id_no_clusters() "amdgpu-cluster-dims"="0,0,0" {
+; GFX9-SDAG-LABEL: workgroup_id_no_clusters:
+; GFX9-SDAG:       ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: workgroup_id_no_clusters:
+; GFX9-GISEL:       ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX9-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: workgroup_id_no_clusters:
+; GFX12-SDAG:       ; %bb.0: ; %.entry
+; GFX12-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX12-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: workgroup_id_no_clusters:
+; GFX12-GISEL:       ; %bb.0: ; %.entry
+; GFX12-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX12-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX12-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: workgroup_id_no_clusters:
+; GFX1250-SDAG:       ; %bb.0: ; %.entry
+; GFX1250-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: workgroup_id_no_clusters:
+; GFX1250-GISEL:       ; %bb.0: ; %.entry
+; GFX1250-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX1250-GISEL-NEXT:    s_endpgm
+.entry:
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+  %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+  %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+  %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" {
+; GFX9-SDAG-LABEL: workgroup_id_optimized:
+; GFX9-SDAG:       ; %bb.0: ; %.entry
+; GFX9-SDAG-NEXT:    s_lshr_b32 s0, ttmp7, 16
+; GFX9-SDAG-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-SDAG-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: workgroup_id_optimized:
+; GFX9-GISEL:       ; %bb.0: ; %.entry
+; GFX9-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX9-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-GISEL-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: workgroup_id_optimized:
+; GFX12-SDAG:       ; %bb.0: ; %.entry
+; GFX12-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
+; GFX12-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: workgroup_id_optimized:
+; GFX12-GISEL:       ; %bb.0: ; %.entry
+; GFX12-GISEL-NEXT:    s_mov_b32 s0, ttmp9
+; GFX12-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX12-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: workgroup_id_optimized:
+; GFX1250-SDAG:       ; %bb.0: ; %.entry
+; GFX1250-SDAG-NEXT:    s_lshl_b32 s0, ttmp9, 1
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s2, ttmp7, 14
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-SDAG-NEXT:    s_and_b32 s0, s2, 0x3fffc
+; GFX1250-SDAG-NEXT:    s_and_b32 s2, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s3, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT:    s_mul_i32 s2, s2, 3
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s4, ttmp6, 0x40004
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s3, s3, s0
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s4, s4, s2
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-SDAG-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: workgroup_id_optimized:
+; GFX1250-GISEL:       ; %bb.0: ; %.entry
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_and_b32 s0, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT:    s_mul_i32 s1, s1, 3
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s3, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s4, ttmp6, 0x40008
+; GFX1250-GISEL-NEXT:    s_lshl1_add_u32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s2, s1
+; GFX1250-GISEL-NEXT:    s_lshl2_add_u32 s2, s3, s4
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-GISEL-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], null
+; GFX1250-GISEL-NEXT:    s_endpgm
+.entry:
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  %idy = call i32 @llvm.amdgcn.workgroup.id.y()
+  %idz = call i32 @llvm.amdgcn.workgroup.id.z()
+  %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0
+  %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1
+  %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_cs void @caller() {
+; GFX9-SDAG-LABEL: caller:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-SDAG-NEXT:    s_mov_b32 s8, s0
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, callee@abs32@hi
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, callee@abs32@lo
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_add_u32 s8, s8, s0
+; GFX9-SDAG-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: caller:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-GISEL-NEXT:    s_mov_b32 s8, s0
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9-GISEL-NEXT:    s_mov_b32 s4, callee@abs32@lo
+; GFX9-GISEL-NEXT:    s_mov_b32 s5, callee@abs32@hi
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_add_u32 s8, s8, s0
+; GFX9-GISEL-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX12-SDAG-LABEL: caller:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX12-SDAG-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; GFX12-SDAG-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: caller:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX12-GISEL-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; GFX12-GISEL-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: caller:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s2, ttmp9, s1
+; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], callee@abs64
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-SDAG-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: caller:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s2, ttmp9, s1
+; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], callee@abs64
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %idx = call i32 @llvm.amdgcn.workgroup.id.x()
+  call amdgpu_gfx void @callee(i32 %idx)
+  ret void
+}
+
+declare amdgpu_gfx void @callee(i32)
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 25609e881254e..b2bcb74e4184f 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -4089,32 +4089,44 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ; GFX1250-NEXT:    s_add_co_i32 s0, s10, 1
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mul_lo_u32 v1, s0, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v2, s0, v1 :: v_dual_add_nc_u32 v1, 1, v1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_and_b32 s4, ttmp6, 15
+; GFX1250-NEXT:    s_getreg_b32 s5, hwreg(HW_REG_IB_STS2, 6, 4)
 ; GFX1250-NEXT:    v_mul_lo_u32 v2, v2, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_bfe_u32 s3, ttmp6, 0x4000c
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, 1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_mul_i32 s3, ttmp9, s3
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v1, v3, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_add_co_i32 s4, s4, s3
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v2, 1, v3
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 0
 ; GFX1250-NEXT:    v_mul_lo_u32 v3, v1, v2
-; GFX1250-NEXT:    v_mad_u32 v0, ttmp9, s2, v0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_cselect_b32 s3, ttmp9, s4
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_mad_u32 v0, s3, s2, v0
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v2, v3, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_mul_lo_u32 v2, v2, v1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_mad_u32 v3, v2, v3, v2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 2, s[8:9]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1250-NEXT:    v_mad_u32 v2, v3, v2, v3
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX1250-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index 7a64e55abb8d3..afca83a7e1c36 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefixes=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -global-isel=1 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
 
 define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ;
@@ -15,6 +17,50 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX1200-LABEL: workgroup_id_x:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1200-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: workgroup_id_x:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_and_b32 s3, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s2, s2, 1
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_mul_i32 s2, ttmp9, s2
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s3, s3, s2
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s2, ttmp9, s3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: workgroup_id_x:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_and_b32 s3, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s2, s2, 1
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_mul_i32 s2, ttmp9, s2
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s3, s3, s2
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s2, ttmp9, s3
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT:    s_endpgm
 ; GFX12-LABEL: workgroup_id_x:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -41,6 +87,74 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
 ; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX1200-LABEL: workgroup_id_xy:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-NEXT:    s_and_b32 s4, ttmp7, 0xffff
+; GFX1200-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
+; GFX1200-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1200-NEXT:    global_store_b32 v1, v2, s[2:3]
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: workgroup_id_xy:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s6, ttmp6, 0x40010
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT:    s_and_b32 s4, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s6, s6, 1
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s7, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_mul_i32 s5, s4, s6
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s6, ttmp6, 0x40004
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s7, s7, 1
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s6, s6, s5
+; GFX1250-SDAG-NEXT:    s_and_b32 s5, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_mul_i32 s7, ttmp9, s7
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s8, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s5, s5, s7
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s8, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s5, ttmp9, s5
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_clause 0x1
+; GFX1250-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT:    global_store_b32 v0, v2, s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: workgroup_id_xy:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s6, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s6, s6, 1
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT:    s_and_b32 s4, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_mul_i32 s5, ttmp9, s6
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s4, s4, s5
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s4, ttmp9, s4
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s5, ttmp6, 0x40010
+; GFX1250-GISEL-NEXT:    s_and_b32 s7, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s5, s5, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s8, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT:    s_mul_i32 s5, s7, s5
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s8, s8, s5
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s4, s7, s8
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_clause 0x1
+; GFX1250-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT:    global_store_b32 v1, v2, s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
 ; GFX12-LABEL: workgroup_id_xy:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -77,6 +191,99 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
 ; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX1200-LABEL: workgroup_id_xyz:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1200-NEXT:    s_and_b32 s6, ttmp7, 0xffff
+; GFX1200-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
+; GFX1200-NEXT:    s_lshr_b32 s7, ttmp7, 16
+; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x2
+; GFX1200-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1200-NEXT:    global_store_b32 v1, v2, s[2:3]
+; GFX1200-NEXT:    global_store_b32 v1, v3, s[4:5]
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-SDAG-LABEL: workgroup_id_xyz:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40014
+; GFX1250-SDAG-NEXT:    s_lshr_b32 s6, ttmp7, 16
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s7, s0, 1
+; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s9, ttmp6, 0x40010
+; GFX1250-SDAG-NEXT:    s_mul_i32 s7, s6, s7
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s8, ttmp6, 0x40008
+; GFX1250-SDAG-NEXT:    s_and_b32 s10, ttmp7, 0xffff
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s9, s9, 1
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s11, ttmp6, 0x4000c
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s8, s8, s7
+; GFX1250-SDAG-NEXT:    s_mul_i32 s7, s10, s9
+; GFX1250-SDAG-NEXT:    s_bfe_u32 s9, ttmp6, 0x40004
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s11, s11, 1
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s9, s9, s7
+; GFX1250-SDAG-NEXT:    s_and_b32 s7, ttmp6, 15
+; GFX1250-SDAG-NEXT:    s_mul_i32 s11, ttmp9, s11
+; GFX1250-SDAG-NEXT:    s_getreg_b32 s12, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-SDAG-NEXT:    s_add_co_i32 s7, s7, s11
+; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s7, ttmp9, s7
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s7, s10, s9
+; GFX1250-SDAG-NEXT:    s_cselect_b32 s6, s6, s8
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s6
+; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT:    s_clause 0x2
+; GFX1250-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT:    global_store_b32 v0, v2, s[2:3]
+; GFX1250-SDAG-NEXT:    global_store_b32 v0, v3, s[4:5]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: workgroup_id_xyz:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
+; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4)
+; GFX1250-GISEL-NEXT:    s_mul_i32 s0, ttmp9, s0
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s7, ttmp9, s1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x40010
+; GFX1250-GISEL-NEXT:    s_and_b32 s8, ttmp7, 0xffff
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s9, ttmp6, 0x40004
+; GFX1250-GISEL-NEXT:    s_mul_i32 s10, s8, s0
+; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s9, s9, s10
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s8, s8, s9
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s9, ttmp6, 0x40014
+; GFX1250-GISEL-NEXT:    s_lshr_b32 s10, ttmp7, 16
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s9, s9, 1
+; GFX1250-GISEL-NEXT:    s_bfe_u32 s11, ttmp6, 0x40008
+; GFX1250-GISEL-NEXT:    s_mul_i32 s9, s10, s9
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT:    s_add_co_i32 s11, s11, s9
+; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX1250-GISEL-NEXT:    s_cselect_b32 s6, s10, s11
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, s6
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    s_clause 0x2
+; GFX1250-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT:    global_store_b32 v1, v2, s[2:3]
+; GFX1250-GISEL-NEXT:    global_store_b32 v1, v3, s[4:5]
+; GFX1250-GISEL-NEXT:    s_endpgm
 ; GFX12-LABEL: workgroup_id_xyz:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_clause 0x1
@@ -107,7 +314,6 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 declare i32 @llvm.amdgcn.workgroup.id.y()
 declare i32 @llvm.amdgcn.workgroup.id.z()
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
+; GFX1250: {{.*}}
 ; GFX9-GISEL: {{.*}}
 ; GFX9-SDAG: {{.*}}

>From ffcaeca90a3c0965acace6645f775ab1d876fa6e Mon Sep 17 00:00:00 2001
From: Afanasyev Ivan <ivafanas at gmail.com>
Date: Sat, 13 Sep 2025 08:45:54 +0700
Subject: [PATCH 25/39] [CodeGen] Fix partial phi input removal in
 TailDuplicator. (#158265)

The tail duplicator removes only the first PHI incoming entry for the
predecessor basic block, while it should remove all operands for this block.

PHI instructions happen to have duplicated values for the same
predecessor block:
* `UnreachableMachineBlockElim` assumes that PHI instruction might have
duplicates:
https://github.com/llvm/llvm-project/blob/7289f2cd0c371b2539faa628ec0eea58fa61892c/llvm/lib/CodeGen/UnreachableBlockElim.cpp#L160
* `AArch64` directly states that PHI instruction might have duplicates:
https://github.com/llvm/llvm-project/blob/7289f2cd0c371b2539faa628ec0eea58fa61892c/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp#L244
* And `Hexagon`:
https://github.com/llvm/llvm-project/blob/7289f2cd0c371b2539faa628ec0eea58fa61892c/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp#L844

We caught the bug on a custom out-of-tree backend. `TailDuplicator`
should remove all operands corresponding to the block being removed.

Please note that the bug likely does not affect in-tree backends, because:
* It happens only in scenario of **partial** tail duplication (i.e. tail
block is duplicated in some predecessors, but not in all of them)
* It happens in **Pre-RA** tail duplication only (Post-RA does not
contain PHIs, obviously)
* The only backend (that I know of) that uses Pre-RA tail duplication is X86.
It uses tail duplication via the `early-tailduplication` pass, which declines
partial tail duplication via the `canCompletelyDuplicateBB` check, because it
uses the `TailDuplicator::tailDuplicateBlocks` public API.

So, the bug happens only in the case of Pre-RA partial tail duplication if
the backend uses the `TailDuplicator::tailDuplicate` public API directly.

That's why I cannot add a reproducer test for in-tree backends.
---
 llvm/lib/CodeGen/TailDuplicator.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 5d720fbbf1c61..9b1420a94142d 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -375,9 +375,14 @@ void TailDuplicator::processPHI(
   if (!Remove)
     return;
 
-  // Remove PredBB from the PHI node.
-  MI->removeOperand(SrcOpIdx + 1);
-  MI->removeOperand(SrcOpIdx);
+  // MI might have multiple entries for PredBB. Need to remove them all.
+  for (unsigned N = MI->getNumOperands(); N > 2; N -= 2) {
+    if (MI->getOperand(N - 1).getMBB() == PredBB) {
+      MI->removeOperand(N - 1);
+      MI->removeOperand(N - 2);
+    }
+  }
+
   if (MI->getNumOperands() == 1 && !TailBB->hasAddressTaken())
     MI->eraseFromParent();
   else if (MI->getNumOperands() == 1)

>From 7aad3830fb208771254b4ae63a01042744471091 Mon Sep 17 00:00:00 2001
From: lntue <lntue at google.com>
Date: Fri, 12 Sep 2025 21:49:34 -0400
Subject: [PATCH 26/39] [libc] Some MSVC compatibility changes for
 src/string/memory_utils. (#158393)

---
 libc/src/__support/endian_internal.h                 | 12 +++++++-----
 libc/src/__support/macros/config.h                   |  2 ++
 libc/src/string/memory_utils/CMakeLists.txt          |  1 +
 libc/src/string/memory_utils/op_generic.h            | 11 +++++++++++
 libc/src/string/memory_utils/op_x86.h                | 10 ++++++++++
 libc/src/string/memory_utils/utils.h                 |  5 +++++
 libc/test/UnitTest/CMakeLists.txt                    |  1 +
 libc/test/UnitTest/LibcTest.h                        |  5 +++++
 .../libc/test/UnitTest/BUILD.bazel                   |  1 +
 9 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/libc/src/__support/endian_internal.h b/libc/src/__support/endian_internal.h
index 4ac8709625d3a..07cde7b905c4d 100644
--- a/libc/src/__support/endian_internal.h
+++ b/libc/src/__support/endian_internal.h
@@ -35,7 +35,7 @@ template <> LIBC_INLINE uint16_t byte_swap<uint16_t>(uint16_t value) {
 #if __has_builtin(__builtin_bswap16)
   return __builtin_bswap16(value);
 #else
-  return (v << 8) | (v >> 8);
+  return (value << 8) | (value >> 8);
 #endif // __builtin_bswap16
 }
 
@@ -43,8 +43,9 @@ template <> LIBC_INLINE uint32_t byte_swap<uint32_t>(uint32_t value) {
 #if __has_builtin(__builtin_bswap32)
   return __builtin_bswap32(value);
 #else
-  return byte_swap<uint16_t>(static_cast<uint16>(v >> 16)) ||
-         (static_cast<uint32_t>(byte_swap<uint16_t>(static_cast<uint16_t>(v)))
+  return byte_swap<uint16_t>(static_cast<uint16_t>(value >> 16)) ||
+         (static_cast<uint32_t>(
+              byte_swap<uint16_t>(static_cast<uint16_t>(value)))
           << 16);
 #endif // __builtin_bswap64
 }
@@ -53,8 +54,9 @@ template <> LIBC_INLINE uint64_t byte_swap<uint64_t>(uint64_t value) {
 #if __has_builtin(__builtin_bswap64)
   return __builtin_bswap64(value);
 #else
-  return byte_swap<uint32_t>(static_cast<uint32>(v >> 32)) ||
-         (static_cast<uint64_t>(byte_swap<uint32_t>(static_cast<uint32_t>(v)))
+  return byte_swap<uint32_t>(static_cast<uint32_t>(value >> 32)) ||
+         (static_cast<uint64_t>(
+              byte_swap<uint32_t>(static_cast<uint32_t>(value)))
           << 32);
 #endif // __builtin_bswap64
 }
diff --git a/libc/src/__support/macros/config.h b/libc/src/__support/macros/config.h
index 501a816d49631..b06a890c9c13c 100644
--- a/libc/src/__support/macros/config.h
+++ b/libc/src/__support/macros/config.h
@@ -46,6 +46,8 @@
 #define __builtin_expect(value, expectation) (value)
 #define __builtin_unreachable() __assume(0)
 
+#define __builtin_prefetch(X, Y, Z)
+
 #endif // LIBC_COMPILER_IS_MSVC
 
 #ifdef __clang__
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 670db30129572..9cabfb9318012 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -42,6 +42,7 @@ add_header_library(
     libc.src.__support.macros.config
     libc.src.__support.macros.optimization
     libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.compiler
 )
 
 add_header_library(
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 37603410e3a51..010f2187a4ffd 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -31,6 +31,7 @@
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL
 #include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/compiler.h"
 #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/utils.h"
@@ -39,12 +40,22 @@ static_assert((UINTPTR_MAX == 4294967295U) ||
                   (UINTPTR_MAX == 18446744073709551615UL),
               "We currently only support 32- or 64-bit platforms");
 
+#ifdef LIBC_COMPILER_IS_MSVC
+
+namespace LIBC_NAMESPACE_DECL {
+using generic_v128 = __m128i;
+using generic_v256 = __m256i;
+using generic_v512 = __m512i;
+} // namespace LIBC_NAMESPACE_DECL
+
+#else
 namespace LIBC_NAMESPACE_DECL {
 // Compiler types using the vector attributes.
 using generic_v128 = uint8_t __attribute__((__vector_size__(16)));
 using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
 using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
 } // namespace LIBC_NAMESPACE_DECL
+#endif // LIBC_COMPILER_IS_MSVC
 
 namespace LIBC_NAMESPACE_DECL {
 namespace generic {
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 8bd84120c4ffa..1b4052747552d 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -15,6 +15,7 @@
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL
 #include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/compiler.h"
 
 #if defined(LIBC_TARGET_ARCH_IS_X86)
 
@@ -57,7 +58,12 @@ LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
 // Memcpy repmovsb implementation
 struct Memcpy {
   LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
+#ifdef LIBC_COMPILER_IS_MSVC
+    __movsb(static_cast<unsigned char *>(dst),
+            static_cast<const unsigned char *>(src), count);
+#else
     asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+#endif // LIBC_COMPILER_IS_MSVC
   }
 };
 
@@ -138,8 +144,10 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
 // When we use these SIMD types in template specialization GCC complains:
 // "ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
 // Therefore, we disable this warning in this file.
+#ifndef LIBC_COMPILER_IS_MSVC
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif // !LIBC_COMPILER_IS_MSVC
 
 ///////////////////////////////////////////////////////////////////////////////
 // Specializations for __m128i
@@ -366,7 +374,9 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
 }
 #endif // __AVX512BW__
 
+#ifndef LIBC_COMPILER_IS_MSVC
 #pragma GCC diagnostic pop
+#endif // !LIBC_COMPILER_IS_MSVC
 
 } // namespace generic
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 0f9c9e36a3dcd..86ff4f12e8c26 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -17,6 +17,7 @@
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL
 #include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/compiler.h"
 
 #include <stddef.h> // size_t
 
@@ -90,13 +91,17 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst,
   // different value of the Size parameter. This doesn't play well with GCC's
   // Value Range Analysis that wrongly detects out of bounds accesses. We
   // disable these warnings for the purpose of this function.
+#ifndef LIBC_COMPILER_IS_MSVC
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Warray-bounds"
 #pragma GCC diagnostic ignored "-Wstringop-overread"
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif // !LIBC_COMPILER_IS_MSVC
   for (size_t i = 0; i < Size; ++i)
     static_cast<char *>(dst)[i] = static_cast<const char *>(src)[i];
+#ifndef LIBC_COMPILER_IS_MSVC
 #pragma GCC diagnostic pop
+#endif // !LIBC_COMPILER_IS_MSVC
 #endif
 }
 
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index f1a83fc601e5e..31d1e9dce8204 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -76,6 +76,7 @@ add_unittest_framework_library(
     libc.src.__support.CPP.string_view
     libc.src.__support.CPP.type_traits
     libc.src.__support.fixed_point.fx_rep
+    libc.src.__support.macros.properties.compiler
     libc.src.__support.macros.properties.types
     libc.src.__support.OSUtil.osutil
     libc.src.__support.uint128
diff --git a/libc/test/UnitTest/LibcTest.h b/libc/test/UnitTest/LibcTest.h
index fbeafd0bacb75..cf098cdd7a49a 100644
--- a/libc/test/UnitTest/LibcTest.h
+++ b/libc/test/UnitTest/LibcTest.h
@@ -30,6 +30,7 @@
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/c_string.h"
+#include "src/__support/macros/properties/compiler.h"
 #include "test/UnitTest/ExecuteFunction.h"
 #include "test/UnitTest/TestLogger.h"
 
@@ -260,7 +261,11 @@ constexpr char const *GetPrettyFunctionParamType(char const *str) {
 // This function recovers ParamType at compile time by using __PRETTY_FUNCTION__
 // It can be customized by using the REGISTER_TYPE_NAME macro below.
 template <typename ParamType> static constexpr const char *GetTypeName() {
+#ifdef LIBC_COMPILER_IS_MSVC
+  return GetPrettyFunctionParamType(__FUNCSIG__);
+#else
   return GetPrettyFunctionParamType(__PRETTY_FUNCTION__);
+#endif // LIBC_COMPILER_IS_MSVC
 }
 
 template <typename T>
diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index 24baaf1983a08..318397615d0e3 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -62,6 +62,7 @@ libc_test_library(
         "//libc:__support_libc_errno",
         "//libc:__support_macros_config",
         "//libc:__support_macros_properties_architectures",
+        "//libc:__support_macros_properties_compiler",
         "//libc:__support_macros_properties_types",
         "//libc:__support_stringutil",
         "//libc:__support_uint128",

>From 0ca54d7738103f5ff352f7194b34a11aa4d5aea0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 18:54:48 -0700
Subject: [PATCH 27/39] [LegalizeTypes] Use getShiftAmountConstant in
 SplitInteger. (#158392)

This function contained old code for handling the case that the type
returned by getScalarShiftAmountTy can't hold the shift amount.

These days this is handled by getShiftAmountTy which is used by
getShiftAmountConstant.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 83fade45d1892..cc0fd7993916c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1026,14 +1026,9 @@ void DAGTypeLegalizer::SplitInteger(SDValue Op,
   assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
          Op.getValueSizeInBits() && "Invalid integer splitting!");
   Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
-  unsigned ReqShiftAmountInBits =
-      Log2_32_Ceil(Op.getValueType().getSizeInBits());
-  MVT ShiftAmountTy =
-      TLI.getScalarShiftAmountTy(DAG.getDataLayout(), Op.getValueType());
-  if (ReqShiftAmountInBits > ShiftAmountTy.getSizeInBits())
-    ShiftAmountTy = MVT::getIntegerVT(NextPowerOf2(ReqShiftAmountInBits));
-  Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
-                   DAG.getConstant(LoVT.getSizeInBits(), dl, ShiftAmountTy));
+  Hi = DAG.getNode(
+      ISD::SRL, dl, Op.getValueType(), Op,
+      DAG.getShiftAmountConstant(LoVT.getSizeInBits(), Op.getValueType(), dl));
   Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
 }
 

>From d7f67d0e3519892e589d425df2ed92a0ecc7cf3d Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Sat, 13 Sep 2025 07:40:58 +0530
Subject: [PATCH 28/39] Formatting changes

---
 clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
index 271820e4e5f25..704ddc37e642e 100644
--- a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
+++ b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
@@ -174,7 +174,7 @@ TEST_F(InterpreterTestBase, SanityWithRemoteExecution) {
 
   std::string OrcRuntimePath = getOrcRuntimePath();
   std::string ExecutorPath = getExecutorPath();
-  
+
   if (!llvm::sys::fs::exists(OrcRuntimePath) ||
       !llvm::sys::fs::exists(ExecutorPath))
     GTEST_SKIP();

>From 004f209199d53a0c7a00ca7af4446407da4c9fb1 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Fri, 12 Sep 2025 19:18:58 -0700
Subject: [PATCH 29/39] [CodeGen][CFI] Generalize transparent union parameters
 (#158193)

According to the GCC documentation, the calling convention of a
transparent union is the same as that of the type of the
first member of the union.

C++ ignores the attribute.

Note: this does not generalize the parameters of function-pointer arguments.
That is unnecessary with pointer generalization.
It will be fixed in a follow-up patch.

---------

Co-authored-by: lntue <lntue at google.com>
---
 clang/lib/CodeGen/CodeGenModule.cpp       | 15 ++++++++++++++-
 clang/test/CodeGen/cfi-icall-generalize.c |  8 ++++----
 clang/test/CodeGen/cfi-icall-normalize2.c |  4 ++--
 clang/test/CodeGen/kcfi-generalize.c      |  9 ++++-----
 clang/test/CodeGen/kcfi-normalize.c       | 11 ++++++-----
 5 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d25ce3165bd79..0ebab141b187d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2339,13 +2339,26 @@ llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) {
   return llvm::ConstantInt::get(Int64Ty, llvm::MD5Hash(MDS->getString()));
 }
 
+static QualType GeneralizeTransparentUnion(QualType Ty) {
+  const RecordType *UT = Ty->getAsUnionType();
+  if (!UT)
+    return Ty;
+  const RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf();
+  if (!UD->hasAttr<TransparentUnionAttr>())
+    return Ty;
+  for (const auto *it : UD->fields()) {
+    return it->getType();
+  }
+  return Ty;
+}
+
 // If `GeneralizePointers` is true, generalizes types to a void pointer with the
 // qualifiers of the originally pointed-to type, e.g. 'const char *' and 'char *
 // const *' generalize to 'const void *' while 'char *' and 'const char **'
 // generalize to 'void *'.
 static QualType GeneralizeType(ASTContext &Ctx, QualType Ty,
                                bool GeneralizePointers) {
-  // TODO: Add other generalizations.
+  Ty = GeneralizeTransparentUnion(Ty);
 
   if (!GeneralizePointers || !Ty->isPointerType())
     return Ty;
diff --git a/clang/test/CodeGen/cfi-icall-generalize.c b/clang/test/CodeGen/cfi-icall-generalize.c
index 46d38511ba6b6..5995540ba33fb 100644
--- a/clang/test/CodeGen/cfi-icall-generalize.c
+++ b/clang/test/CodeGen/cfi-icall-generalize.c
@@ -22,14 +22,14 @@ union Union {
 
 // CHECK: define{{.*}} void @uni({{.*}} !type [[TYPE2:![0-9]+]] !type [[TYPE2_GENERALIZED:![0-9]+]]
 void uni(void (*fn)(union Union), union Union arg1) {
-  // UNGENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFv5UnionE")
-  // GENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFv5UnionE.generalized")
+  // UNGENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFvPcE")
+  // GENERALIZED: call i1 @llvm.type.test(ptr {{.*}}, metadata !"_ZTSFvPvE.generalized")
     fn(arg1);
 }
 
 // CHECK: [[TYPE]] = !{i64 0, !"_ZTSFPPiPKcPS2_E"}
 // CHECK: [[TYPE_GENERALIZED]] = !{i64 0, !"_ZTSFPvPKvS_E.generalized"}
 
-// CHECK: [[TYPE2]] = !{i64 0, !"_ZTSFvPFv5UnionES_E"}
-// CHECK: [[TYPE2_GENERALIZED]] = !{i64 0, !"_ZTSFvPv5UnionE.generalized"}
+// CHECK: [[TYPE2]] = !{i64 0, !"_ZTSFvPFv5UnionEPcE"}
+// CHECK: [[TYPE2_GENERALIZED]] = !{i64 0, !"_ZTSFvPvS_E.generalized"}
 
diff --git a/clang/test/CodeGen/cfi-icall-normalize2.c b/clang/test/CodeGen/cfi-icall-normalize2.c
index 5e457dc97f0a2..9fa6f95e523d7 100644
--- a/clang/test/CodeGen/cfi-icall-normalize2.c
+++ b/clang/test/CodeGen/cfi-icall-normalize2.c
@@ -32,12 +32,12 @@ union Union {
 void uni(void (*fn)(union Union), union Union arg1) {
     // CHECK-LABEL: define{{.*}}uni
     // CHECK-SAME: {{.*}}!type ![[TYPE4:[0-9]+]] !type !{{[0-9]+}}
-    // CHECK: call i1 @llvm.type.test({{i8\*|ptr}} {{%f|%0}}, metadata !"_ZTSFv5UnionE.normalized")
+    // CHECK: call i1 @llvm.type.test({{i8\*|ptr}} {{%f|%0}}, metadata !"_ZTSFvPu2i8E.normalized")
     fn(arg1);
 }
 
 // CHECK: ![[TYPE1]] = !{i64 0, !"_ZTSFvPFvu3i32ES_E.normalized"}
 // CHECK: ![[TYPE2]] = !{i64 0, !"_ZTSFvPFvu3i32S_ES_S_E.normalized"}
 // CHECK: ![[TYPE3]] = !{i64 0, !"_ZTSFvPFvu3i32S_S_ES_S_S_E.normalized"}
-// CHECK: ![[TYPE4]] = !{i64 0, !"_ZTSFvPFv5UnionES_E.normalized"}
+// CHECK: ![[TYPE4]] = !{i64 0, !"_ZTSFvPFv5UnionEPu2i8E.normalized"}
 
diff --git a/clang/test/CodeGen/kcfi-generalize.c b/clang/test/CodeGen/kcfi-generalize.c
index 864cdb8c2e092..5a44d97412af9 100644
--- a/clang/test/CodeGen/kcfi-generalize.c
+++ b/clang/test/CodeGen/kcfi-generalize.c
@@ -33,8 +33,8 @@ union Union {
 
 // CHECK: define{{.*}} void @uni({{.*}} !kcfi_type [[TYPE4:![0-9]+]]
 void uni(void (*fn)(union Union), union Union arg1) {
-  // UNGENERALIZED: call {{.*}} [ "kcfi"(i32 -1037059548) ]
-  // GENERALIZED: call {{.*}} [ "kcfi"(i32 422130955) ]
+  // UNGENERALIZED: call {{.*}} [ "kcfi"(i32 -587217045) ]
+  // GENERALIZED: call {{.*}} [ "kcfi"(i32 2139530422) ]
     fn(arg1);
 }
 
@@ -44,6 +44,5 @@ void uni(void (*fn)(union Union), union Union arg1) {
 // UNGENERALIZED: [[TYPE3]] = !{i32 874141567}
 // GENERALIZED: [[TYPE3]] = !{i32 954385378}
 
-// UNGENERALIZED: [[TYPE4]] = !{i32 981319178}
-// GENERALIZED: [[TYPE4]] = !{i32 -1599950473}
-
+// UNGENERALIZED: [[TYPE4]] = !{i32 -1619636625}
+// GENERALIZED: [[TYPE4]] = !{i32 -125078496}
diff --git a/clang/test/CodeGen/kcfi-normalize.c b/clang/test/CodeGen/kcfi-normalize.c
index 9291ff8529b31..bd87f4af534a1 100644
--- a/clang/test/CodeGen/kcfi-normalize.c
+++ b/clang/test/CodeGen/kcfi-normalize.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -x c++ -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -o - %s | FileCheck %s --check-prefixes=CHECK,C
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers -x c++ -o - %s | FileCheck %s --check-prefixes=CHECK,CPP
 #if !__has_feature(kcfi)
 #error Missing kcfi?
 #endif
@@ -36,7 +36,8 @@ union Union {
 void uni(void (*fn)(union Union), union Union arg1) {
     // CHECK-LABEL: define{{.*}}uni
     // CHECK-SAME: {{.*}}!kcfi_type ![[TYPE4:[0-9]+]]
-    // CHECK: call void %0(ptr %1) [ "kcfi"(i32 -1430221633) ]
+    // C: call void %0(ptr %1) [ "kcfi"(i32 1819770848) ]
+    // CPP: call void %0(ptr %1) [ "kcfi"(i32 -1430221633) ]
     fn(arg1);
 }
 
@@ -44,5 +45,5 @@ void uni(void (*fn)(union Union), union Union arg1) {
 // CHECK: ![[TYPE1]] = !{i32 -1143117868}
 // CHECK: ![[TYPE2]] = !{i32 -460921415}
 // CHECK: ![[TYPE3]] = !{i32 -333839615}
-// CHECK: ![[TYPE4]] = !{i32 1766237188}
-
+// C: ![[TYPE4]] = !{i32 -650530463}
+// CPP: ![[TYPE4]] = !{i32 1766237188}

>From 4ebd2023291d47402ecd170864df9ea541ea33ba Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 19:49:29 -0700
Subject: [PATCH 30/39] [LegalizeTypes][X86] Use getShiftAmountConstant in
 ExpandIntRes_SIGN_EXTEND. (#158388)

This ensures we don't need to fixup the shift amount later.

Unfortunately, this enabled the
(SRA (SHL X, ShlConst), SraConst) -> (SRA (sext_in_reg X), SraConst -
ShlConst) combine in combineShiftRightArithmetic for some cases in
is_fpclass-fp80.ll. So we need to also update checkSignTestSetCCCombine
to look through sign_extend_inreg to prevent a regression.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 9 ++++-----
 llvm/lib/Target/X86/X86ISelLowering.cpp                | 8 ++++++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 87570e6f44a6f..5967b4eb3769a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5088,9 +5088,8 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
     Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
     // The high part is obtained by SRA'ing all but one of the bits of low part.
     unsigned LoSize = NVT.getSizeInBits();
-    Hi = DAG.getNode(
-        ISD::SRA, dl, NVT, Lo,
-        DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout())));
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                     DAG.getShiftAmountConstant(LoSize - 1, NVT, dl));
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
@@ -5123,8 +5122,8 @@ ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
     // The high part gets the sign extension from the lo-part.  This handles
     // things like sextinreg V:i64 from i8.
     Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
-                     DAG.getConstant(Hi.getValueSizeInBits() - 1, dl,
-                                     TLI.getPointerTy(DAG.getDataLayout())));
+                     DAG.getShiftAmountConstant(Hi.getValueSizeInBits() - 1,
+                                                Hi.getValueType(), dl));
   } else {
     // For example, extension of an i48 to an i64.  Leave the low part alone,
     // sext_inreg the high part.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3631016b0f5c7..eeb5eb8a262de 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48396,13 +48396,17 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
   MVT SrcVT = Src.getSimpleValueType();
   APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
 
-  // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
-  // peek through and adjust the TEST bit.
+  // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
+  // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
   if (Src.getOpcode() == ISD::SHL) {
     if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
       Src = Src.getOperand(0);
       BitMask.lshrInPlace(*ShiftAmt);
     }
+  } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
+    Src = Src.getOperand(0);
+    BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
   }
 
   SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,

>From 4cbf4408e7d27786490bae933e45e1c3fe2011ec Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 12 Sep 2025 19:49:48 -0700
Subject: [PATCH 31/39] [SelectionDAG] Use getShiftAmountConstant. (#158395)

Many of the shifts in LegalizeIntegerTypes.cpp were using getPointerTy.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 67 +++++++------------
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 44 ++++++------
 .../CodeGen/SelectionDAG/LegalizeTypes.cpp    |  3 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  4 +-
 4 files changed, 46 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bcfc2c5dc9f83..5fb7e63cfb605 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -585,8 +585,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl);
       Hi = DAG.getNode(
           ISD::SRL, dl, Value.getValueType(), Value,
-          DAG.getConstant(RoundWidth, dl,
-                          TLI.getShiftAmountTy(Value.getValueType(), DL)));
+          DAG.getShiftAmountConstant(RoundWidth, Value.getValueType(), dl));
       Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                              ExtraVT, ST->getBaseAlign(), MMOFlags, AAInfo);
@@ -596,8 +595,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
       // Store the top RoundWidth bits.
       Hi = DAG.getNode(
           ISD::SRL, dl, Value.getValueType(), Value,
-          DAG.getConstant(ExtraWidth, dl,
-                          TLI.getShiftAmountTy(Value.getValueType(), DL)));
+          DAG.getShiftAmountConstant(ExtraWidth, Value.getValueType(), dl));
       Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT,
                              ST->getBaseAlign(), MMOFlags, AAInfo);
 
@@ -816,8 +814,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       // Move the top bits to the right place.
       Hi = DAG.getNode(
           ISD::SHL, dl, Hi.getValueType(), Hi,
-          DAG.getConstant(RoundWidth, dl,
-                          TLI.getShiftAmountTy(Hi.getValueType(), DL)));
+          DAG.getShiftAmountConstant(RoundWidth, Hi.getValueType(), dl));
 
       // Join the hi and lo parts.
       Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
@@ -845,8 +842,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       // Move the top bits to the right place.
       Hi = DAG.getNode(
           ISD::SHL, dl, Hi.getValueType(), Hi,
-          DAG.getConstant(ExtraWidth, dl,
-                          TLI.getShiftAmountTy(Hi.getValueType(), DL)));
+          DAG.getShiftAmountConstant(ExtraWidth, Hi.getValueType(), dl));
 
       // Join the hi and lo parts.
       Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
@@ -2767,8 +2763,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
     SDValue SignBitTest = DAG.getSetCC(
         dl, SetCCVT, Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
 
-    EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout());
-    SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
+    SDValue ShiftConst = DAG.getShiftAmountConstant(1, SrcVT, dl);
     SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
     SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
     SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
@@ -3350,10 +3345,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     } else {
       Op = DAG.getAnyExtOrTrunc(Op, dl, MVT::i32);
     }
-    Op = DAG.getNode(
-        ISD::SHL, dl, MVT::i32, Op,
-        DAG.getConstant(16, dl,
-                        TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+    Op = DAG.getNode(ISD::SHL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(16, MVT::i32, dl));
     Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op);
     // Add fp_extend in case the output is bigger than f32.
     if (Node->getValueType(0) != MVT::f32)
@@ -3370,10 +3363,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if (!DAG.isKnownNeverSNaN(Op)) {
       Op = DAG.getNode(ISD::FCANONICALIZE, dl, MVT::f32, Op, Node->getFlags());
     }
-    Op = DAG.getNode(
-        ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op),
-        DAG.getConstant(16, dl,
-                        TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32,
+                     DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op),
+                     DAG.getShiftAmountConstant(16, MVT::i32, dl));
     // The result of this node can be bf16 or an integer type in case bf16 is
     // not supported on the target and was softened to i16 for storage.
     if (Node->getValueType(0) == MVT::bf16) {
@@ -3431,13 +3423,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
 
     // NOTE: we could fall back on load/store here too for targets without
     // SRA.  However, it is doubtful that any exist.
-    EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
     unsigned BitsDiff = VT.getScalarSizeInBits() -
                         ExtraVT.getScalarSizeInBits();
-    SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy);
-    Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
-                       Node->getOperand(0), ShiftCst);
-    Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
+    SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl);
+    Tmp1 = DAG.getNode(ISD::SHL, dl, VT, Node->getOperand(0), ShiftCst);
+    Tmp1 = DAG.getNode(ISD::SRA, dl, VT, Tmp1, ShiftCst);
     Results.push_back(Tmp1);
     break;
   }
@@ -3666,11 +3656,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     EVT OpTy = Node->getOperand(0).getValueType();
     if (Node->getConstantOperandVal(1)) {
       // 1 -> Hi
-      Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
-                         DAG.getConstant(OpTy.getSizeInBits() / 2, dl,
-                                         TLI.getShiftAmountTy(
-                                             Node->getOperand(0).getValueType(),
-                                             DAG.getDataLayout())));
+      Tmp1 = DAG.getNode(
+          ISD::SRL, dl, OpTy, Node->getOperand(0),
+          DAG.getShiftAmountConstant(OpTy.getSizeInBits() / 2, OpTy, dl));
       Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
     } else {
       // 0 -> Lo
@@ -3950,9 +3938,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       for (unsigned i = 0; i < 2; ++i) {
         SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]);
         SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]);
-        SDValue Shift = DAG.getConstant(
-            HalfType.getScalarSizeInBits(), dl,
-            TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
+        SDValue Shift =
+            DAG.getShiftAmountConstant(HalfType.getScalarSizeInBits(), VT, dl);
         Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
         Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
       }
@@ -3999,8 +3986,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
       Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
       SDValue Shift =
-          DAG.getConstant(HalfType.getSizeInBits(), dl,
-                          TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
+          DAG.getShiftAmountConstant(HalfType.getSizeInBits(), VT, dl);
       Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
       Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
     }
@@ -4130,8 +4116,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
     Tmp2 = DAG.getNode(
         ISD::SHL, dl, PairTy, Tmp2,
-        DAG.getConstant(PairTy.getSizeInBits() / 2, dl,
-                        TLI.getShiftAmountTy(PairTy, DAG.getDataLayout())));
+        DAG.getShiftAmountConstant(PairTy.getSizeInBits() / 2, PairTy, dl));
     Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
     break;
   }
@@ -5368,10 +5353,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
     Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
     Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
-    Tmp1 = DAG.getNode(
-        ISD::SRL, dl, NVT, Tmp1,
-        DAG.getConstant(DiffBits, dl,
-                        TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1,
+                       DAG.getShiftAmountConstant(DiffBits, NVT, dl));
 
     Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
     break;
@@ -5483,11 +5466,9 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
     Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2);
 
-    auto &DL = DAG.getDataLayout();
     unsigned OriginalSize = OVT.getScalarSizeInBits();
-    Tmp2 = DAG.getNode(
-        ISD::SRL, dl, NVT, Tmp1,
-        DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT)));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1,
+                       DAG.getShiftAmountConstant(OriginalSize, NVT, dl));
     Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
     Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2));
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 5967b4eb3769a..354aeff0c60ea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1938,9 +1938,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
   for (unsigned i = 1; i < NumRegs; ++i) {
     SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
     // Shift it to the right position and "or" it in.
-    Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
-                       DAG.getConstant(i * RegVT.getSizeInBits(), dl,
-                                       TLI.getPointerTy(DAG.getDataLayout())));
+    Part = DAG.getNode(
+        ISD::SHL, dl, NVT, Part,
+        DAG.getShiftAmountConstant(i * RegVT.getSizeInBits(), NVT, dl));
     Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
   }
 
@@ -2293,9 +2293,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
   assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
   SDLoc dl(N);
 
-  Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
-                   DAG.getConstant(OVT.getSizeInBits(), dl,
-                                   TLI.getPointerTy(DAG.getDataLayout())));
+  Hi = DAG.getNode(
+      ISD::SHL, dl, N->getValueType(0), Hi,
+      DAG.getShiftAmountConstant(OVT.getSizeInBits(), N->getValueType(0), dl));
   return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
 }
 
@@ -3943,8 +3943,7 @@ void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
     Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
     // The high part replicates the sign bit of Lo, make it explicit.
     Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
-                     DAG.getConstant(NVTBits - 1, dl,
-                                     TLI.getPointerTy(DAG.getDataLayout())));
+                     DAG.getShiftAmountConstant(NVTBits - 1, NVT, dl));
   }
 }
 
@@ -4329,8 +4328,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
       // lo part.
       unsigned LoSize = Lo.getValueSizeInBits();
       Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
-                       DAG.getConstant(LoSize - 1, dl,
-                                       TLI.getPointerTy(DAG.getDataLayout())));
+                       DAG.getShiftAmountConstant(LoSize - 1, NVT, dl));
     } else if (ExtType == ISD::ZEXTLOAD) {
       // The high part is just a zero.
       Hi = DAG.getConstant(0, dl, NVT);
@@ -4391,13 +4389,12 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
       Lo = DAG.getNode(
           ISD::OR, dl, NVT, Lo,
           DAG.getNode(ISD::SHL, dl, NVT, Hi,
-                      DAG.getConstant(ExcessBits, dl,
-                                      TLI.getPointerTy(DAG.getDataLayout()))));
+                      DAG.getShiftAmountConstant(ExcessBits, NVT, dl)));
       // Move high bits to the right position in Hi.
       Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT,
                        Hi,
-                       DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
-                                       TLI.getPointerTy(DAG.getDataLayout())));
+                       DAG.getShiftAmountConstant(
+                           NVT.getSizeInBits() - ExcessBits, NVT, dl));
     }
   }
 
@@ -5165,12 +5162,12 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
 void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
                                              SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
   SDLoc dl(N);
-  Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
-  Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(),
-                   N->getOperand(0),
-                   DAG.getConstant(NVT.getSizeInBits(), dl,
-                                   TLI.getPointerTy(DAG.getDataLayout())));
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, InOp);
+  Hi = DAG.getNode(ISD::SRL, dl, InVT, InOp,
+                   DAG.getShiftAmountConstant(NVT.getSizeInBits(), InVT, dl));
   Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
 }
 
@@ -5928,14 +5925,13 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
 
   if (ExcessBits < NVT.getSizeInBits()) {
     // Transfer high bits from the top of Lo to the bottom of Hi.
-    Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
-                     DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
-                                     TLI.getPointerTy(DAG.getDataLayout())));
+    Hi = DAG.getNode(
+        ISD::SHL, dl, NVT, Hi,
+        DAG.getShiftAmountConstant(NVT.getSizeInBits() - ExcessBits, NVT, dl));
     Hi = DAG.getNode(
         ISD::OR, dl, NVT, Hi,
         DAG.getNode(ISD::SRL, dl, NVT, Lo,
-                    DAG.getConstant(ExcessBits, dl,
-                                    TLI.getPointerTy(DAG.getDataLayout()))));
+                    DAG.getShiftAmountConstant(ExcessBits, NVT, dl)));
   }
 
   // Store both the high bits and maybe some of the low bits.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cc0fd7993916c..f14eeda639e71 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1001,11 +1001,10 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
   EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
                               LVT.getSizeInBits() + HVT.getSizeInBits());
 
-  EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
   Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
   Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
   Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
-                   DAG.getConstant(LVT.getSizeInBits(), dlHi, ShiftAmtVT));
+                   DAG.getShiftAmountConstant(LVT.getSizeInBits(), NVT, dlHi));
   return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 118fd8418f787..ff7cd665446cc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5945,10 +5945,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
       // interesting bits will end up at the wrong place.
       if (DAG.getDataLayout().isBigEndian()) {
         unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits();
-        EVT ShiftAmtTy = TLI.getShiftAmountTy(NInVT, DAG.getDataLayout());
-        assert(ShiftAmt < WidenVT.getSizeInBits() && "Too large shift amount!");
         NInOp = DAG.getNode(ISD::SHL, dl, NInVT, NInOp,
-                           DAG.getConstant(ShiftAmt, dl, ShiftAmtTy));
+                            DAG.getShiftAmountConstant(ShiftAmt, NInVT, dl));
       }
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, NInOp);
     }

>From 261000760fba7ab353962fbc1a74c194acd3e097 Mon Sep 17 00:00:00 2001
From: Med Ismail Bennani <ismail at bennani.ma>
Date: Fri, 12 Sep 2025 20:03:36 -0700
Subject: [PATCH 32/39] [lldb/docs] Breakdown python reference into multiple
 files (#158331)

This patch improves the LLDB website documentation readability and
discoverability by breaking down the very long python-reference page
into multiple subpages, each explaining a specific topic.

The long term goal is to have tutorials for every scripting extension.

This also converts the pages to markdown, since it's easier to write.

Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
---
 lldb/docs/.htaccess                           |    1 +
 lldb/docs/use/python-reference.rst            | 1141 +----------------
 lldb/docs/use/python.rst                      |  799 ------------
 .../use/tutorials/accessing-documentation.md  |   62 +
 .../tutorials/automating-stepping-logic.md    |   42 +
 .../tutorials/breakpoint-triggered-scripts.md |   85 ++
 .../tutorials/creating-custom-breakpoints.md  |  128 ++
 .../use/tutorials/custom-frame-recognizers.md |   51 +
 .../tutorials/extending-target-stop-hooks.md  |   25 +
 .../implementing-standalone-scripts.md        |  134 ++
 .../tutorials/python-embedded-interpreter.md  |   66 +
 .../use/tutorials/script-driven-debugging.md  |  492 +++++++
 .../use/tutorials/writing-custom-commands.md  |  429 +++++++
 13 files changed, 1533 insertions(+), 1922 deletions(-)
 delete mode 100644 lldb/docs/use/python.rst
 create mode 100644 lldb/docs/use/tutorials/accessing-documentation.md
 create mode 100644 lldb/docs/use/tutorials/automating-stepping-logic.md
 create mode 100644 lldb/docs/use/tutorials/breakpoint-triggered-scripts.md
 create mode 100644 lldb/docs/use/tutorials/creating-custom-breakpoints.md
 create mode 100644 lldb/docs/use/tutorials/custom-frame-recognizers.md
 create mode 100644 lldb/docs/use/tutorials/extending-target-stop-hooks.md
 create mode 100644 lldb/docs/use/tutorials/implementing-standalone-scripts.md
 create mode 100644 lldb/docs/use/tutorials/python-embedded-interpreter.md
 create mode 100644 lldb/docs/use/tutorials/script-driven-debugging.md
 create mode 100644 lldb/docs/use/tutorials/writing-custom-commands.md

diff --git a/lldb/docs/.htaccess b/lldb/docs/.htaccess
index f094bd6ebc783..34e7fcb8f5516 100644
--- a/lldb/docs/.htaccess
+++ b/lldb/docs/.htaccess
@@ -19,6 +19,7 @@ Redirect 301 /resources/architecture.html https://lldb.llvm.org/resources/overvi
 Redirect 301 /design/sbapi.html https://lldb.llvm.org/resources/sbapi.html
 Redirect 301 /design/overview.html https://lldb.llvm.org/resources/overview.html
 Redirect 301 /use/extensions.html https://lldb.llvm.org/resources/extensions.html
+Redirect 301 /use/python.html https://lldb.llvm.org/use/tutorials/script-driven-debugging.html
 Redirect 301 /resources/bots.html https://lldb.llvm.org/resources/test.html
 
 # Redirect old Python API to new Python API.
diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst
index 4292714c9c208..6ac2ec93fbd1f 100644
--- a/lldb/docs/use/python-reference.rst
+++ b/lldb/docs/use/python-reference.rst
@@ -10,1126 +10,21 @@ command interpreter (we refer to this for brevity as the embedded interpreter).
 Of course, in this context it has full access to the LLDB API - with some
 additional conveniences we will call out in the FAQ.
 
-Documentation
---------------
-
-The LLDB API is contained in a python module named lldb. A useful resource when
-writing Python extensions is the lldb Python classes reference guide.
-
-The documentation is also accessible in an interactive debugger session with
-the following command:
-
-::
-
-   (lldb) script help(lldb)
-      Help on package lldb:
-
-      NAME
-         lldb - The lldb module contains the public APIs for Python binding.
-
-      FILE
-         /System/Library/PrivateFrameworks/LLDB.framework/Versions/A/Resources/Python/lldb/__init__.py
-
-      DESCRIPTION
-   ...
-
-You can also get help using a module class name. The full API that is exposed
-for that class will be displayed in a man page style window. Below we want to
-get help on the lldb.SBFrame class:
-
-::
-
-   (lldb) script help(lldb.SBFrame)
-      Help on class SBFrame in module lldb:
-
-      class SBFrame(__builtin__.object)
-      |  Represents one of the stack frames associated with a thread.
-      |  SBThread contains SBFrame(s). For example (from test/lldbutil.py),
-      |
-      |  def print_stacktrace(thread, string_buffer = False):
-      |      '''Prints a simple stack trace of this thread.'''
-      |
-   ...
-
-Or you can get help using any python object, here we use the lldb.process
-object which is a global variable in the lldb module which represents the
-currently selected process:
-
-::
-
-   (lldb) script help(lldb.process)
-      Help on SBProcess in module lldb object:
-
-      class SBProcess(__builtin__.object)
-      |  Represents the process associated with the target program.
-      |
-      |  SBProcess supports thread iteration. For example (from test/lldbutil.py),
-      |
-      |  # ==================================================
-      |  # Utility functions related to Threads and Processes
-      |  # ==================================================
-      |
-   ...
-
-Embedded Python Interpreter
----------------------------
-
-The embedded python interpreter can be accessed in a variety of ways from
-within LLDB. The easiest way is to use the lldb command script with no
-arguments at the lldb command prompt:
-
-::
-
-   (lldb) script
-   Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
-   >>> 2+3
-   5
-   >>> hex(12345)
-   '0x3039'
-   >>>
-
-This drops you into the embedded python interpreter. When running under the
-script command, lldb sets some convenience variables that give you quick access
-to the currently selected entities that characterize the program and debugger
-state. In each case, if there is no currently selected entity of the
-appropriate type, the variable's IsValid method will return false. These
-variables are:
-
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-| Variable          | Type                | Equivalent                          | Description                                                                         |
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-| ``lldb.debugger`` | `lldb.SBDebugger`   | `SBTarget.GetDebugger`              | Contains the debugger object whose ``script`` command was invoked.                  |
-|                   |                     |                                     | The `lldb.SBDebugger` object owns the command interpreter                           |
-|                   |                     |                                     | and all the targets in your debug session.  There will always be a                  |
-|                   |                     |                                     | Debugger in the embedded interpreter.                                               |
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-| ``lldb.target``   | `lldb.SBTarget`     | `SBDebugger.GetSelectedTarget`      | Contains the currently selected target - for instance the one made with the         |
-|                   |                     |                                     | ``file`` or selected by the ``target select <target-index>`` command.               |
-|                   |                     | `SBProcess.GetTarget`               | The `lldb.SBTarget` manages one running process, and all the executable             |
-|                   |                     |                                     | and debug files for the process.                                                    |
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-| ``lldb.process``  | `lldb.SBProcess`    | `SBTarget.GetProcess`               | Contains the process of the currently selected target.                              |
-|                   |                     |                                     | The `lldb.SBProcess` object manages the threads and allows access to                |
-|                   |                     | `SBThread.GetProcess`               | memory for the process.                                                             |
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-| ``lldb.thread``   | `lldb.SBThread`     | `SBProcess.GetSelectedThread`       | Contains the currently selected thread.                                             |
-|                   |                     |                                     | The `lldb.SBThread` object manages the stack frames in that thread.                 |
-|                   |                     | `SBFrame.GetThread`                 | A thread is always selected in the command interpreter when a target stops.         |
-|                   |                     |                                     | The ``thread select <thread-index>`` command can be used to change the              |
-|                   |                     |                                     | currently selected thread.  So as long as you have a stopped process, there will be |
-|                   |                     |                                     | some selected thread.                                                               |
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-| ``lldb.frame``    | `lldb.SBFrame`      | `SBThread.GetSelectedFrame`         | Contains the currently selected stack frame.                                        |
-|                   |                     |                                     | The `lldb.SBFrame` object manage the stack locals and the register set for          |
-|                   |                     |                                     | that stack.                                                                         |
-|                   |                     |                                     | A stack frame is always selected in the command interpreter when a target stops.    |
-|                   |                     |                                     | The ``frame select <frame-index>`` command can be used to change the                |
-|                   |                     |                                     | currently selected frame.  So as long as you have a stopped process, there will     |
-|                   |                     |                                     | be some selected frame.                                                             |
-+-------------------+---------------------+-------------------------------------+-------------------------------------------------------------------------------------+
-
-While extremely convenient, these variables have a couple caveats that you
-should be aware of. First of all, they hold the values of the selected objects
-on entry to the embedded interpreter. They do not update as you use the LLDB
-API's to change, for example, the currently selected stack frame or thread.
-
-Moreover, they are only defined and meaningful while in the interactive Python
-interpreter. There is no guarantee on their value in any other situation, hence
-you should not use them when defining Python formatters, breakpoint scripts and
-commands (or any other Python extension point that LLDB provides). For the
-latter you'll be passed an `SBDebugger`, `SBTarget`, `SBProcess`, `SBThread` or
-`SBFrame` instance and you can use the functions from the "Equivalent" column
-to navigate between them.
-
-As a rationale for such behavior, consider that lldb can run in a multithreaded
-environment, and another thread might call the "script" command, changing the
-value out from under you.
-
-To get started with these objects and LLDB scripting, please note that almost
-all of the lldb Python objects are able to briefly describe themselves when you
-pass them to the Python print function:
-
-::
-
-   (lldb) script
-   Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
-   >>> print(lldb.debugger)
-   Debugger (instance: "debugger_1", id: 1)
-   >>> print(lldb.target)
-   a.out
-   >>> print(lldb.process)
-   SBProcess: pid = 58842, state = stopped, threads = 1, executable = a.out
-   >>> print(lldb.thread)
-   thread #1: tid = 0x2265ce3, 0x0000000100000334 a.out`main at t.c:2:3, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
-   >>> print(lldb.frame)
-   frame #0: 0x0000000100000334 a.out`main at t.c:2:3
-
-
-Running a python script when a breakpoint gets hit
---------------------------------------------------
-
-One very powerful use of the lldb Python API is to have a python script run
-when a breakpoint gets hit. Adding python scripts to breakpoints provides a way
-to create complex breakpoint conditions and also allows for smart logging and
-data gathering.
-
-When your process hits a breakpoint to which you have attached some python
-code, the code is executed as the body of a function which takes three
-arguments:
-
-::
-
-  def breakpoint_function_wrapper(frame, bp_loc, internal_dict):
-     # Your code goes here
-
-or:
-
-::
-
-  def breakpoint_function_wrapper(frame, bp_loc, extra_args, internal_dict):
-     # Your code goes here
-
-
-+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| Argument          | Type                          | Description                                                                                                                               |
-+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| ``frame``         | `lldb.SBFrame`                | The current stack frame where the breakpoint got hit.                                                                                     |
-|                   |                               | The object will always be valid.                                                                                                          |
-|                   |                               | This ``frame`` argument might *not* match the currently selected stack frame found in the `lldb` module global variable ``lldb.frame``.   |
-+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| ``bp_loc``        | `lldb.SBBreakpointLocation`   | The breakpoint location that just got hit. Breakpoints are represented by `lldb.SBBreakpoint`                                             |
-|                   |                               | objects. These breakpoint objects can have one or more locations. These locations                                                         |
-|                   |                               | are represented by `lldb.SBBreakpointLocation` objects.                                                                                   |
-+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| ``extra_args``    | `lldb.SBStructuredData`       | ``Optional`` If your breakpoint callback function takes this extra parameter, then when the callback gets added to a breakpoint, its      |
-|                   |                               | contents can parametrize this use of the callback.  For instance, instead of writing a callback that stops when the caller is "Foo",      |
-|                   |                               | you could take the function name from a field in the ``extra_args``, making the callback more general.  The ``-k`` and ``-v`` options     |
-|                   |                               | to ``breakpoint command add`` will be passed as a Dictionary in the ``extra_args`` parameter, or you can provide it with the SB API's.    |
-+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| ``internal_dict`` | ``dict``                      | The python session dictionary as a standard python dictionary object.                                                                     |
-+-------------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-
-Optionally, a Python breakpoint command can return a value. Returning False
-tells LLDB that you do not want to stop at the breakpoint. Any other return
-value (including None or leaving out the return statement altogether) is akin
-to telling LLDB to actually stop at the breakpoint. This can be useful in
-situations where a breakpoint only needs to stop the process when certain
-conditions are met, and you do not want to inspect the program state manually
-at every stop and then continue.
-
-An example will show how simple it is to write some python code and attach it
-to a breakpoint. The following example will allow you to track the order in
-which the functions in a given shared library are first executed during one run
-of your program. This is a simple method to gather an order file which can be
-used to optimize function placement within a binary for execution locality.
-
-We do this by setting a regular expression breakpoint that will match every
-function in the shared library. The regular expression '.' will match any
-string that has at least one character in it, so we will use that. This will
-result in one lldb.SBBreakpoint object that contains an
-lldb.SBBreakpointLocation object for each function. As the breakpoint gets hit,
-we use a counter to track the order in which the function at this particular
-breakpoint location got hit. Since our code is passed the location that was
-hit, we can get the name of the function from the location, disable the
-location so we won't count this function again; then log some info and continue
-the process.
-
-Note we also have to initialize our counter, which we do with the simple
-one-line version of the script command.
-
-Here is the code:
-
-::
-
-   (lldb) breakpoint set --func-regex=. --shlib=libfoo.dylib
-   Breakpoint created: 1: regex = '.', module = libfoo.dylib, locations = 223
-   (lldb) script counter = 0
-   (lldb) breakpoint command add --script-type python 1
-   Enter your Python command(s). Type 'DONE' to end.
-   > # Increment our counter.  Since we are in a function, this must be a global python variable
-   > global counter
-   > counter += 1
-   > # Get the name of the function
-   > name = frame.GetFunctionName()
-   > # Print the order and the function name
-   > print('[%i] %s' % (counter, name))
-   > # Disable the current breakpoint location so it doesn't get hit again
-   > bp_loc.SetEnabled(False)
-   > # No need to stop here
-   > return False
-   > DONE
-
-The breakpoint command add command above attaches a python script to breakpoint 1. To remove the breakpoint command:
-
-::
-
-   (lldb) breakpoint command delete 1
-
-
-Using the Python API's to create custom breakpoints
----------------------------------------------------
-
-
-Another use of the Python API's in lldb is to create a custom breakpoint
-resolver. This facility was added in r342259.
-
-It allows you to provide the algorithm which will be used in the breakpoint's
-search of the space of the code in a given Target to determine where to set the
-breakpoint locations - the actual places where the breakpoint will trigger. To
-understand how this works you need to know a little about how lldb handles
-breakpoints.
-
-In lldb, a breakpoint is composed of three parts: the Searcher, the Resolver,
-and the Stop Options. The Searcher and Resolver cooperate to determine how
-breakpoint locations are set and differ between each breakpoint type. Stop
-options determine what happens when a location triggers and includes the
-commands, conditions, ignore counts, etc. Stop options are common between all
-breakpoint types, so for our purposes only the Searcher and Resolver are
-relevant.
-
-The Searcher's job is to traverse in a structured way the code in the current
-target. It proceeds from the Target, to search all the Modules in the Target,
-in each Module it can recurse into the Compile Units in that module, and within
-each Compile Unit it can recurse over the Functions it contains.
-
-The Searcher can be provided with a SearchFilter that it will use to restrict
-this search. For instance, if the SearchFilter specifies a list of Modules, the
-Searcher will not recurse into Modules that aren't on the list. When you pass
-the -s modulename flag to break set you are creating a Module-based search
-filter. When you pass -f filename.c to break set -n you are creating a file
-based search filter. If neither of these is specified, the breakpoint will have
-a no-op search filter, so all parts of the program are searched and all
-locations accepted.
-
-The Resolver has two functions. The most important one is the callback it
-provides. This will get called at the appropriate time in the course of the
-search. The callback is where the job of adding locations to the breakpoint
-gets done.
-
-The other function is specifying to the Searcher at what depth in the above
-described recursion it wants to be called. Setting a search depth also provides
-a stop for the recursion. For instance, if you request a Module depth search,
-then the callback will be called for each Module as it gets added to the
-Target, but the searcher will not recurse into the Compile Units in the module.
-
-One other slight subtlety is that the depth at which you get called back is not
-necessarily the depth at which the SearchFilter is specified. For instance,
-if you are doing symbol searches, it is convenient to use the Module depth for
-the search, since symbols are stored in the module. But the SearchFilter might
-specify some subset of CompileUnits, so not all the symbols you might find in
-each module will pass the search. You don't need to handle this situation
-yourself, since SBBreakpoint::AddLocation will only add locations that pass the
-Search Filter. This API returns an SBError to inform you whether your location
-was added.
-
-When the breakpoint is originally created, its Searcher will process all the
-currently loaded modules. The Searcher will also visit any new modules as they
-are added to the target. This happens, for instance, when a new shared library
-gets added to the target in the course of running, or on rerunning if any of
-the currently loaded modules have been changed. Note, in the latter case, all
-the locations set in the old module will get deleted and you will be asked to
-recreate them in the new version of the module when your callback gets called
-with that module. For this reason, you shouldn't try to manage the locations
-you add to the breakpoint yourself. Note that the Breakpoint takes care of
-deduplicating equal addresses in AddLocation, so you shouldn't need to worry
-about that anyway.
-
-At present, when adding a scripted Breakpoint type, you can only provide a
-custom Resolver, not a custom SearchFilter.
-
-The custom Resolver is provided as a Python class with the following methods:
-
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Name               | Arguments                             | Description                                                                                                      |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| ``__init__``       | ``bkpt``:`lldb.SBBreakpoint`          | This is the constructor for the new Resolver.                                                                    |
-|                    | ``extra_args``:`lldb.SBStructuredData`|                                                                                                                  |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | ``bkpt`` is the breakpoint owning this Resolver.                                                                 |
-|                    |                                       |                                                                                                                  |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | ``extra_args`` is an `SBStructuredData` object that the user can pass in when creating instances of this         |
-|                    |                                       | breakpoint.  It is not required, but is quite handy.  For instance if you were implementing a breakpoint on some |
-|                    |                                       | symbol name, you could write a generic symbol name based Resolver, and then allow the user to pass               |
-|                    |                                       | in the particular symbol in the extra_args                                                                       |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| ``__callback__``   | ``sym_ctx``:`lldb.SBSymbolContext`    | This is the Resolver callback.                                                                                   |
-|                    |                                       | The ``sym_ctx`` argument will be filled with the current stage                                                   |
-|                    |                                       | of the search.                                                                                                   |
-|                    |                                       |                                                                                                                  |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | For instance, if you asked for a search depth of lldb.eSearchDepthCompUnit, then the                             |
-|                    |                                       | target, module and compile_unit fields of the sym_ctx will be filled.  The callback should look just in the      |
-|                    |                                       | context passed in ``sym_ctx`` for new locations.  If the callback finds an address of interest, it               |
-|                    |                                       | can add it to the breakpoint with the `SBBreakpoint.AddLocation` method, using the breakpoint passed             |
-|                    |                                       | in to the ``__init__`` method.                                                                                   |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| ``__get_depth__``  | ``None``                              | Specify the depth at which you wish your callback to get called.  The currently supported options are:           |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | `lldb.eSearchDepthModule`                                                                                        |
-|                    |                                       | `lldb.eSearchDepthCompUnit`                                                                                      |
-|                    |                                       | `lldb.eSearchDepthFunction`                                                                                      |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | For instance, if you are looking                                                                                 |
-|                    |                                       | up symbols, which are stored at the Module level, you will want to get called back module by module.             |
-|                    |                                       | So you would want to return `lldb.eSearchDepthModule`.  This method is optional.  If not provided the search     |
-|                    |                                       | will be done at Module depth.                                                                                    |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| ``get_short_help`` | ``None``                              | This is an optional method.  If provided, the returned string will be printed at the beginning of                |
-|                    |                                       | the description for this breakpoint.                                                                             |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-
-To create a new breakpoint that uses this resolver class from the lldb command
-line, use the command:
-
-::
-
-  (lldb) breakpoint set -P MyModule.MyResolverClass
-
-You can also populate the extra_args SBStructuredData with a dictionary of
-key/value pairs with:
-
-::
-
-  (lldb) breakpoint set -P MyModule.MyResolverClass -k key_1 -v value_1 -k key_2 -v value_2
-
-Although you can't write a scripted SearchFilter, both the command line and the
-SB API's for adding a scripted resolver allow you to specify a SearchFilter
-restricted to certain modules or certain compile units. When using the command
-line to create the resolver, you can specify a Module specific SearchFilter by
-passing the -s ModuleName option - which can be specified multiple times. You
-can also specify a SearchFilter restricted to certain compile units by passing
-in the -f CompUnitName option. This can also be specified more than once. And
-you can mix the two to specify "this comp unit in this module". So, for
-instance,
-
-::
-
-  (lldb) breakpoint set -P MyModule.MyResolverClass -s a.out
-
-will use your resolver, but will only recurse into or accept new locations in
-the module a.out.
-
-Another option for creating scripted breakpoints is to use the
-SBTarget.BreakpointCreateFromScript API. This one has the advantage that you
-can pass in an arbitrary SBStructuredData object, so you can create more
-complex parametrizations. SBStructuredData has a handy SetFromJSON method which
-you can use for this purpose. Your __init__ function gets passed this
-SBStructuredData object. This API also allows you to directly provide the list
-of Modules and the list of CompileUnits that will make up the SearchFilter. If
-you pass in empty lists, the breakpoint will use the default "search
-everywhere, accept everything" filter.
-
-Using the Python API to create custom stepping logic
------------------------------------------------------
-
-A slightly esoteric use of the Python API's is to construct custom stepping
-types. LLDB's stepping is driven by a stack of "thread plans" and a fairly
-simple state machine that runs the plans. You can create a Python class that
-works as a thread plan, and responds to the requests the state machine makes to
-run its operations.
-
-There is a longer discussion of scripted thread plans and the state machine,
-and several interesting examples of their use in:
-
-https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/scripted_step.py
-
-And for a MUCH fuller discussion of the whole state machine, see:
-
-https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/ThreadPlan.h
-
-If you are reading those comments it is useful to know that scripted thread
-plans are set to be "ControllingPlans", and not "OkayToDiscard".
-
-To implement a scripted step, you define a python class that has the following
-methods:
-
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-| Name              | Arguments                          | Description                                                                           |
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-| ``__init__``      | ``thread_plan``:`lldb.SBThreadPlan`| This is the underlying `SBThreadPlan` that is pushed onto the plan stack.             |
-|                   |                                    | You will want to store this away in an ivar.  Also, if you are going to               |
-|                   |                                    | use one of the canned thread plans, you can queue it at this point.                   |
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-| ``explains_stop`` | ``event``: `lldb.SBEvent`          | Return True if this stop is part of your thread plan's logic, False otherwise.        |
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-| ``is_stale``      | ``None``                           | If your plan is no longer relevant (for instance, you were                            |
-|                   |                                    | stepping in a particular stack frame, but some other operation                        |
-|                   |                                    | pushed that frame off the stack) return True and your plan will                       |
-|                   |                                    | get popped.                                                                           |
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-| ``should_step``   | ``None``                           | Return ``True`` if you want lldb to instruction step one instruction,                 |
-|                   |                                    | or False to continue till the next breakpoint is hit.                                 |
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-| ``should_stop``   | ``event``: `lldb.SBEvent`          | If your plan wants to stop and return control to the user at this point, return True. |
-|                   |                                    | If your plan is done at this point, call SetPlanComplete on your                      |
-|                   |                                    | thread plan instance.                                                                 |
-|                   |                                    | Also, do any work you need here to set up the next stage of stepping.                 |
-+-------------------+------------------------------------+---------------------------------------------------------------------------------------+
-
-To use this class to implement a step, use the command:
-
-::
-
-  (lldb) thread step-scripted -C MyModule.MyStepPlanClass
-
-Or use the SBThread.StepUsingScriptedThreadPlan API. The SBThreadPlan passed
-into your __init__ function can also push several common plans (step
-in/out/over and run-to-address) in front of itself on the stack, which can be
-used to compose more complex stepping operations. When you use subsidiary plans
-your explains_stop and should_stop methods won't get called until the
-subsidiary plan is done, or the process stops for an event the subsidiary plan
-doesn't explain. For instance, step over plans don't explain a breakpoint hit
-while performing the step-over.
-
-
-Create a new lldb command using a Python function
--------------------------------------------------
-
-Python functions can be used to create new LLDB command interpreter commands,
-which will work like all the natively defined lldb commands. This provides a
-very flexible and easy way to extend LLDB to meet your debugging requirements.
-
-To write a python function that implements a new LLDB command define the
-function to take five arguments as follows:
-
-::
-
-  def command_function(debugger, command, exe_ctx, result, internal_dict):
-      # Your code goes here
-
-The meaning of the arguments is given in the table below.
-
-If you provide a Python docstring in your command function LLDB will use it
-when providing "long help" for your command, as in:
-
-::
-
-  def command_function(debugger, command, exe_ctx, result, internal_dict):
-      """This command takes a lot of options and does many fancy things"""
-      # Your code goes here
-
-though providing help can also be done programmatically (see below).
-
-Prior to lldb 3.5.2 (April 2015), LLDB Python command definitions didn't take the SBExecutionContext
-argument. So you may still see commands where the command definition is:
-
-::
-
-  def command_function(debugger, command, result, internal_dict):
-      # Your code goes here
-
-Using this form is strongly discouraged because it can only operate on the "currently selected"
-target, process, thread, frame.  The command will behave as expected when run
-directly on the command line.  But if the command is used in a stop-hook, breakpoint
-callback, etc. where the response to the callback determines whether we will select
-this or that particular process/frame/thread, the global "currently selected"
-entity is not necessarily the one the callback is meant to handle.  In that case, this
-command definition form can't do the right thing.
-
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-| Argument          | Type                           | Description                                                                                                                      |
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-| ``debugger``      | `lldb.SBDebugger`              | The current debugger object.                                                                                                     |
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-| ``command``       | ``python string``              | A python string containing all arguments for your command. If you need to chop up the arguments                                  |
-|                   |                                | try using the ``shlex`` module's ``shlex.split(command)`` to properly extract the                                                |
-|                   |                                | arguments.                                                                                                                       |
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-| ``exe_ctx``       | `lldb.SBExecutionContext`      | An execution context object carrying around information on the inferior process' context in which the command is expected to act |
-|                   |                                |                                                                                                                                  |
-|                   |                                | *Optional since lldb 3.5.2, unavailable before*                                                                                  |
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-| ``result``        | `lldb.SBCommandReturnObject`   | A return object which encapsulates success/failure information for the command and output text                                   |
-|                   |                                | that needs to be printed as a result of the command. The plain Python "print" command also works but                             |
-|                   |                                | text won't go in the result by default (it is useful as a temporary logging facility).                                           |
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-| ``internal_dict`` | ``python dict object``         | The dictionary for the current embedded script session which contains all variables                                              |
-|                   |                                | and functions.                                                                                                                   |
-+-------------------+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
-
-Since lldb 3.7, Python commands can also be implemented by means of a class
-which should implement the following interface:
-
-.. code-block:: python
-
-  class CommandObjectType:
-      def __init__(self, debugger, internal_dict):
-          this call should initialize the command with respect to the command interpreter for the passed-in debugger
-      def __call__(self, debugger, command, exe_ctx, result):
-          this is the actual bulk of the command, akin to Python command functions
-      def get_short_help(self):
-          this call should return the short help text for this command[1]
-      def get_long_help(self):
-          this call should return the long help text for this command[1]
-      def get_flags(self):
-          this will be called when the command is added to the command interpreter,
-          and should return a flag field made from or-ing together the appropriate
-          elements of the lldb.CommandFlags enum to specify the requirements of this command.
-          The CommandInterpreter will make sure all these requirements are met, and will
-          return the standard lldb error if they are not.[1]
-      def get_repeat_command(self, command):
-          The auto-repeat command is what will get executed when the user types just
-          a return at the next prompt after this command is run.  Even if your command
-          was run because it was specified as a repeat command, that invocation will still
-          get asked for ITS repeat command, so you can chain a series of repeats, for instance
-          to implement a pager.
-
-          The command argument is the command that is about to be executed.
-
-          If this call returns None, then the ordinary repeat mechanism will be used
-          If this call returns an empty string, then auto-repeat is disabled
-          If this call returns any other string, that will be the repeat command [1]
-
-[1] This method is optional.
-
-As a convenience, you can treat the result object as a Python file object, and
-say
-
-.. code-block:: python
-
-  print("my command does lots of cool stuff", file=result)
-
-SBCommandReturnObject and SBStream both support this file-like behavior by
-providing write() and flush() calls at the Python layer.
-
-The commands that are added using this class definition are what lldb calls
-"raw" commands.  The command interpreter doesn't attempt to parse the command
-or handle option values, and it generates neither help nor completions for
-them.  Raw commands are useful when the arguments passed to the command
-are unstructured, and having to protect them against lldb command parsing would
-be onerous.  For instance, "expr" is a raw command.
-
-You can also add scripted commands that implement the "parsed command", where
-the options and their types are specified, as well as the argument and argument
-types.  These commands look and act like the majority of lldb commands, and you
-can also add custom completions for the options and/or the arguments if you have
-special needs.
-
-The easiest way to do this is to derive your new command from the lldb.ParsedCommand
-class.  That responds in the same way to the help & repeat command interfaces, and
-provides some convenience methods, and most importantly an LLDBOptionValueParser,
-accessed through lldb.ParsedCommand.get_parser().  The parser is used to set
-your command definitions, and to retrieve option values in the __call__ method.
-
-To set up the command definition, implement the ParsedCommand abstract method:
-
-.. code-block:: python
-
-   def setup_command_definition(self):
-
-This is called when your command is added to lldb.  In this method you add the
-options and their types, the option help strings, etc. to the command using the API:
-
-.. code-block:: python
-
-    def add_option(self, short_option, long_option, help, default,
-                   dest = None, required=False, groups = None,
-                   value_type=lldb.eArgTypeNone, completion_type=None,
-                   enum_values=None):
-        """
-        short_option: one character, must be unique, not required
-        long_option:  no spaces, must be unique, required
-        help:         a usage string for this option, will print in the command help
-        default:      the initial value for this option (if it has a value)
-        dest:         the name of the property that gives you access to the value for
-                      this value.  Defaults to the long option if not provided.
-        required: if true, this option must be provided or the command will error out
-        groups: Which "option groups" does this option belong to.  This can either be
-                a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists:
-                so [1, [3,5]] is the same as [1, 3, 4, 5].
-        value_type: one of the lldb.eArgType enum values.  Some of the common arg
-                    types also have default completers, which will be applied automatically.
-        completion_type: currently these are values from the lldb.CompletionType enum.  If
-                         you need custom completions, implement handle_option_argument_completion.
-        enum_values: An array of duples: ["element_name", "element_help"].  If provided,
-                     only one of the enum elements is allowed.  The value will be the
-                     element_name for the chosen enum element as a string.
-        """
-
-Similarly, you can add argument types to the command:
-
-.. code-block:: python
-
-    def make_argument_element(self, arg_type, repeat = "optional", groups = None):
-        """
-        arg_type: The argument type, one of the lldb.eArgType enum values.
-        repeat: Choose from the following options:
-                "plain" - one value
-                "optional" - zero or more values
-                "plus" - one or more values
-        groups: As with add_option.
-        """
-
-Then implement the body of the command by defining:
-
-.. code-block:: python
-
-    def __call__(self, debugger, args_array, exe_ctx, result):
-        """This is the command callback.  The option values are
-        provided by the 'dest' properties on the parser.
-
-        args_array: This is the list of arguments provided.
-        exe_ctx: Gives the SBExecutionContext on which the
-                 command should operate.
-        result:  Any results of the command should be
-                 written into this SBCommandReturnObject.
-        """
-
-This differs from the "raw" command's __call__ in that the arguments are already
-parsed into the args_array, and the option values are set in the parser, and
-can be accessed using their property name.  The LLDBOptionValueParser class has
-a couple of other handy methods:
-
-.. code-block:: python
-
-    def was_set(self, long_option_name):
-
-returns True if the option was specified on the command line.
-
-.. code-block:: python
-
-    def dest_for_option(self, long_option_name):
-    """
-    This will return the value of the dest variable you defined for long_option_name.
-    Mostly useful for handle_completion where you get passed the long option.
-    """
-
-lldb will handle completing your option names, and all your enum values
-automatically.  If your option or argument types have associated built-in completers,
-then lldb will also handle that completion for you.  But if you have a need for
-custom completions, either in your arguments or option values, you can handle
-completion by hand as well.  To handle completion of option value arguments,
-your lldb.ParsedCommand subclass should implement:
-
-.. code-block:: python
-
-    def handle_option_argument_completion(self, long_option, cursor_pos):
-    """
-    long_option: The long option name of the option whose value you are
-                 asked to complete.
-    cursor_pos: The cursor position in the value for that option - which
-    you can get from the option parser.
-    """
-
-And to handle the completion of arguments:
-
-.. code-block:: python
-
-    def handle_argument_completion(self, args, arg_pos, cursor_pos):
-    """
-    args: A list of the arguments to the command
-    arg_pos: An index into the args list of the argument with the cursor
-    cursor_pos: The cursor position in the arg specified by arg_pos
-    """
-
-When either of these APIs is called, the command line will have been parsed up to
-the word containing the cursor, and any option values set in that part of the command
-string are available from the option value parser.  That's useful for instance
-if you have a --shared-library option that would constrain the completions for,
-say, a symbol name option or argument.
-
-The return value specifies what the completion options are.  You have four
-choices:
-
-- `True`: the completion was handled with no completions.
-
-- `False`: the completion was not handled, forward it to the regular
-completion machinery.
-
-- A dictionary with the key: "completion": there is one candidate,
-whose value is the value of the "completion" key.  Optionally you can pass a
-"mode" key whose value is either "partial" or "complete".  Return partial if
-the "completion" string is a prefix for all the completed values.
-
-For instance, if the string you are completing is "Test" and the available completions are:
-"Test1", "Test11" and "Test111", you should return the dictionary:
-
-.. code-block:: python
-
-   return {"completion": "Test1", "mode" : "partial"}
-
-and then lldb will add the "1" at the cursor and advance it after the added string,
-waiting for more completions.  But if "Test1" is the only completion, return:
-
-.. code-block:: python
-
-   {"completion": "Test1", "mode": "complete"}
-
-and lldb will add "1 " at the cursor, indicating the command string is complete.
-
-The default is "complete", you don't need to specify a "mode" in that case.
-
-- A dictionary with the key: "values" whose value is a list of candidate completion
-strings.  The command interpreter will present those strings as the available choices.
-You can optionally include a "descriptions" key, whose value is a parallel array
-of description strings, and the completion will show the description next to
-each completion.
-
-
-One other handy convenience when defining lldb command-line commands is the
-command "command script import" which will import a module specified by file
-path, so you don't have to change your PYTHONPATH for temporary scripts. It
-also has another convenience that if your new script module has a function of
-the form:
-
-.. code-block:: python
-
-  def __lldb_init_module(debugger, internal_dict):
-      # Command Initialization code goes here
-
-where debugger and internal_dict are as above, that function will get run when
-the module is loaded allowing you to add whatever commands you want into the
-current debugger. Note that this function will only be run when using the LLDB
-command ``command script import``, it will not get run if anyone imports your
-module from another module.
-
-The standard test for ``__main__``, like many python modules do, is useful for
-creating scripts that can be run from the command line. However, for command
-line scripts, the debugger instance must be created manually. Sample code would
-look like:
-
-.. code-block:: python
-
-  if __name__ == '__main__':
-      # Initialize the debugger before making any API calls.
-      lldb.SBDebugger.Initialize()
-      # Create a new debugger instance in your module if your module
-      # can be run from the command line. When we run a script from
-      # the command line, we won't have any debugger object in
-      # lldb.debugger, so we can just create it if it will be needed
-      debugger = lldb.SBDebugger.Create()
-
-      # Next, do whatever work this module should do when run as a command.
-      # ...
-
-      # Finally, dispose of the debugger you just made.
-      lldb.SBDebugger.Destroy(debugger)
-      # Terminate the debug session
-      lldb.SBDebugger.Terminate()
-
-
-Now we can create a module called ls.py in the file ~/ls.py that will implement
-a function that can be used by LLDB's python command code:
-
-.. code-block:: python
-
-  #!/usr/bin/env python
-
-  import lldb
-  import commands
-  import optparse
-  import shlex
-
-  def ls(debugger, command, result, internal_dict):
-      print >>result, (commands.getoutput('/bin/ls %s' % command))
-
-  # And the initialization code to add your commands
-  def __lldb_init_module(debugger, internal_dict):
-      debugger.HandleCommand('command script add -f ls.ls ls')
-      print('The "ls" python command has been installed and is ready for use.')
-
-Now we can load the module into LLDB and use it
-
-::
-
-  $ lldb
-  (lldb) command script import ~/ls.py
-  The "ls" python command has been installed and is ready for use.
-  (lldb) ls -l /tmp/
-  total 365848
-  -rw-r--r--@  1 someuser  wheel         6148 Jan 19 17:27 .DS_Store
-  -rw-------   1 someuser  wheel         7331 Jan 19 15:37 crash.log
-
-You can also make "container" commands to organize the commands you are adding to
-lldb.  Most of the lldb built-in commands structure themselves this way, and using
-a tree structure has the benefit of leaving the one-word command space free for user
-aliases.  It can also make it easier to find commands if you are adding more than
-a few of them.  Here's a trivial example of adding two "utility" commands into a
-"my-utilities" container:
-
-::
-
-  #!/usr/bin/env python
-
-  import lldb
-
-  def first_utility(debugger, command, result, internal_dict):
-      print("I am the first utility")
-
-  def second_utility(debugger, command, result, internal_dict):
-      print("I am the second utility")
-
-  # And the initialization code to add your commands
-  def __lldb_init_module(debugger, internal_dict):
-      debugger.HandleCommand('command container add -h "A container for my utilities" my-utilities')
-      debugger.HandleCommand('command script add -f my_utilities.first_utility -h "My first utility" my-utilities first')
-      debugger.HandleCommand('command script add -f my_utilities.second_utility -h "My second utility" my-utilities second')
-      print('The "my-utilities" python command has been installed and its subcommands are ready for use.')
-
-Then your new commands are available under the my-utilities node:
-
-::
-
-  (lldb) help my-utilities
-  A container for my utilities
-
-  Syntax: my-utilities
-
-  The following subcommands are supported:
-
-      first  -- My first utility  Expects 'raw' input (see 'help raw-input'.)
-      second -- My second utility  Expects 'raw' input (see 'help raw-input'.)
-
-  For more help on any particular subcommand, type 'help <command> <subcommand>'.
-  (lldb) my-utilities first
-  I am the first utility
-
-
-A more interesting template has been created in the source repository that can
-help you to create lldb commands quickly:
-
-https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/cmdtemplate.py
-
-A commonly required facility is being able to create a command that does some
-token substitution, and then runs a different debugger command (usually, it
-po'es the result of an expression evaluated on its argument). For instance,
-given the following program:
-
-::
-
-  #import <Foundation/Foundation.h>
-  NSString*
-  ModifyString(NSString* src)
-  {
-  	return [src stringByAppendingString:@"foobar"];
-  }
-
-  int main()
-  {
-  	NSString* aString = @"Hello world";
-  	NSString* anotherString = @"Let's be friends";
-  	return 1;
-  }
-
-you may want a pofoo X command, that equates to po [ModifyString(X)
-capitalizedString]. The following debugger interaction shows how to achieve
-that goal:
-
-::
-
-  (lldb) script
-  Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
-  >>> def pofoo_funct(debugger, command, result, internal_dict):
-  ...	cmd = "po [ModifyString(" + command + ") capitalizedString]"
-  ...	debugger.HandleCommand(cmd)
-  ...
-  >>> ^D
-  (lldb) command script add pofoo -f pofoo_funct
-  (lldb) pofoo aString
-  $1 = 0x000000010010aa00 Hello Worldfoobar
-  (lldb) pofoo anotherString
-  $2 = 0x000000010010aba0 Let's Be Friendsfoobar
-
-Using the lldb.py module in Python
-----------------------------------
-
-LLDB has all of its core code built into a shared library which gets used by
-the `lldb` command line application. On macOS this shared library is a
-framework: LLDB.framework and on other unix variants the program is a shared
-library: lldb.so. LLDB also provides an lldb.py module that contains the
-bindings from LLDB into Python. To use the LLDB.framework to create your own
-stand-alone python programs, you will need to tell python where to look in
-order to find this module. This is done by setting the PYTHONPATH environment
-variable, adding a path to the directory that contains the lldb.py python
-module. The lldb driver program has an option to report the path to the lldb
-module. You can use that to point to the correct lldb.py:
-
-For csh and tcsh:
-
-::
-
-  % setenv PYTHONPATH `lldb -P`
-
-For sh and bash:
-
-::
-
-  $ export PYTHONPATH=`lldb -P`
-
-Alternatively, you can append the LLDB Python directory to the sys.path list
-directly in your Python code before importing the lldb module.
-
-Now your python scripts are ready to import the lldb module. Below is a python
-script that will launch a program from the current working directory called
-"a.out", set a breakpoint at "main", and then run and hit the breakpoint, and
-print the process, thread and frame objects if the process stopped:
-
-.. code-block:: python
-
-  #!/usr/bin/env python3
-
-  import lldb
-  import os
-
-
-  def disassemble_instructions(insts):
-      for i in insts:
-          print(i)
-
-
-  # Set the path to the executable to debug
-  exe = "./a.out"
-
-  # Create a new debugger instance
-  debugger = lldb.SBDebugger.Create()
-
-  # When we step or continue, don't return from the function until the process
-  # stops. Otherwise we would have to handle the process events ourselves which, while doable is
-  # a little tricky.  We do this by setting the async mode to false.
-  debugger.SetAsync(False)
-
-  # Create a target from a file and arch
-  print("Creating a target for '%s'" % exe)
-
-  target = debugger.CreateTargetWithFileAndArch(exe, lldb.LLDB_ARCH_DEFAULT)
-
-  if target:
-      # If the target is valid set a breakpoint at main
-      main_bp = target.BreakpointCreateByName(
-          "main", target.GetExecutable().GetFilename()
-      )
-
-      print(main_bp)
-
-      # Launch the process. Since we specified synchronous mode, we won't return
-      # from this function until we hit the breakpoint at main
-      process = target.LaunchSimple(None, None, os.getcwd())
-
-      # Make sure the launch went ok
-      if process:
-          # Print some simple process info
-          state = process.GetState()
-          print(process)
-          if state == lldb.eStateStopped:
-              # Get the first thread
-              thread = process.GetThreadAtIndex(0)
-              if thread:
-                  # Print some simple thread info
-                  print(thread)
-                  # Get the first frame
-                  frame = thread.GetFrameAtIndex(0)
-                  if frame:
-                      # Print some simple frame info
-                      print(frame)
-                      function = frame.GetFunction()
-                      # See if we have debug info (a function)
-                      if function:
-                          # We do have a function, print some info for the function
-                          print(function)
-                          # Now get all instructions for this function and print them
-                          insts = function.GetInstructions(target)
-                          disassemble_instructions(insts)
-                      else:
-                          # See if we have a symbol in the symbol table for where we stopped
-                          symbol = frame.GetSymbol()
-                          if symbol:
-                              # We do have a symbol, print some info for the symbol
-                              print(symbol)
-
-Writing lldb frame recognizers in Python
-----------------------------------------
-
-Frame recognizers allow for retrieving information about special frames based
-on ABI, arguments or other special properties of that frame, even without
-source code or debug info. Currently, one use case is to extract function
-arguments that would otherwise be inaccessible, or augment existing arguments.
-
-Adding a custom frame recognizer is done by implementing a Python class and
-using the 'frame recognizer add' command. The Python class should have a
-'get_recognized_arguments' method and it will receive an argument of type
-lldb.SBFrame representing the current frame that we are trying to recognize.
-The method should return a (possibly empty) list of lldb.SBValue objects that
-represent the recognized arguments.
-
-An example of a recognizer that retrieves the file descriptor values from libc
-functions 'read', 'write' and 'close' follows:
-
-::
-
-  class LibcFdRecognizer(object):
-    def get_recognized_arguments(self, frame):
-      if frame.name in ["read", "write", "close"]:
-        fd = frame.EvaluateExpression("$arg1").unsigned
-        target = frame.thread.process.target
-        value = target.CreateValueFromExpression("fd", "(int)%d" % fd)
-        return [value]
-      return []
-
-The file containing this implementation can be imported via ``command script import``
-and then we can register this recognizer with ``frame recognizer add``.
-It's important to restrict the recognizer to the libc library (which is
-libsystem_kernel.dylib on macOS) to avoid matching functions with the same name
-in other modules:
-
-::
-
-  (lldb) command script import .../fd_recognizer.py
-  (lldb) frame recognizer add -l fd_recognizer.LibcFdRecognizer -n read -s libsystem_kernel.dylib
-
-When the program is stopped at the beginning of the 'read' function in libc, we can view the recognized arguments in 'frame variable':
-
-::
-
-  (lldb) b read
-  (lldb) r
-  Process 1234 stopped
-  * thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.3
-      frame #0: 0x00007fff06013ca0 libsystem_kernel.dylib`read
-  (lldb) frame variable
-  (int) fd = 3
-
-Writing Target Stop-Hooks in Python
------------------------------------
-
-Stop hooks fire whenever the process stops just before control is returned to the
-user.  Stop hooks can either be a set of lldb command-line commands, or can
-be implemented by a suitably defined Python class.  The Python-based stop-hooks
-can also be passed as a set of -key -value pairs when they are added, and those
-will get packaged up into a SBStructuredData Dictionary and passed to the
-constructor of the Python object managing the stop hook.  This allows for
-parameterization of the stop hooks.
-
-To add a Python-based stop hook, first define a class with the following methods:
-
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Name               | Arguments                             | Description                                                                                                      |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| ``__init__``       | ``target: lldb.SBTarget``             | This is the constructor for the new stop-hook.                                                                   |
-|                    | ``extra_args: lldb.SBStructuredData`` |                                                                                                                  |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | ``target`` is the SBTarget to which the stop hook is added.                                                      |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | ``extra_args`` is an SBStructuredData object that the user can pass in when creating instances of this           |
-|                    |                                       | breakpoint.  It is not required, but allows for reuse of stop-hook classes.                                      |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| ``handle_stop``    | ``exe_ctx: lldb.SBExecutionContext``  | This is called when the target stops.                                                                            |
-|                    | ``stream: lldb.SBStream``             |                                                                                                                  |
-|                    |                                       | ``exe_ctx`` argument will be filled with the current stop point for which the stop hook is                       |
-|                    |                                       | being evaluated.                                                                                                 |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | ``stream`` an lldb.SBStream, anything written to this stream will be written to the debugger console.            |
-|                    |                                       |                                                                                                                  |
-|                    |                                       | The return value is a "Should Stop" vote from this thread.  If the method returns either True or no return       |
-|                    |                                       | this thread votes to stop.  If it returns False, then the thread votes to continue after all the stop-hooks      |
-|                    |                                       | are evaluated.                                                                                                   |
-|                    |                                       | Note, the --auto-continue flag to 'target stop-hook add' overrides a True return value from the method.          |
-+--------------------+---------------------------------------+------------------------------------------------------------------------------------------------------------------+
-
-To use this class in lldb, run the command:
-
-::
-
-   (lldb) command script import MyModule.py
-   (lldb) target stop-hook add -P MyModule.MyStopHook -k first -v 1 -k second -v 2
-
-where MyModule.py is the file containing the class definition MyStopHook.
+Python Tutorials
+-----------------
+
+The following tutorials and documentation demonstrate various Python capabilities within LLDB:
+
+.. toctree::
+   :maxdepth: 1
+
+   tutorials/accessing-documentation
+   tutorials/python-embedded-interpreter
+   tutorials/script-driven-debugging
+   tutorials/breakpoint-triggered-scripts
+   tutorials/creating-custom-breakpoints
+   tutorials/automating-stepping-logic
+   tutorials/writing-custom-commands
+   tutorials/implementing-standalone-scripts
+   tutorials/custom-frame-recognizers
+   tutorials/extending-target-stop-hooks
\ No newline at end of file
diff --git a/lldb/docs/use/python.rst b/lldb/docs/use/python.rst
deleted file mode 100644
index 3a919f2a8cdb1..0000000000000
--- a/lldb/docs/use/python.rst
+++ /dev/null
@@ -1,799 +0,0 @@
-Python Scripting
-================
-
-LLDB has been structured from the beginning to be scriptable in two
-ways -- a Unix Python session can initiate/run a debug session
-non-interactively using LLDB; and within the LLDB debugger tool, Python
-scripts can be used to help with many tasks, including inspecting
-program data, iterating over containers and determining if a breakpoint
-should stop execution or continue. This document will show how to do
-some of these things by going through an example, explaining how to use
-Python scripting to find a bug in a program that searches for text in a
-large binary tree.
-
-The Test Program and Input
---------------------------
-
-We have a simple C program (dictionary.c) that reads in a text file,
-and stores all the words from the file in a Binary Search Tree, sorted
-alphabetically. It then enters a loop prompting the user for a word,
-searching for the word in the tree (using Binary Search), and reporting
-to the user whether or not it found the word in the tree.
-
-The input text file we are using to test our program contains the text
-for William Shakespeare's famous tragedy "Romeo and Juliet".
-
-The Bug
--------
-
-When we try running our program, we find there is a problem. While it
-successfully finds some of the words we would expect to find, such as
-"love" or "sun", it fails to find the word "Romeo", which MUST be in
-the input text file:
-
-::
-
-   $ ./dictionary Romeo-and-Juliet.txt
-   Dictionary loaded.
-   Enter search word: love
-   Yes!
-   Enter search word: sun
-   Yes!
-   Enter search word: Romeo
-   No!
-   Enter search word: ^D
-   $
-
-Using Depth First Search
-------------------------
-
-Our first job is to determine if the word "Romeo" actually got inserted
-into the tree or not. Since "Romeo and Juliet" has thousands of words,
-trying to examine our binary search tree by hand is completely
-impractical. Therefore we will write a Python script to search the tree
-for us. We will write a recursive Depth First Search function that
-traverses the entire tree searching for a word, and maintaining
-information about the path from the root of the tree to the current
-node. If it finds the word in the tree, it returns the path from the
-root to the node containing the word. This is what our DFS function in
-Python would look like, with line numbers added for easy reference in
-later explanations:
-
-::
-
-   1: def DFS (root, word, cur_path):
-   2:     root_word_ptr = root.GetChildMemberWithName ("word")
-   3:     left_child_ptr = root.GetChildMemberWithName ("left")
-   4:     right_child_ptr = root.GetChildMemberWithName ("right")
-   5:     root_word = root_word_ptr.GetSummary()
-   6:     end = len (root_word) - 1
-   7:     if root_word[0] == '"' and root_word[end] == '"':
-   8:         root_word = root_word[1:end]
-   9:     end = len (root_word) - 1
-   10:     if root_word[0] == '\'' and root_word[end] == '\'':
-   11:        root_word = root_word[1:end]
-   12:     if root_word == word:
-   13:         return cur_path
-   14:     elif word < root_word:
-   15:         if left_child_ptr.GetValue() is None:
-   16:             return ""
-   17:         else:
-   18:             cur_path = cur_path + "L"
-   19:             return DFS (left_child_ptr, word, cur_path)
-   20:     else:
-   21:         if right_child_ptr.GetValue() is None:
-   22:             return ""
-   23:         else:
-   24:             cur_path = cur_path + "R"
-   25:             return DFS (right_child_ptr, word, cur_path)
-
-
-Accessing & Manipulating Program Variables
-------------------------------------------
-
-Before we can call any Python function on any of our program's
-variables, we need to get the variable into a form that Python can
-access. To show you how to do this we will look at the parameters for
-the DFS function. The first parameter is going to be a node in our
-binary search tree, put into a Python variable. The second parameter is
-the word we are searching for (a string), and the third parameter is a
-string representing the path from the root of the tree to our current
-node.
-
-The most interesting parameter is the first one, the Python variable
-that needs to contain a node in our search tree. How can we take a
-variable out of our program and put it into a Python variable? What
-kind of Python variable will it be? The answers are to use the LLDB API
-functions, provided as part of the LLDB Python module. Running Python
-from inside LLDB, LLDB will automatically give us our current frame
-object as a Python variable, "lldb.frame". This variable has the type
-`SBFrame` (see the LLDB API for more information about `SBFrame`
-objects). One of the things we can do with a frame object, is to ask it
-to find and return its local variable. We will call the API function
-`SBFrame.FindVariable` on the lldb.frame object to give us our dictionary
-variable as a Python variable:
-
-::
-
-   root = lldb.frame.FindVariable ("dictionary")
-
-The line above, executed in the Python script interpreter in LLDB, asks the
-current frame to find the variable named "dictionary" and return it. We then
-store the returned value in the Python variable named "root". This answers the
-question of HOW to get the variable, but it still doesn't explain WHAT actually
-gets put into "root". If you examine the LLDB API, you will find that the
-`SBFrame` method "FindVariable" returns an object of type `SBValue`. `SBValue`
-objects are used, among other things, to wrap up program variables and values.
-There are many useful methods defined in the `SBValue` class to allow you to get
-information or children values out of SBValues. For complete information, see
-the header file SBValue.h. The `SBValue` methods that we use in our DFS function
-are ``GetChildMemberWithName()``, ``GetSummary()``, and ``GetValue()``.
-
-
-Explaining DFS Script in Detail
--------------------------------
-
-Before diving into the details of this code, it would be best to give a
-high-level overview of what it does. The nodes in our binary search tree were
-defined to have type ``tree_node *``, which is defined as:
-
-::
-
-   typedef struct tree_node
-   {
-      const char *word;
-      struct tree_node *left;
-      struct tree_node *right;
-   } tree_node;
-
-Lines 2-11 of DFS are getting data out of the current tree node and getting
-ready to do the actual search; lines 12-25 are the actual depth-first search.
-Lines 2-4 of our DFS function get the word, left and right fields out of the
-current node and store them in Python variables. Since root_word_ptr is a
-pointer to our word, and we want the actual word, line 5 calls GetSummary() to
-get a string containing the value out of the pointer. Since GetSummary() adds
-quotes around its result, lines 6-11 strip surrounding quotes off the word.
-
-Line 12 checks to see if the word in the current node is the one we are
-searching for. If so, we are done, and line 13 returns the current path.
-Otherwise, line 14 checks to see if we should go left (search word comes before
-the current word). If we decide to go left, line 15 checks to see if the left
-pointer child is NULL ("None" is the Python equivalent of NULL). If the left
-pointer is NULL, then the word is not in this tree and we return an empty path
-(line 16). Otherwise, we add an "L" to the end of our current path string, to
-indicate we are going left (line 18), and then recurse on the left child (line
-19). Lines 20-25 are the same as lines 14-19, except for going right rather
-than going left.
-
-One other note: Typing something as long as our DFS function directly into the
-interpreter can be difficult, as making a single typing mistake means having to
-start all over. Therefore we recommend doing as we have done: Writing your
-longer, more complicated script functions in a separate file (in this case
-tree_utils.py) and then importing it into your LLDB Python interpreter.
-
-
-The DFS Script in Action
-------------------------
-
-At this point we are ready to use the DFS function to see if the word "Romeo"
-is in our tree or not. To actually use it in LLDB on our dictionary program,
-you would do something like this:
-
-::
-
-   $ lldb
-   (lldb) process attach -n "dictionary"
-   Architecture set to: x86_64.
-   Process 521 stopped
-   * thread #1: tid = 0x2c03, 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8, stop reason = signal SIGSTOP
-   frame #0: 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8
-   (lldb) breakpoint set -n find_word
-   Breakpoint created: 1: name = 'find_word', locations = 1, resolved = 1
-   (lldb) continue
-   Process 521 resuming
-   Process 521 stopped
-   * thread #1: tid = 0x2c03, 0x0000000100001830 dictionary`find_word + 16
-   at dictionary.c:105, stop reason = breakpoint 1.1
-   frame #0: 0x0000000100001830 dictionary`find_word + 16 at dictionary.c:105
-   102 int
-   103 find_word (tree_node *dictionary, char *word)
-   104 {
-   -> 105 if (!word || !dictionary)
-   106 return 0;
-   107
-   108 int compare_value = strcmp (word, dictionary->word);
-   (lldb) script
-   Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
-   >>> import tree_utils
-   >>> root = lldb.frame.FindVariable ("dictionary")
-   >>> current_path = ""
-   >>> path = tree_utils.DFS (root, "Romeo", current_path)
-   >>> print path
-   LLRRL
-   >>> ^D
-   (lldb)
-
-The first bit of code above shows starting lldb, attaching to the dictionary
-program, and getting to the find_word function in LLDB. The interesting part
-(as far as this example is concerned) begins when we enter the script command
-and drop into the embedded interactive Python interpreter. We will go over this
-Python code line by line. The first line
-
-::
-
-   import tree_utils
-
-
-imports the file where we wrote our DFS function, tree_utils.py, into Python.
-Notice that to import the file we leave off the ".py" extension. We can now
-call any function in that file, giving it the prefix "tree_utils.", so that
-Python knows where to look for the function. The line
-
-::
-
-   root = lldb.frame.FindVariable ("dictionary")
-
-
-gets our program variable "dictionary" (which contains the binary search tree)
-and puts it into the Python variable "root". See Accessing & Manipulating
-Program Variables in Python above for more details about how this works. The
-next line is
-
-::
-
-   current_path = ""
-
-This line initializes the current_path from the root of the tree to our current
-node. Since we are starting at the root of the tree, our current path starts as
-an empty string. As we go right and left through the tree, the DFS function
-will append an 'R' or an 'L' to the current path, as appropriate. The line
-
-::
-
-   path = tree_utils.DFS (root, "Romeo", current_path)
-
-calls our DFS function (prefixing it with the module name so that Python can
-find it). We pass in our binary tree stored in the variable root, the word we
-are searching for, and our current path. We assign whatever path the DFS
-function returns to the Python variable path.
-
-Finally, we want to see if the word was found or not, and if so we want to see
-the path through the tree to the word. So we do
-
-::
-
-   print path
-
-From this we can see that the word "Romeo" was indeed found in the tree, and
-the path from the root of the tree to the node containing "Romeo" is
-left-left-right-right-left.
-
-Using Breakpoint Command Scripts
---------------------------------
-
-We are halfway to figuring out what the problem is. We know the word we are
-looking for is in the binary tree, and we know exactly where it is in the
-binary tree. Now we need to figure out why our binary search algorithm is not
-finding the word. We will do this using breakpoint command scripts.
-
-The idea is as follows. The binary search algorithm has two main decision
-points: the decision to follow the right branch; and, the decision to follow
-the left branch. We will set a breakpoint at each of these decision points, and
-attach a Python breakpoint command script to each breakpoint. The breakpoint
-commands will use the global path Python variable that we got from our DFS
-function. Each time one of these decision breakpoints is hit, the script will
-compare the actual decision with the decision the front of the path variable
-says should be made (the first character of the path). If the actual decision
-and the path agree, then the front character is stripped off the path, and
-execution is resumed. In this case the user never even sees the breakpoint
-being hit. But if the decision differs from what the path says it should be,
-then the script prints out a message and does NOT resume execution, leaving the
-user sitting at the first point where a wrong decision is being made.
-
-Python Breakpoint Command Scripts Are Not What They Seem
---------------------------------------------------------
-
-What do we mean by that? When you enter a Python breakpoint command in LLDB, it
-appears that you are entering one or more plain lines of Python. BUT LLDB then
-takes what you entered and wraps it into a Python FUNCTION (just like using the
-"def" Python command). It automatically gives the function an obscure, unique,
-hard-to-stumble-across function name, and gives it two parameters: frame and
-bp_loc. When the breakpoint gets hit, LLDB wraps up the frame object where the
-breakpoint was hit, and the breakpoint location object for the breakpoint that
-was hit, and puts them into Python variables for you. It then calls the Python
-function that was created for the breakpoint command, and passes in the frame
-and breakpoint location objects.
-
-So, being practical, what does this mean for you when you write your Python
-breakpoint commands? It means that there are two things you need to keep in
-mind: 1. If you want to access any Python variables created outside your
-script, you must declare such variables to be global. If you do not declare
-them as global, then the Python function will treat them as local variables,
-and you will get unexpected behavior. 2. All Python breakpoint command scripts
-automatically have a frame and a bp_loc variable. The variables are pre-loaded
-by LLDB with the correct context for the breakpoint. You do not have to use
-these variables, but they are there if you want them.
-
-The Decision Point Breakpoint Commands
---------------------------------------
-
-This is what the Python breakpoint command script would look like for the
-decision to go right:
-
-::
-
-   global path
-   if path[0] == 'R':
-      path = path[1:]
-      thread = frame.GetThread()
-      process = thread.GetProcess()
-      process.Continue()
-   else:
-      print "Here is the problem; going right, should go left!"
-
-
-Just as a reminder, LLDB is going to take this script and wrap it up in a function, like this:
-
-::
-
-   def some_unique_and_obscure_function_name (frame, bp_loc):
-      global path
-      if path[0] == 'R':
-         path = path[1:]
-         thread = frame.GetThread()
-         process = thread.GetProcess()
-         process.Continue()
-      else:
-         print "Here is the problem; going right, should go left!"
-
-LLDB will call the function, passing in the correct frame and breakpoint
-location whenever the breakpoint gets hit. There are several things to notice
-about this function. The first one is that we are accessing and updating a
-piece of state (the path variable), and actually conditioning our behavior
-based upon this variable. Since the variable was defined outside of our script
-(and therefore outside of the corresponding function) we need to tell Python
-that we are accessing a global variable. That is what the first line of the
-script does. Next we check where the path says we should go and compare it to
-our decision (recall that we are at the breakpoint for the decision to go
-right). If the path agrees with our decision, then we strip the first character
-off of the path.
-
-Since the decision matched the path, we want to resume execution. To do this we
-make use of the frame parameter that LLDB guarantees will be there for us. We
-use LLDB API functions to get the current thread from the current frame, and
-then to get the process from the thread. Once we have the process, we tell it
-to resume execution (using the Continue() API function).
-
-If the decision to go right does not agree with the path, then we do not resume
-execution. We allow the breakpoint to remain stopped (by doing nothing), and we
-print an informational message telling the user we have found the problem, and
-what the problem is.
-
-Actually Using The Breakpoint Commands
---------------------------------------
-
-Now we will look at what happens when we actually use these breakpoint commands
-on our program. Doing a source list -n find_word shows us the function
-containing our two decision points. Looking at the code below, we see that we
-want to set our breakpoints on lines 113 and 115:
-
-::
-
-   (lldb) source list -n find_word
-   File: /Volumes/Data/HD2/carolinetice/Desktop/LLDB-Web-Examples/dictionary.c.
-   101
-   102 int
-   103 find_word (tree_node *dictionary, char *word)
-   104 {
-   105   if (!word || !dictionary)
-   106     return 0;
-   107
-   108   int compare_value = strcmp (word, dictionary->word);
-   109
-   110   if (compare_value == 0)
-   111     return 1;
-   112   else if (compare_value < 0)
-   113     return find_word (dictionary->left, word);
-   114   else
-   115     return find_word (dictionary->right, word);
-   116 }
-   117
-
-
-So, we set our breakpoints, enter our breakpoint command scripts, and see what happens:
-
-::
-
-   (lldb) breakpoint set -l 113
-   Breakpoint created: 2: file ='dictionary.c', line = 113, locations = 1, resolved = 1
-   (lldb) breakpoint set -l 115
-   Breakpoint created: 3: file ='dictionary.c', line = 115, locations = 1, resolved = 1
-   (lldb) breakpoint command add -s python 2
-   Enter your Python command(s). Type 'DONE' to end.
-   > global path
-   > if (path[0] == 'L'):
-   >     path = path[1:]
-   >     thread = frame.GetThread()
-   >     process = thread.GetProcess()
-   >     process.Continue()
-   > else:
-   >     print "Here is the problem. Going left, should go right!"
-   > DONE
-   (lldb) breakpoint command add -s python 3
-   Enter your Python command(s). Type 'DONE' to end.
-   > global path
-   > if (path[0] == 'R'):
-   >     path = path[1:]
-   >     thread = frame.GetThread()
-   >     process = thread.GetProcess()
-   >     process.Continue()
-   > else:
-   >     print "Here is the problem. Going right, should go left!"
-   > DONE
-   (lldb) continue
-   Process 696 resuming
-   Here is the problem. Going right, should go left!
-   Process 696 stopped
-   * thread #1: tid = 0x2d03, 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115, stop reason = breakpoint 3.1
-   frame #0: 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115
-      112   else if (compare_value < 0)
-      113     return find_word (dictionary->left, word);
-      114   else
-   -> 115     return find_word (dictionary->right, word);
-      116 }
-      117
-      118 void
-   (lldb)
-
-
-After setting our breakpoints, adding our breakpoint commands and continuing,
-we run for a little bit and then hit one of our breakpoints, printing out the
-error message from the breakpoint command. Apparently at this point in the
-tree, our search algorithm decided to go right, but our path says the node we
-want is to the left. Examining the word at the node where we stopped, and our
-search word, we see:
-
-::
-
-   (lldb) expr dictionary->word
-   (const char *) $1 = 0x0000000100100080 "dramatis"
-   (lldb) expr word
-   (char *) $2 = 0x00007fff5fbff108 "romeo"
-
-So the word at our current node is "dramatis", and the word we are searching
-for is "romeo". "romeo" comes after "dramatis" alphabetically, so it seems like
-going right would be the correct decision. Let's ask Python what it thinks the
-path from the current node to our word is:
-
-::
-
-   (lldb) script print path
-   LLRRL
-
-According to Python we need to go left-left-right-right-left from our current
-node to find the word we are looking for. Let's double check our tree, and see
-what word it has at that node:
-
-::
-
-   (lldb) expr dictionary->left->left->right->right->left->word
-   (const char *) $4 = 0x0000000100100880 "Romeo"
-
-So the word we are searching for is "romeo" and the word at our DFS location is
-"Romeo". Aha! One is uppercase and the other is lowercase: We seem to have a
-case conversion problem somewhere in our program (we do).
-
-This is the end of our example on how you might use Python scripting in LLDB to
-help you find bugs in your program.
-
-Source Files for The Example
-----------------------------
-
-The complete code for the Dictionary program (with case-conversion bug), the
-DFS function and other Python script examples (tree_utils.py) used for this
-example are available below.
-
-tree_utils.py - Example Python functions using LLDB's API, including DFS
-
-::
-
-   """
-   # ===-- tree_utils.py ---------------------------------------*- Python -*-===//
-   #
-   #  Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-   #  See https://llvm.org/LICENSE.txt for license information.
-   #  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-   #
-   # ===----------------------------------------------------------------------===//
-
-   tree_utils.py  - A set of functions for examining binary
-   search trees, based on the example search tree defined in
-   dictionary.c.  These functions contain calls to LLDB API
-   functions, and assume that the LLDB Python module has been
-   imported.
-
-   For a thorough explanation of how the DFS function works, and
-   for more information about dictionary.c go to
-   http://lldb.llvm.org/scripting.html
-   """
-
-
-   def DFS(root, word, cur_path):
-      """
-      Recursively traverse a binary search tree containing
-      words sorted alphabetically, searching for a particular
-      word in the tree.  Also maintains a string representing
-      the path from the root of the tree to the current node.
-      If the word is found in the tree, return the path string.
-      Otherwise return an empty string.
-
-      This function assumes the binary search tree is
-      the one defined in dictionary.c  It uses LLDB API
-      functions to examine and traverse the tree nodes.
-      """
-
-      # Get pointer field values out of node 'root'
-
-      root_word_ptr = root.GetChildMemberWithName("word")
-      left_child_ptr = root.GetChildMemberWithName("left")
-      right_child_ptr = root.GetChildMemberWithName("right")
-
-      # Get the word out of the word pointer and strip off
-      # surrounding quotes (added by call to GetSummary).
-
-      root_word = root_word_ptr.GetSummary()
-      end = len(root_word) - 1
-      if root_word[0] == '"' and root_word[end] == '"':
-         root_word = root_word[1:end]
-      end = len(root_word) - 1
-      if root_word[0] == '\'' and root_word[end] == '\'':
-         root_word = root_word[1:end]
-
-      # Main depth first search
-
-      if root_word == word:
-         return cur_path
-      elif word < root_word:
-
-         # Check to see if left child is NULL
-
-         if left_child_ptr.GetValue() is None:
-               return ""
-         else:
-               cur_path = cur_path + "L"
-               return DFS(left_child_ptr, word, cur_path)
-      else:
-
-         # Check to see if right child is NULL
-
-         if right_child_ptr.GetValue() is None:
-               return ""
-         else:
-               cur_path = cur_path + "R"
-               return DFS(right_child_ptr, word, cur_path)
-
-
-   def tree_size(root):
-      """
-      Recursively traverse a binary search tree, counting
-      the nodes in the tree.  Returns the final count.
-
-      This function assumes the binary search tree is
-      the one defined in dictionary.c  It uses LLDB API
-      functions to examine and traverse the tree nodes.
-      """
-      if (root.GetValue is None):
-         return 0
-
-      if (int(root.GetValue(), 16) == 0):
-         return 0
-
-      left_size = tree_size(root.GetChildAtIndex(1))
-      right_size = tree_size(root.GetChildAtIndex(2))
-
-      total_size = left_size + right_size + 1
-      return total_size
-
-
-   def print_tree(root):
-      """
-      Recursively traverse a binary search tree, printing out
-      the words at the nodes in alphabetical order (the
-      search order for the binary tree).
-
-      This function assumes the binary search tree is
-      the one defined in dictionary.c  It uses LLDB API
-      functions to examine and traverse the tree nodes.
-      """
-      if (root.GetChildAtIndex(1).GetValue() is not None) and (
-               int(root.GetChildAtIndex(1).GetValue(), 16) != 0):
-         print_tree(root.GetChildAtIndex(1))
-
-      print root.GetChildAtIndex(0).GetSummary()
-
-      if (root.GetChildAtIndex(2).GetValue() is not None) and (
-               int(root.GetChildAtIndex(2).GetValue(), 16) != 0):
-         print_tree(root.GetChildAtIndex(2))
-
-
-dictionary.c - Sample dictionary program, with bug
-
-::
-
-   //===-- dictionary.c ---------------------------------------------*- C -*-===//
-   //
-   // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-   // See https://llvm.org/LICENSE.txt for license information.
-   // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-   //
-   //===----------------------------------------------------------------------===//
-   #include <ctype.h>
-   #include <stdio.h>
-   #include <stdlib.h>
-   #include <string.h>
-
-   typedef struct tree_node {
-   const char *word;
-   struct tree_node *left;
-   struct tree_node *right;
-   } tree_node;
-
-   /* Given a char*, returns a substring that starts at the first
-      alphabet character and ends at the last alphabet character, i.e. it
-      strips off beginning or ending quotes, punctuation, etc. */
-
-   char *strip(char **word) {
-   char *start = *word;
-   int len = strlen(start);
-   char *end = start + len - 1;
-
-   while ((start < end) && (!isalpha(start[0])))
-      start++;
-
-   while ((end > start) && (!isalpha(end[0])))
-      end--;
-
-   if (start > end)
-      return NULL;
-
-   end[1] = '\0';
-   *word = start;
-
-   return start;
-   }
-
-   /* Given a binary search tree (sorted alphabetically by the word at
-      each node), and a new word, inserts the word at the appropriate
-      place in the tree.  */
-
-   void insert(tree_node *root, char *word) {
-   if (root == NULL)
-      return;
-
-   int compare_value = strcmp(word, root->word);
-
-   if (compare_value == 0)
-      return;
-
-   if (compare_value < 0) {
-      if (root->left != NULL)
-         insert(root->left, word);
-      else {
-         tree_node *new_node = (tree_node *)malloc(sizeof(tree_node));
-         new_node->word = strdup(word);
-         new_node->left = NULL;
-         new_node->right = NULL;
-         root->left = new_node;
-      }
-   } else {
-      if (root->right != NULL)
-         insert(root->right, word);
-      else {
-         tree_node *new_node = (tree_node *)malloc(sizeof(tree_node));
-         new_node->word = strdup(word);
-         new_node->left = NULL;
-         new_node->right = NULL;
-         root->right = new_node;
-      }
-   }
-   }
-
-   /* Read in a text file and storea all the words from the file in a
-      binary search tree.  */
-
-   void populate_dictionary(tree_node **dictionary, char *filename) {
-   FILE *in_file;
-   char word[1024];
-
-   in_file = fopen(filename, "r");
-   if (in_file) {
-      while (fscanf(in_file, "%s", word) == 1) {
-         char *new_word = (strdup(word));
-         new_word = strip(&new_word);
-         if (*dictionary == NULL) {
-         tree_node *new_node = (tree_node *)malloc(sizeof(tree_node));
-         new_node->word = new_word;
-         new_node->left = NULL;
-         new_node->right = NULL;
-         *dictionary = new_node;
-         } else
-         insert(*dictionary, new_word);
-      }
-   }
-   }
-
-   /* Given a binary search tree and a word, search for the word
-      in the binary search tree.  */
-
-   int find_word(tree_node *dictionary, char *word) {
-   if (!word || !dictionary)
-      return 0;
-
-   int compare_value = strcmp(word, dictionary->word);
-
-   if (compare_value == 0)
-      return 1;
-   else if (compare_value < 0)
-      return find_word(dictionary->left, word);
-   else
-      return find_word(dictionary->right, word);
-   }
-
-   /* Print out the words in the binary search tree, in sorted order.  */
-
-   void print_tree(tree_node *dictionary) {
-   if (!dictionary)
-      return;
-
-   if (dictionary->left)
-      print_tree(dictionary->left);
-
-   printf("%s\n", dictionary->word);
-
-   if (dictionary->right)
-      print_tree(dictionary->right);
-   }
-
-   int main(int argc, char **argv) {
-   tree_node *dictionary = NULL;
-   char buffer[1024];
-   char *filename;
-   int done = 0;
-
-   if (argc == 2)
-      filename = argv[1];
-
-   if (!filename)
-      return -1;
-
-   populate_dictionary(&dictionary, filename);
-   fprintf(stdout, "Dictionary loaded.\nEnter search word: ");
-   while (!done && fgets(buffer, sizeof(buffer), stdin)) {
-      char *word = buffer;
-      int len = strlen(word);
-      int i;
-
-      for (i = 0; i < len; ++i)
-         word[i] = tolower(word[i]);
-
-      if ((len > 0) && (word[len - 1] == '\n')) {
-         word[len - 1] = '\0';
-         len = len - 1;
-      }
-
-      if (find_word(dictionary, word))
-         fprintf(stdout, "Yes!\n");
-      else
-         fprintf(stdout, "No!\n");
-
-      fprintf(stdout, "Enter search word: ");
-   }
-
-   fprintf(stdout, "\n");
-   return 0;
-   }
-
-
-The text for "Romeo and Juliet" can be obtained from the Gutenberg Project
-(http://www.gutenberg.org).
-
diff --git a/lldb/docs/use/tutorials/accessing-documentation.md b/lldb/docs/use/tutorials/accessing-documentation.md
new file mode 100644
index 0000000000000..d14efa5f3c428
--- /dev/null
+++ b/lldb/docs/use/tutorials/accessing-documentation.md
@@ -0,0 +1,62 @@
+# Accessing Script Documentation
+
+The LLDB API is contained in a python module named lldb. A useful resource when
+writing Python extensions is the lldb Python classes reference guide.
+
+The documentation is also accessible in an interactive debugger session with
+the following command:
+
+```python3
+(lldb) script help(lldb)
+   Help on package lldb:
+
+   NAME
+      lldb - The lldb module contains the public APIs for Python binding.
+
+   FILE
+      /System/Library/PrivateFrameworks/LLDB.framework/Versions/A/Resources/Python/lldb/__init__.py
+
+   DESCRIPTION
+...
+```
+
+You can also get help using a module class name. The full API that is exposed
+for that class will be displayed in a man page style window. Below we want to
+get help on the lldb.SBFrame class:
+
+```python3
+(lldb) script help(lldb.SBFrame)
+   Help on class SBFrame in module lldb:
+
+   class SBFrame(builtins.object)
+    |  SBFrame(*args)
+    |  
+    |  Represents one of the stack frames associated with a thread.
+    |  
+    |  SBThread contains SBFrame(s). For example (from test/lldbutil.py), ::
+    |  
+    |      def print_stacktrace(thread, string_buffer = False):
+    |          '''Prints a simple stack trace of this thread.'''
+...
+```
+
+Or you can get help using any Python object. Here we use the lldb.process
+object, a global variable in the lldb module that represents the
+currently selected process:
+
+```python3
+(lldb) script help(lldb.process)
+   Help on SBProcess in module lldb object:
+
+   class SBProcess(builtins.object)
+    |  SBProcess(*args)
+    |  
+    |  Represents the process associated with the target program.
+    |  
+    |  SBProcess supports thread iteration. For example (from test/lldbutil.py), ::
+    |  
+    |      # ==================================================
+    |      # Utility functions related to Threads and Processes
+    |      # ==================================================
+...
+```
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/automating-stepping-logic.md b/lldb/docs/use/tutorials/automating-stepping-logic.md
new file mode 100644
index 0000000000000..564d3ec1f14d4
--- /dev/null
+++ b/lldb/docs/use/tutorials/automating-stepping-logic.md
@@ -0,0 +1,42 @@
+# Automating Stepping Logic
+
+A slightly esoteric use of the Python APIs is to construct custom stepping
+types. LLDB's stepping is driven by a stack of "thread plans" and a fairly
+simple state machine that runs the plans. You can create a Python class that
+works as a thread plan, and responds to the requests the state machine makes to
+run its operations.
+
+The base class for the [ScriptedThreadPlan](https://lldb.llvm.org/python_api/lldb.plugins.scripted_thread_plan.ScriptedThreadPlan.html) is provided as part of the lldb python module, making it easy to derive a new class from it.
+
+There is a longer discussion of scripted thread plans and the state machine,
+and several interesting examples of their use, in [scripted_step.py](https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/scripted_step.py).
+For a **MUCH** fuller discussion of the whole state machine, see [ThreadPlan.h](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/ThreadPlan.h).
+
+If you are reading those comments it is useful to know that scripted thread
+plans are set to be either ***"ControllingPlans"*** or ***"OkayToDiscard"***.
+
+To implement a scripted step, you define a python class that has the following
+methods:
+
+| Name | Arguments | Description |
+|------|-----------|-------------|
+| `__init__` | `thread_plan`: `lldb.SBThreadPlan` | This is the underlying `SBThreadPlan` that is pushed onto the plan stack. You will want to store this away in an ivar. Also, if you are going to use one of the canned thread plans, you can queue it at this point. |
+| `explains_stop` | `event`: `lldb.SBEvent` | Return True if this stop is part of your thread plan's logic, False otherwise. |
+| `is_stale` | `None` | If your plan is no longer relevant (for instance, you were stepping in a particular stack frame, but some other operation pushed that frame off the stack) return True and your plan will get popped. |
+| `should_step` | `None` | Return `True` if you want lldb to step a single instruction, or `False` to continue until the next breakpoint is hit. |
+| `should_stop` | `event`: `lldb.SBEvent` | If your plan wants to stop and return control to the user at this point, return True. If your plan is done at this point, call SetPlanComplete on your thread plan instance. Also, do any work you need here to set up the next stage of stepping. |
+
+To use this class to implement a step, use the command:
+
+```python3
+(lldb) thread step-scripted -C MyModule.MyStepPlanClass
+```
+
+Or use the `SBThread.StepUsingScriptedThreadPlan` API. The `SBThreadPlan` passed
+into your `__init__` function can also push several common plans (step
+in/out/over and run-to-address) in front of itself on the stack, which can be
+used to compose more complex stepping operations. When you use subsidiary plans
+your explains_stop and should_stop methods won't get called until the
+subsidiary plan is done, or the process stops for an event the subsidiary plan
+doesn't explain. For instance, step over plans don't explain a breakpoint hit
+while performing the step-over.
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/breakpoint-triggered-scripts.md b/lldb/docs/use/tutorials/breakpoint-triggered-scripts.md
new file mode 100644
index 0000000000000..0cd9f945f0d11
--- /dev/null
+++ b/lldb/docs/use/tutorials/breakpoint-triggered-scripts.md
@@ -0,0 +1,85 @@
+# Breakpoint-Triggered Scripts
+
+One very powerful use of the lldb Python API is to have a python script run
+when a breakpoint gets hit. Adding python scripts to breakpoints provides a way
+to create complex breakpoint conditions and also allows for smart logging and
+data gathering.
+
+When your process hits a breakpoint to which you have attached some python
+code, the code is executed as the body of a function which takes three
+arguments:
+
+```python3
+def breakpoint_function_wrapper(frame, bp_loc, internal_dict):
+   # Your code goes here
+```
+
+or:
+
+```python3
+def breakpoint_function_wrapper(frame, bp_loc, extra_args, internal_dict):
+   # Your code goes here
+```
+
+| Argument | Type | Description |
+|----------|------|-------------|
+| `frame` | `lldb.SBFrame` | The current stack frame where the breakpoint got hit. The object will always be valid. This `frame` argument might *not* match the currently selected stack frame found in the `lldb` module global variable `lldb.frame`. |
+| `bp_loc` | `lldb.SBBreakpointLocation` | The breakpoint location that just got hit. Breakpoints are represented by `lldb.SBBreakpoint` objects. These breakpoint objects can have one or more locations. These locations are represented by `lldb.SBBreakpointLocation` objects. |
+| `extra_args` | `lldb.SBStructuredData` | **Optional** If your breakpoint callback function takes this extra parameter, then when the callback gets added to a breakpoint, its contents can parametrize this use of the callback. For instance, instead of writing a callback that stops when the caller is "Foo", you could take the function name from a field in the `extra_args`, making the callback more general. The `-k` and `-v` options to `breakpoint command add` will be passed as a Dictionary in the `extra_args` parameter, or you can provide it with the SB API's. |
+| `internal_dict` | `dict` | The python session dictionary as a standard python dictionary object. |
+
+Optionally, a Python breakpoint command can return a value. Returning `False`
+tells LLDB that you do not want to stop at the breakpoint. Any other return
+value (including None or leaving out the return statement altogether) is akin
+to telling LLDB to actually stop at the breakpoint. This can be useful in
+situations where a breakpoint only needs to stop the process when certain
+conditions are met, and you do not want to inspect the program state manually
+at every stop and then continue.
+
+An example will show how simple it is to write some python code and attach it
+to a breakpoint. The following example will allow you to track the order in
+which the functions in a given shared library are first executed during one run
+of your program. This is a simple method to gather an order file which can be
+used to optimize function placement within a binary for execution locality.
+
+We do this by setting a regular expression breakpoint that will match every
+function in the shared library. The regular expression '.' will match any
+string that has at least one character in it, so we will use that. This will
+result in one lldb.SBBreakpoint object that contains an
+lldb.SBBreakpointLocation object for each function. As the breakpoint gets hit,
+we use a counter to track the order in which the function at this particular
+breakpoint location got hit. Since our code is passed the location that was
+hit, we can get the name of the function from the location, disable the
+location so we won't count this function again; then log some info and continue
+the process.
+
+Note we also have to initialize our counter, which we do with the simple
+one-line version of the script command.
+
+Here is the code:
+
+```python3
+(lldb) breakpoint set --func-regex=. --shlib=libfoo.dylib
+Breakpoint created: 1: regex = '.', module = libfoo.dylib, locations = 223
+(lldb) script counter = 0
+(lldb) breakpoint command add --script-type python 1
+Enter your Python command(s). Type 'DONE' to end.
+> # Increment our counter.  Since we are in a function, this must be a global python variable
+> global counter
+> counter += 1
+> # Get the name of the function
+> name = frame.GetFunctionName()
+> # Print the order and the function name
+> print('[%i] %s' % (counter, name))
+> # Disable the current breakpoint location so it doesn't get hit again
+> bp_loc.SetEnabled(False)
+> # No need to stop here
+> return False
+> DONE
+```
+
+The breakpoint command add command above attaches a python script to breakpoint 1. To remove the breakpoint command:
+
+```python3
+(lldb) breakpoint command delete 1
+```
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/creating-custom-breakpoints.md b/lldb/docs/use/tutorials/creating-custom-breakpoints.md
new file mode 100644
index 0000000000000..e3081c44e3650
--- /dev/null
+++ b/lldb/docs/use/tutorials/creating-custom-breakpoints.md
@@ -0,0 +1,128 @@
+# Custom Breakpoint Resolvers
+
+Another use of the Python API's in lldb is to create a custom breakpoint
+resolver.
+
+It allows you to provide the algorithm which will be used in the breakpoint's
+search of the space of the code in a given Target to determine where to set the
+breakpoint locations - the actual places where the breakpoint will trigger. To
+understand how this works you need to know a little about how lldb handles
+breakpoints.
+
+In lldb, a breakpoint is composed of three parts:
+1. the Searcher,
+2. the Resolver,
+3. the Stop Options.
+
+The Searcher and Resolver cooperate to determine how breakpoint locations are
+set and differ between each breakpoint type. Stop options determine what
+happens when a location triggers and include the commands, conditions, ignore
+counts, etc. Stop options are common between all breakpoint types, so for our
+purposes only the Searcher and Resolver are relevant.
+
+### Breakpoint Searcher
+
+The Searcher's job is to traverse in a structured way the code in the current
+target. It proceeds from the Target, to search all the Modules in the Target,
+in each Module it can recurse into the Compile Units in that module, and within
+each Compile Unit it can recurse over the Functions it contains.
+
+The Searcher can be provided with a SearchFilter that it will use to restrict
+this search. For instance, if the SearchFilter specifies a list of Modules, the
+Searcher will not recurse into Modules that aren't on the list. When you pass
+the -s modulename flag to break set you are creating a Module-based search
+filter. When you pass -f filename.c to break set -n you are creating a file
+based search filter. If neither of these is specified, the breakpoint will have
+a no-op search filter, so all parts of the program are searched and all
+locations accepted.
+
+### Breakpoint Resolver
+
+The Resolver has two functions:
+
+The most important one is the callback it provides. This will get called at the
+appropriate time in the course of the search. The callback is where the job of
+adding locations to the breakpoint gets done.
+
+The other function is specifying to the Searcher at what depth in the above
+described recursion it wants to be called. Setting a search depth also provides
+a stop for the recursion. For instance, if you request a Module depth search,
+then the callback will be called for each Module as it gets added to the
+Target, but the searcher will not recurse into the Compile Units in the module.
+
+One other slight subtlety is that the depth at which you get called back is not
+necessarily the depth at which the SearchFilter is specified. For instance,
+if you are doing symbol searches, it is convenient to use the Module depth for
+the search, since symbols are stored in the module. But the SearchFilter might
+specify some subset of CompileUnits, so not all the symbols you might find in
+each module will pass the search. You don't need to handle this situation
+yourself, since SBBreakpoint::AddLocation will only add locations that pass the
+Search Filter. This API returns an SBError to inform you whether your location
+was added.
+
+When the breakpoint is originally created, its Searcher will process all the
+currently loaded modules. The Searcher will also visit any new modules as they
+are added to the target. This happens, for instance, when a new shared library
+gets added to the target in the course of running, or on rerunning if any of
+the currently loaded modules have been changed. Note, in the latter case, all
+the locations set in the old module will get deleted and you will be asked to
+recreate them in the new version of the module when your callback gets called
+with that module. For this reason, you shouldn't try to manage the locations
+you add to the breakpoint yourself. Note that the Breakpoint takes care of
+deduplicating equal addresses in AddLocation, so you shouldn't need to worry
+about that anyway.
+
+### Scripted Breakpoint Resolver
+
+At present, when adding a ScriptedBreakpoint type, you can only provide a
+custom Resolver, not a custom SearchFilter.
+
+The custom Resolver is provided as a Python class with the following methods:
+
+| Name | Arguments | Description |
+|------|-----------|-------------|
+| `__init__` | `bkpt`: `lldb.SBBreakpoint` `extra_args`: `lldb.SBStructuredData` | This is the constructor for the new Resolver. `bkpt` is the breakpoint owning this Resolver. `extra_args` is an `SBStructuredData` object that the user can pass in when creating instances of this breakpoint. It is not required, but is quite handy. For instance if you were implementing a breakpoint on some symbol name, you could write a generic symbol name based Resolver, and then allow the user to pass in the particular symbol in the extra_args |
+| `__callback__` | `sym_ctx`: `lldb.SBSymbolContext` | This is the Resolver callback. The `sym_ctx` argument will be filled with the current stage of the search. For instance, if you asked for a search depth of lldb.eSearchDepthCompUnit, then the target, module and compile_unit fields of the sym_ctx will be filled. The callback should look just in the context passed in `sym_ctx` for new locations. If the callback finds an address of interest, it can add it to the breakpoint with the `SBBreakpoint.AddLocation` method, using the breakpoint passed in to the `__init__` method. |
+| `__get_depth__` | `None` | Specify the depth at which you wish your callback to get called. The currently supported options are: `lldb.eSearchDepthModule` `lldb.eSearchDepthCompUnit` `lldb.eSearchDepthFunction` For instance, if you are looking up symbols, which are stored at the Module level, you will want to get called back module by module. So you would want to return `lldb.eSearchDepthModule`. This method is optional. If not provided the search will be done at Module depth. |
+| `get_short_help` | `None` | This is an optional method. If provided, the returned string will be printed at the beginning of the description for this breakpoint. |
+
+To create a new breakpoint that uses this class from the lldb command
+line, use the command:
+
+```
+(lldb) breakpoint set -P MyModule.MyResolverClass
+```
+
+You can also populate the extra_args SBStructuredData with a dictionary of
+key/value pairs with:
+
+```
+(lldb) breakpoint set -P MyModule.MyResolverClass -k key_1 -v value_1 -k key_2 -v value_2
+```
+
+Although you can't write a scripted SearchFilter, both the command line and the
+SB API's for adding a scripted resolver allow you to specify a SearchFilter
+restricted to certain modules or certain compile units. When using the command
+line to create the resolver, you can specify a Module specific SearchFilter by
+passing the -s ModuleName option - which can be specified multiple times. You
+can also specify a SearchFilter restricted to certain compile units by passing
+in the -f CompUnitName option. This can also be specified more than once. And
+you can mix the two to specify "this comp unit in this module". So, for
+instance,
+
+```
+(lldb) breakpoint set -P MyModule.MyResolverClass -s a.out
+```
+
+will use your resolver, but will only recurse into or accept new locations in
+the module a.out.
+
+Another option for creating scripted breakpoints is to use the
+SBTarget.BreakpointCreateFromScript API. This one has the advantage that you
+can pass in an arbitrary SBStructuredData object, so you can create more
+complex parametrizations. SBStructuredData has a handy SetFromJSON method which
+you can use for this purpose. Your __init__ function gets passed this
+SBStructuredData object. This API also allows you to directly provide the list
+of Modules and the list of CompileUnits that will make up the SearchFilter. If
+you pass in empty lists, the breakpoint will use the default "search
+everywhere, accept everything" filter.
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/custom-frame-recognizers.md b/lldb/docs/use/tutorials/custom-frame-recognizers.md
new file mode 100644
index 0000000000000..17bf9637d9a85
--- /dev/null
+++ b/lldb/docs/use/tutorials/custom-frame-recognizers.md
@@ -0,0 +1,51 @@
+# Detecting Patterns With Recognizers
+
+Frame recognizers allow for retrieving information about special frames based
+on ABI, arguments or other special properties of that frame, even without
+source code or debug info. Currently, one use case is to extract function
+arguments that would otherwise be inaccessible, or augment existing arguments.
+
+Adding a custom frame recognizer is done by implementing a Python class and
+using the `frame recognizer add` command. The Python class should implement the
+`get_recognized_arguments` method and it will receive an argument of type
+`lldb.SBFrame` representing the current frame that we are trying to recognize.
+The method should return a (possibly empty) list of `lldb.SBValue` objects that
+represent the recognized arguments.
+
+An example of a recognizer that retrieves the file descriptor values from libc
+functions 'read', 'write' and 'close' follows:
+
+```python3
+class LibcFdRecognizer:
+  def get_recognized_arguments(self, frame: lldb.SBFrame):
+    if frame.name in ["read", "write", "close"]:
+      fd = frame.EvaluateExpression("$arg1").unsigned
+      target = frame.thread.process.target
+      value = target.CreateValueFromExpression("fd", "(int)%d" % fd)
+      return [value]
+    return []
+```
+
+The file containing this implementation can be imported via `command script import`
+and then we can register this recognizer with `frame recognizer add`.
+
+It's important to restrict the recognizer to the libc library (which is
+`libsystem_kernel.dylib` on macOS) to avoid matching functions with the same name
+in other modules:
+
+```c++
+(lldb) command script import .../fd_recognizer.py
+(lldb) frame recognizer add -l fd_recognizer.LibcFdRecognizer -n read -s libsystem_kernel.dylib
+```
+
+When the program is stopped at the beginning of the 'read' function in libc, we can view the recognizer arguments in 'frame variable':
+
+```c++
+(lldb) b read
+(lldb) r
+Process 1234 stopped
+* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.3
+    frame #0: 0x00007fff06013ca0 libsystem_kernel.dylib`read
+(lldb) frame variable
+(int) fd = 3
+```
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/extending-target-stop-hooks.md b/lldb/docs/use/tutorials/extending-target-stop-hooks.md
new file mode 100644
index 0000000000000..232187d0dcf11
--- /dev/null
+++ b/lldb/docs/use/tutorials/extending-target-stop-hooks.md
@@ -0,0 +1,25 @@
+# Extending Target Stop-Hooks
+
+Stop hooks fire whenever the process stops just before control is returned to the
+user.  Stop hooks can either be a set of lldb command-line commands, or can
+be implemented by a suitably defined Python class.  The Python-based stop-hooks
+can also be passed a set of -key -value pairs when they are added, and those
+will get packaged up into a `SBStructuredData` Dictionary and passed to the
+constructor of the Python object managing the stop hook.  This allows for
+parameterization of the stop hooks.
+
+To add a Python-based stop hook, first define a class with the following methods:
+
+| Name | Arguments | Description |
+|------|-----------|-------------|
+| `__init__` | `target: lldb.SBTarget` `extra_args: lldb.SBStructuredData` | This is the constructor for the new stop-hook. `target` is the SBTarget to which the stop hook is added. `extra_args` is an SBStructuredData object that the user can pass in when creating instances of this stop-hook. It is not required, but allows for reuse of stop-hook classes. |
+| `handle_stop` | `exe_ctx: lldb.SBExecutionContext` `stream: lldb.SBStream` | This is called when the target stops. The `exe_ctx` argument will be filled with the current stop point for which the stop hook is being evaluated. `stream` is an lldb.SBStream; anything written to this stream will be written to the debugger console. The return value is a "Should Stop" vote from this thread. If the method returns True or returns nothing, this thread votes to stop. If it returns False, then the thread votes to continue after all the stop-hooks are evaluated. Note, the --auto-continue flag to 'target stop-hook add' overrides a True return value from the method. |
+
+To use this class in lldb, run the command:
+
+```
+(lldb) command script import MyModule.py
+(lldb) target stop-hook add -P MyModule.MyStopHook -k first -v 1 -k second -v 2
+```
+
+where `MyModule.py` is the file containing the class definition `MyStopHook`.
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/implementing-standalone-scripts.md b/lldb/docs/use/tutorials/implementing-standalone-scripts.md
new file mode 100644
index 0000000000000..b8aaacf22fc2e
--- /dev/null
+++ b/lldb/docs/use/tutorials/implementing-standalone-scripts.md
@@ -0,0 +1,134 @@
+# Implementing Standalone Scripts
+
+### Configuring `PYTHONPATH`
+
+LLDB has all of its core code built into a shared library which gets used by
+the `lldb` command line application.
+- On macOS this shared library is a framework: `LLDB.framework`.
+- On other unix variants it is a shared library: lldb.so.
+
+LLDB also provides an `lldb.py` module that contains the bindings from LLDB
+into Python. To use the `LLDB.framework` to create your own stand-alone python
+programs, you will need to tell python where to look in order to find this
+module. This is done by setting the `PYTHONPATH` environment variable,
+adding a path to the directory that contains the `lldb.py` python
+module. The lldb driver program has an option to report the path to the lldb
+module. You can use that to point to the correct lldb.py:
+
+For csh and tcsh:
+
+```csh
+% setenv PYTHONPATH `lldb -P`
+```
+
+For sh and bash:
+
+```bash
+$ export PYTHONPATH=`lldb -P`
+```
+
+Alternatively, you can append the LLDB Python directory to the sys.path list
+directly in your Python code before importing the lldb module.
+
+### Initialization
+
+The standard test for `__main__`, like many python modules do, is useful for
+creating scripts that can be run from the command line. However, for command
+line scripts, the debugger instance must be created manually. Sample code would
+look like:
+
+```python3
+if __name__ == '__main__':
+    # Initialize the debugger before making any API calls.
+    lldb.SBDebugger.Initialize()
+    # Create a new debugger instance in your module if your module
+    # can be run from the command line. When we run a script from
+    # the command line, we won't have any debugger object in
+    # lldb.debugger, so we can just create it if it will be needed
+    debugger = lldb.SBDebugger.Create()
+
+    # Next, do whatever work this module should do when run as a command.
+    # ...
+
+    # Finally, dispose of the debugger you just made.
+    lldb.SBDebugger.Destroy(debugger)
+    # Terminate the debug session
+    lldb.SBDebugger.Terminate()
+```
+
+### Example
+
+Now your python scripts are ready to import the lldb module. Below is a python
+script that will launch a program from the current working directory called
+`a.out`, set a breakpoint at `main`, and then run and hit the breakpoint, and
+print the process, thread and frame objects if the process stopped:
+
+```python3
+#!/usr/bin/env python3
+
+import lldb
+import os
+
+def disassemble_instructions(insts):
+    for i in insts:
+        print(i)
+
+# Set the path to the executable to debug
+exe = "./a.out"
+
+# Create a new debugger instance
+debugger = lldb.SBDebugger.Create()
+
+# When we step or continue, don't return from the function until the process
+# stops. Otherwise we would have to handle the process events ourselves which, while doable, is
+# a little tricky.  We do this by setting the async mode to false.
+debugger.SetAsync(False)
+
+# Create a target from a file and arch
+print("Creating a target for '%s'" % exe)
+
+target = debugger.CreateTargetWithFileAndArch(exe, lldb.LLDB_ARCH_DEFAULT)
+
+if target:
+    # If the target is valid set a breakpoint at main
+    main_bp = target.BreakpointCreateByName(
+        "main", target.GetExecutable().GetFilename()
+    )
+
+    print(main_bp)
+
+    # Launch the process. Since we specified synchronous mode, we won't return
+    # from this function until we hit the breakpoint at main
+    process = target.LaunchSimple(None, None, os.getcwd())
+
+    # Make sure the launch went ok
+    if process:
+        # Print some simple process info
+        state = process.GetState()
+        print(process)
+        if state == lldb.eStateStopped:
+            # Get the first thread
+            thread = process.GetThreadAtIndex(0)
+            if thread:
+                # Print some simple thread info
+                print(thread)
+                # Get the first frame
+                frame = thread.GetFrameAtIndex(0)
+                if frame:
+                    # Print some simple frame info
+                    print(frame)
+                    function = frame.GetFunction()
+                    # See if we have debug info (a function)
+                    if function:
+                        # We do have a function, print some info for the function
+                        print(function)
+                        # Now get all instructions for this function and print them
+                        insts = function.GetInstructions(target)
+                        disassemble_instructions(insts)
+                    else:
+                        # See if we have a symbol in the symbol table for where we stopped
+                        symbol = frame.GetSymbol()
+                        if symbol:
+                            # We do have a symbol, print some info for the symbol
+                            print(symbol)
+```
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/python-embedded-interpreter.md b/lldb/docs/use/tutorials/python-embedded-interpreter.md
new file mode 100644
index 0000000000000..719d746b35d43
--- /dev/null
+++ b/lldb/docs/use/tutorials/python-embedded-interpreter.md
@@ -0,0 +1,66 @@
+# Embedded Python Interpreter
+
+The embedded python interpreter can be accessed in a variety of ways from
+within LLDB. The easiest way is to use the lldb command script with no
+arguments at the lldb command prompt:
+
+```python3
+(lldb) script
+Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
+>>> 2+3
+5
+>>> hex(12345)
+'0x3039'
+>>>
+```
+
+This drops you into the embedded python interpreter. When running under the
+script command, lldb sets some convenience variables that give you quick access
+to the currently selected entities that characterize the program and debugger
+state. In each case, if there is no currently selected entity of the
+appropriate type, the variable's IsValid method will return false. These
+variables are:
+
+| Variable | Type | Equivalent | Description |
+|----------|------|------------|-------------|
+| `lldb.debugger` | `lldb.SBDebugger` | `SBTarget.GetDebugger` | Contains the debugger object whose `script` command was invoked. The `lldb.SBDebugger` object owns the command interpreter and all the targets in your debug session. There will always be a Debugger in the embedded interpreter. |
+| `lldb.target` | `lldb.SBTarget` | `SBDebugger.GetSelectedTarget` `SBProcess.GetTarget` | Contains the currently selected target - for instance the one made with the `file` command or selected by the `target select <target-index>` command. The `lldb.SBTarget` manages one running process, and all the executable and debug files for the process. |
+| `lldb.process` | `lldb.SBProcess` | `SBTarget.GetProcess` `SBThread.GetProcess` | Contains the process of the currently selected target. The `lldb.SBProcess` object manages the threads and allows access to memory for the process. |
+| `lldb.thread` | `lldb.SBThread` | `SBProcess.GetSelectedThread` `SBFrame.GetThread` | Contains the currently selected thread. The `lldb.SBThread` object manages the stack frames in that thread. A thread is always selected in the command interpreter when a target stops. The `thread select <thread-index>` command can be used to change the currently selected thread. So as long as you have a stopped process, there will be some selected thread. |
+| `lldb.frame` | `lldb.SBFrame` | `SBThread.GetSelectedFrame` | Contains the currently selected stack frame. The `lldb.SBFrame` object manages the stack locals and the register set for that stack. A stack frame is always selected in the command interpreter when a target stops. The `frame select <frame-index>` command can be used to change the currently selected frame. So as long as you have a stopped process, there will be some selected frame. |
+
+While extremely convenient, these variables have a couple of caveats that you
+should be aware of. First of all, they hold the values of the selected objects
+on entry to the embedded interpreter. They do not update as you use the LLDB
+API's to change, for example, the currently selected stack frame or thread.
+
+Moreover, they are only defined and meaningful while in the interactive Python
+interpreter. There is no guarantee on their value in any other situation, hence
+you should not use them when defining Python formatters, breakpoint scripts and
+commands (or any other Python extension point that LLDB provides). For the
+latter you'll be passed an `SBDebugger`, `SBTarget`, `SBProcess`, `SBThread` or
+`SBFrame` instance and you can use the functions from the "Equivalent" column
+to navigate between them.
+
+As a rationale for such behavior, consider that lldb can run in a multithreaded
+environment, and another thread might call the "script" command, changing the
+value out from under you.
+
+To get started with these objects and LLDB scripting, please note that almost
+all of the lldb Python objects are able to briefly describe themselves when you
+pass them to the Python print function:
+
+```python3
+(lldb) script
+Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
+>>> print(lldb.debugger)
+Debugger (instance: "debugger_1", id: 1)
+>>> print(lldb.target)
+a.out
+>>> print(lldb.process)
+SBProcess: pid = 58842, state = stopped, threads = 1, executable = a.out
+>>> print(lldb.thread)
+thread #1: tid = 0x2265ce3, 0x0000000100000334 a.out`main at t.c:2:3, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
+>>> print(lldb.frame)
+frame #0: 0x0000000100000334 a.out`main at t.c:2:3
+```
\ No newline at end of file
diff --git a/lldb/docs/use/tutorials/script-driven-debugging.md b/lldb/docs/use/tutorials/script-driven-debugging.md
new file mode 100644
index 0000000000000..55b90b1e25bf5
--- /dev/null
+++ b/lldb/docs/use/tutorials/script-driven-debugging.md
@@ -0,0 +1,492 @@
+# Script-Driven Debugging
+
+LLDB has been structured from the beginning to be scriptable in two
+ways:
+- a Unix Python session can initiate/run a debug session non-interactively
+using LLDB;
+- and within the LLDB debugger tool, Python scripts can be used to help with
+many tasks, including inspecting program data, iterating over containers and
+determining if a breakpoint should stop execution or continue.
+
+This document will show how to do some of these things by going through an
+example, explaining how to use Python scripting to find a bug in a program
+that searches for text in a large binary tree.
+
+### The Test Program and Input
+
+We have a simple C program ([dictionary.c](https://github.com/llvm/llvm-project/blob/main/lldb/examples/scripting/dictionary.c))
+that reads in a text file, and stores all the words from the file in a
+Binary Search Tree, sorted alphabetically. It then enters a loop
+prompting the user for a word, searching for the word in the tree
+(using Binary Search), and reporting to the user whether or not it found
+the word in the tree.
+
+The input text file we are using to test our program contains the text
+for William Shakespeare's famous tragedy "Romeo and Juliet".
+
+### The Bug
+
+When we try running our program, we find there is a problem. While it
+successfully finds some of the words we would expect to find, such as
+"love" or "sun", it fails to find the word "Romeo", which **MUST** be in
+the input text file:
+
+```shell
+$ ./dictionary Romeo-and-Juliet.txt
+Dictionary loaded.
+Enter search word: love
+Yes!
+Enter search word: sun
+Yes!
+Enter search word: Romeo
+No!
+Enter search word: ^D
+$
+```
+
+### Using Depth First Search
+
+Our first job is to determine if the word "Romeo" actually got inserted
+into the tree or not. Since "Romeo and Juliet" has thousands of words,
+trying to examine our binary search tree by hand is completely
+impractical. Therefore we will write a Python script to search the tree
+for us. We will write a recursive Depth First Search function that
+traverses the entire tree searching for a word, and maintaining
+information about the path from the root of the tree to the current
+node. If it finds the word in the tree, it returns the path from the
+root to the node containing the word. This is what our DFS function in
+Python would look like, with line numbers added for easy reference in
+later explanations:
+
+```python3
+1: def DFS (root, word, cur_path):
+2:     root_word_ptr = root.GetChildMemberWithName ("word")
+3:     left_child_ptr = root.GetChildMemberWithName ("left")
+4:     right_child_ptr = root.GetChildMemberWithName ("right")
+5:     root_word = root_word_ptr.GetSummary()
+6:     end = len (root_word) - 1
+7:     if root_word[0] == '"' and root_word[end] == '"':
+8:         root_word = root_word[1:end]
+9:     end = len (root_word) - 1
+10:     if root_word[0] == '\'' and root_word[end] == '\'':
+11:         root_word = root_word[1:end]
+12:     if root_word == word:
+13:         return cur_path
+14:     elif word < root_word:
+15:         if left_child_ptr.GetValue() is None:
+16:             return ""
+17:         else:
+18:             cur_path = cur_path + "L"
+19:             return DFS (left_child_ptr, word, cur_path)
+20:     else:
+21:         if right_child_ptr.GetValue() is None:
+22:             return ""
+23:         else:
+24:             cur_path = cur_path + "R"
+25:             return DFS (right_child_ptr, word, cur_path)
+```
+
+### Accessing & Manipulating Program Variables
+
+Before we can call any Python function on any of our program's
+variables, we need to get the variable into a form that Python can
+access. To show you how to do this we will look at the parameters for
+the DFS function. The first parameter is going to be a node in our
+binary search tree, put into a Python variable. The second parameter is
+the word we are searching for (a string), and the third parameter is a
+string representing the path from the root of the tree to our current
+node.
+
+The most interesting parameter is the first one, the Python variable
+that needs to contain a node in our search tree. How can we take a
+variable out of our program and put it into a Python variable? What
+kind of Python variable will it be? The answers are to use the LLDB API
+functions, provided as part of the LLDB Python module. When running Python
+from inside LLDB, LLDB will automatically give us our current frame
+object as a Python variable, "lldb.frame". This variable has the type
+`SBFrame` (see the LLDB API for more information about `SBFrame`
+objects). One of the things we can do with a frame object is to ask it
+to find and return one of its local variables. We will call the API function
+`SBFrame.FindVariable` on the `lldb.frame` object to give us our
+dictionary variable as a Python variable:
+
+```python3
+root = lldb.frame.FindVariable ("dictionary")
+```
+
+The line above, executed in the Python script interpreter in LLDB, asks the
+current frame to find the variable named "dictionary" and return it. We then
+store the returned value in the Python variable named "root". This answers the
+question of HOW to get the variable, but it still doesn't explain WHAT actually
+gets put into "root". If you examine the LLDB API, you will find that the
+`SBFrame` method "FindVariable" returns an object of type `SBValue`. `SBValue`
+objects are used, among other things, to wrap up program variables and values.
+There are many useful methods defined in the `SBValue` class to allow you to get
+information or children values out of SBValues. For complete information, see
+the header file SBValue.h. The `SBValue` methods that we use in our DFS function
+are `GetChildMemberWithName()`, `GetSummary()`, and `GetValue()`.
+
+### Explaining DFS Script in Detail
+
+Before diving into the details of this code, it would be best to give a
+high-level overview of what it does. The nodes in our binary search tree were
+defined to have type `tree_node *`, which is defined as:
+
+```c++
+typedef struct tree_node
+{
+   const char *word;
+   struct tree_node *left;
+   struct tree_node *right;
+} tree_node;
+```
+
+Lines 2-11 of DFS are getting data out of the current tree node and getting
+ready to do the actual search; lines 12-25 are the actual depth-first search.
+Lines 2-4 of our DFS function get the word, left and right fields out of the
+current node and store them in Python variables. Since root_word_ptr is a
+pointer to our word, and we want the actual word, line 5 calls GetSummary() to
+get a string containing the value out of the pointer. Since GetSummary() adds
+quotes around its result, lines 6-11 strip surrounding quotes off the word.
+
+Line 12 checks to see if the word in the current node is the one we are
+searching for. If so, we are done, and line 13 returns the current path.
+Otherwise, line 14 checks to see if we should go left (search word comes before
+the current word). If we decide to go left, line 15 checks to see if the left
+pointer child is NULL ("None" is the Python equivalent of NULL). If the left
+pointer is NULL, then the word is not in this tree and we return an empty path
+(line 16). Otherwise, we add an "L" to the end of our current path string, to
+indicate we are going left (line 18), and then recurse on the left child (line
+19). Lines 20-25 are the same as lines 14-19, except for going right rather
+than going left.
+
+One other note: Typing something as long as our DFS function directly into the
+interpreter can be difficult, as making a single typing mistake means having to
+start all over. Therefore we recommend doing as we have done: Writing your
+longer, more complicated script functions in a separate file (in this case
+tree_utils.py) and then importing it into your LLDB Python interpreter.
+
+### The DFS Script in Action
+
+At this point we are ready to use the DFS function to see if the word "Romeo"
+is in our tree or not. To actually use it in LLDB on our dictionary program,
+you would do something like this:
+
+```c++
+$ lldb
+(lldb) process attach -n "dictionary"
+Architecture set to: x86_64.
+Process 521 stopped
+* thread #1: tid = 0x2c03, 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8, stop reason = signal SIGSTOP
+frame #0: 0x00007fff86c8bea0 libSystem.B.dylib`read$NOCANCEL + 8
+(lldb) breakpoint set -n find_word
+Breakpoint created: 1: name = 'find_word', locations = 1, resolved = 1
+(lldb) continue
+Process 521 resuming
+Process 521 stopped
+* thread #1: tid = 0x2c03, 0x0000000100001830 dictionary`find_word + 16
+at dictionary.c:105, stop reason = breakpoint 1.1
+frame #0: 0x0000000100001830 dictionary`find_word + 16 at dictionary.c:105
+102 int
+103 find_word (tree_node *dictionary, char *word)
+104 {
+-> 105 if (!word || !dictionary)
+106 return 0;
+107
+108 int compare_value = strcmp (word, dictionary->word);
+(lldb) script
+```
+```python3
+Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
+>>> import tree_utils
+>>> root = lldb.frame.FindVariable ("dictionary")
+>>> current_path = ""
+>>> path = tree_utils.DFS (root, "Romeo", current_path)
+>>> print(path)
+LLRRL
+>>> ^D
+(lldb)
+```
+
+The first bit of code above shows starting lldb, attaching to the dictionary
+program, and getting to the find_word function in LLDB. The interesting part
+(as far as this example is concerned) begins when we enter the script command
+and drop into the embedded interactive Python interpreter. We will go over this
+Python code line by line. The first line
+
+```python3
+import tree_utils
+```
+
+imports the file where we wrote our DFS function, tree_utils.py, into Python.
+Notice that to import the file we leave off the ".py" extension. We can now
+call any function in that file, giving it the prefix "tree_utils.", so that
+Python knows where to look for the function. The line
+
+```python3
+root = lldb.frame.FindVariable ("dictionary")
+```
+
+gets our program variable "dictionary" (which contains the binary search tree)
+and puts it into the Python variable "root". See Accessing & Manipulating
+Program Variables in Python above for more details about how this works. The
+next line is
+
+```python3
+current_path = ""
+```
+
+This line initializes current_path, the path from the root of the tree to our
+current node. Since we start at the root of the tree, our current path starts as
+an empty string. As we go right and left through the tree, the DFS function
+will append an 'R' or an 'L' to the current path, as appropriate. The line
+
+```python3
+path = tree_utils.DFS (root, "Romeo", current_path)
+```
+
+calls our DFS function (prefixing it with the module name so that Python can
+find it). We pass in our binary tree stored in the variable root, the word we
+are searching for, and our current path. We assign whatever path the DFS
+function returns to the Python variable path.
+
+Finally, we want to see if the word was found or not, and if so we want to see
+the path through the tree to the word. So we do
+
+```python3
+print(path)
+```
+
+From this we can see that the word "Romeo" was indeed found in the tree, and
+the path from the root of the tree to the node containing "Romeo" is
+left-left-right-right-left.
+
+### Using Breakpoint Command Scripts
+
+We are halfway to figuring out what the problem is. We know the word we are
+looking for is in the binary tree, and we know exactly where it is in the
+binary tree. Now we need to figure out why our binary search algorithm is not
+finding the word. We will do this using breakpoint command scripts.
+
+The idea is as follows. The binary search algorithm has two main decision
+points: the decision to follow the right branch, and the decision to follow
+the left branch. We will set a breakpoint at each of these decision points, and
+attach a Python breakpoint command script to each breakpoint. The breakpoint
+commands will use the global path Python variable that we got from our DFS
+function. Each time one of these decision breakpoints is hit, the script will
+compare the actual decision with the decision the front of the path variable
+says should be made (the first character of the path). If the actual decision
+and the path agree, then the front character is stripped off the path, and
+execution is resumed. In this case the user never even sees the breakpoint
+being hit. But if the decision differs from what the path says it should be,
+then the script prints out a message and does NOT resume execution, leaving the
+user sitting at the first point where a wrong decision is being made.
+
+### Python Breakpoint Command Scripts Are Not What They Seem
+
+What do we mean by that? When you enter a Python breakpoint command in LLDB, it
+appears that you are entering one or more plain lines of Python. BUT LLDB then
+takes what you entered and wraps it into a Python FUNCTION (just like using the
+"def" Python command). It automatically gives the function an obscure, unique,
+hard-to-stumble-across function name, and gives it two parameters: frame and
+bp_loc. When the breakpoint gets hit, LLDB wraps up the frame object where the
+breakpoint was hit, and the breakpoint location object for the breakpoint that
+was hit, and puts them into Python variables for you. It then calls the Python
+function that was created for the breakpoint command, and passes in the frame
+and breakpoint location objects.
+
+So, being practical, what does this mean for you when you write your Python
+breakpoint commands? It means that there are two things you need to keep in
+mind: 1. If you want to access any Python variables created outside your
+script, you must declare such variables to be global. If you do not declare
+them as global, then the Python function will treat them as local variables,
+and you will get unexpected behavior. 2. All Python breakpoint command scripts
+automatically have a frame and a bp_loc variable. The variables are pre-loaded
+by LLDB with the correct context for the breakpoint. You do not have to use
+these variables, but they are there if you want them.
+
+### The Decision Point Breakpoint Commands
+
+This is what the Python breakpoint command script would look like for the
+decision to go right:
+
+```python3
+global path
+if path[0] == 'R':
+   path = path[1:]
+   thread = frame.GetThread()
+   process = thread.GetProcess()
+   process.Continue()
+else:
+   print("Here is the problem; going right, should go left!")
+```
+
+Just as a reminder, LLDB is going to take this script and wrap it up in a function, like this:
+
+```python3
+def some_unique_and_obscure_function_name (frame, bp_loc):
+   global path
+   if path[0] == 'R':
+      path = path[1:]
+      thread = frame.GetThread()
+      process = thread.GetProcess()
+      process.Continue()
+   else:
+      print("Here is the problem; going right, should go left!")
+```
+
+LLDB will call the function, passing in the correct frame and breakpoint
+location whenever the breakpoint gets hit. There are several things to notice
+about this function. The first one is that we are accessing and updating a
+piece of state (the path variable), and actually conditioning our behavior
+based upon this variable. Since the variable was defined outside of our script
+(and therefore outside of the corresponding function) we need to tell Python
+that we are accessing a global variable. That is what the first line of the
+script does. Next we check where the path says we should go and compare it to
+our decision (recall that we are at the breakpoint for the decision to go
+right). If the path agrees with our decision, then we strip the first character
+off of the path.
+
+Since the decision matched the path, we want to resume execution. To do this we
+make use of the frame parameter that LLDB guarantees will be there for us. We
+use LLDB API functions to get the current thread from the current frame, and
+then to get the process from the thread. Once we have the process, we tell it
+to resume execution (using the Continue() API function).
+
+If the decision to go right does not agree with the path, then we do not resume
+execution. We allow the breakpoint to remain stopped (by doing nothing), and we
+print an informational message telling the user we have found the problem, and
+what the problem is.
+
+### Actually Using The Breakpoint Commands
+
+Now we will look at what happens when we actually use these breakpoint commands
+on our program. Doing a source list -n find_word shows us the function
+containing our two decision points. Looking at the code below, we see that we
+want to set our breakpoints on lines 113 and 115:
+
+```c++
+(lldb) source list -n find_word
+File: /Volumes/Data/HD2/carolinetice/Desktop/LLDB-Web-Examples/dictionary.c.
+101
+102 int
+103 find_word (tree_node *dictionary, char *word)
+104 {
+105   if (!word || !dictionary)
+106     return 0;
+107
+108   int compare_value = strcmp (word, dictionary->word);
+109
+110   if (compare_value == 0)
+111     return 1;
+112   else if (compare_value < 0)
+113     return find_word (dictionary->left, word);
+114   else
+115     return find_word (dictionary->right, word);
+116 }
+117
+```
+
+So, we set our breakpoints, enter our breakpoint command scripts, and see what happens:
+
+```c++
+(lldb) breakpoint set -l 113
+Breakpoint created: 2: file ="dictionary.c", line = 113, locations = 1, resolved = 1
+(lldb) breakpoint set -l 115
+Breakpoint created: 3: file ="dictionary.c", line = 115, locations = 1, resolved = 1
+(lldb) breakpoint command add -s python 2
+```
+```python3
+Enter your Python command(s). Type 'DONE' to end.
+> global path
+> if (path[0] == 'L'):
+>     path = path[1:]
+>     thread = frame.GetThread()
+>     process = thread.GetProcess()
+>     process.Continue()
+> else:
+>     print("Here is the problem. Going left, should go right!")
+> DONE
+```
+```c++
+(lldb) breakpoint command add -s python 3
+```
+```python3
+Enter your Python command(s). Type 'DONE' to end.
+> global path
+> if (path[0] == 'R'):
+>     path = path[1:]
+>     thread = frame.GetThread()
+>     process = thread.GetProcess()
+>     process.Continue()
+> else:
+>     print("Here is the problem. Going right, should go left!")
+> DONE
+```
+```c++
+(lldb) continue
+Process 696 resuming
+Here is the problem. Going right, should go left!
+Process 696 stopped
+* thread #1: tid = 0x2d03, 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115, stop reason = breakpoint 3.1
+frame #0: 0x000000010000189f dictionary`find_word + 127 at dictionary.c:115
+   112   else if (compare_value < 0)
+   113     return find_word (dictionary->left, word);
+   114   else
+-> 115     return find_word (dictionary->right, word);
+   116 }
+   117
+   118 void
+(lldb)
+```
+
+After setting our breakpoints, adding our breakpoint commands and continuing,
+we run for a little bit and then hit one of our breakpoints, printing out the
+error message from the breakpoint command. Apparently at this point in the
+tree, our search algorithm decided to go right, but our path says the node we
+want is to the left. Examining the word at the node where we stopped, and our
+search word, we see:
+
+```c++
+(lldb) expr dictionary->word
+(const char *) $1 = 0x0000000100100080 "dramatis"
+(lldb) expr word
+(char *) $2 = 0x00007fff5fbff108 "romeo"
+```
+
+So the word at our current node is "dramatis", and the word we are searching
+for is "romeo". "romeo" comes after "dramatis" alphabetically, so it seems like
+going right would be the correct decision. Let's ask Python what it thinks the
+path from the current node to our word is:
+
+```c++
+(lldb) script print(path)
+LLRRL
+```
+
+According to Python we need to go left-left-right-right-left from our current
+node to find the word we are looking for. Let's double check our tree, and see
+what word it has at that node:
+
+```c++
+(lldb) expr dictionary->left->left->right->right->left->word
+(const char *) $4 = 0x0000000100100880 "Romeo"
+```
+
+So the word we are searching for is "romeo" and the word at our DFS location is
+"Romeo". Aha! One is uppercase and the other is lowercase: We seem to have a
+case conversion problem somewhere in our program (we do).
+
+This is the end of our example on how you might use Python scripting in LLDB to
+help you find bugs in your program.
+
+### Sources
+
+The complete code for the Dictionary program (with case-conversion bug), the
+DFS function and other Python script examples used for this example are
+available below.
+
+- [tree_utils.py](https://github.com/llvm/llvm-project/blob/main/lldb/examples/scripting/tree_utils.py) - Example Python functions using LLDB's API, including DFS
+- [dictionary.c](https://github.com/llvm/llvm-project/blob/main/lldb/examples/scripting/dictionary.c) - Sample dictionary program, with bug
+- The text for "Romeo and Juliet" can be obtained from [the Gutenberg Project](https://www.gutenberg.org).
+
diff --git a/lldb/docs/use/tutorials/writing-custom-commands.md b/lldb/docs/use/tutorials/writing-custom-commands.md
new file mode 100644
index 0000000000000..d53b7e473a505
--- /dev/null
+++ b/lldb/docs/use/tutorials/writing-custom-commands.md
@@ -0,0 +1,429 @@
+# Writing Custom Commands
+
+### Create a new command using a Python function
+
+Python functions can be used to create new LLDB command interpreter commands,
+which will work like all the natively defined lldb commands. This provides a
+very flexible and easy way to extend LLDB to meet your debugging requirements.
+
+To write a Python function that implements a new LLDB command, define the
+function to take five arguments as follows:
+
+```python3
+def command_function(debugger, command, exe_ctx, result, internal_dict):
+    # Your code goes here
+```
+
+The meaning of the arguments is given in the table below.
+
+If you provide a Python docstring in your command function LLDB will use it
+when providing "long help" for your command, as in:
+
+```python3
+def command_function(debugger, command, exe_ctx, result, internal_dict):
+    """This command takes a lot of options and does many fancy things"""
+    # Your code goes here
+```
+
+though providing help can also be done programmatically (see below).
+
+Prior to lldb 3.5.2 (April 2015), LLDB Python command definitions didn't take the SBExecutionContext
+argument. So you may still see commands where the command definition is:
+
+```python3
+def command_function(debugger, command, result, internal_dict):
+    # Your code goes here
+```
+
+Using this form is strongly discouraged because it can only operate on the "currently selected"
+target, process, thread, frame.  The command will behave as expected when run
+directly on the command line.  But if the command is used in a stop-hook, breakpoint
+callback, etc. where the response to the callback determines whether we will select
+this or that particular process/frame/thread, the global "currently selected"
+entity is not necessarily the one the callback is meant to handle.  In that case, this
+command definition form can't do the right thing.
+
+| Argument | Type | Description |
+|----------|------|-------------|
+| `debugger` | `lldb.SBDebugger` | The current debugger object. |
+| `command` | `python string` | A python string containing all arguments for your command. If you need to chop up the arguments try using the `shlex` module's `shlex.split(command)` to properly extract the arguments. |
+| `exe_ctx` | `lldb.SBExecutionContext` | An execution context object carrying around information on the inferior process' context in which the command is expected to act. *Optional since lldb 3.5.2, unavailable before.* |
+| `result` | `lldb.SBCommandReturnObject` | A return object which encapsulates success/failure information for the command and output text that needs to be printed as a result of the command. The plain Python "print" command also works but text won't go in the result by default (it is useful as a temporary logging facility). |
+| `internal_dict` | `python dict object` | The dictionary for the current embedded script session which contains all variables and functions. |
+
+### Create a new command using a Python class
+
+Since lldb 3.7, Python commands can also be implemented by means of a class
+which should implement the following interface:
+
+```python3
+class CommandObjectType:
+    def __init__(self, debugger, internal_dict):
+        # this call should initialize the command with respect to the command interpreter for the passed-in debugger
+
+    def __call__(self, debugger, command, exe_ctx, result):
+        # this is the actual bulk of the command, akin to Python command functions
+
+    def get_short_help(self):
+        # this call should return the short help text for this command[1]
+
+    def get_long_help(self):
+        # this call should return the long help text for this command[1]
+
+    def get_flags(self):
+        # this will be called when the command is added to the command interpreter,
+        # and should return a flag field made from or-ing together the appropriate
+        # elements of the lldb.CommandFlags enum to specify the requirements of this command.
+        # The CommandInterpreter will make sure all these requirements are met, and will
+        # return the standard lldb error if they are not.[1]
+
+    def get_repeat_command(self, command):
+        # The auto-repeat command is what will get executed when the user types just
+        # a return at the next prompt after this command is run.  Even if your command
+        # was run because it was specified as a repeat command, that invocation will still
+        # get asked for ITS repeat command, so you can chain a series of repeats, for instance
+        # to implement a pager.
+
+        # The command argument is the command that is about to be executed.
+
+        # If this call returns None, then the ordinary repeat mechanism will be used
+        # If this call returns an empty string, then auto-repeat is disabled
+        # If this call returns any other string, that will be the repeat command [1]
+```
+
+[1] This method is optional.
+
+As a convenience, you can treat the result object as a Python file object, and
+say
+
+```python3
+print("my command does lots of cool stuff", file=result)
+```
+
+`SBCommandReturnObject` and `SBStream` both support this file-like behavior by
+providing `write()` and `flush()` calls at the Python layer.
+
+### Parsed Commands
+
+The commands that are added using this class definition are what lldb calls
+"raw" commands.  The command interpreter doesn't attempt to parse the command,
+doesn't handle option values, and generates neither help nor completion for
+them.  Raw commands are useful when the arguments passed to the command
+are unstructured, and having to protect them against lldb command parsing would
+be onerous.  For instance, "expr" is a raw command.
+
+You can also add scripted commands that implement the "parsed command", where
+the options and their types are specified, as well as the argument and argument
+types.  These commands look and act like the majority of lldb commands, and you
+can also add custom completions for the options and/or the arguments if you have
+special needs.
+
+The easiest way to do this is to derive your new command from the lldb.ParsedCommand
+class.  That responds in the same way to the help & repeat command interfaces, and
+provides some convenience methods, and most importantly an LLDBOptionValueParser,
+accessed through lldb.ParsedCommand.get_parser().  The parser is used to set
+your command definitions, and to retrieve option values in the `__call__` method.
+
+To set up the command definition, implement the ParsedCommand abstract method:
+
+```python3
+def setup_command_definition(self):
+```
+
+This is called when your command is added to lldb.  In this method you add the
+options and their types, the option help strings, etc. to the command using the API:
+
+```python3
+def add_option(self, short_option, long_option, help, default,
+               dest = None, required=False, groups = None,
+               value_type=lldb.eArgTypeNone, completion_type=None,
+               enum_values=None):
+    """
+    short_option: one character, must be unique, not required
+    long_option:  no spaces, must be unique, required
+    help:         a usage string for this option, will print in the command help
+    default:      the initial value for this option (if it has a value)
+    dest:         the name of the property that gives you access to the value for
+                  this option.  Defaults to the long option if not provided.
+    required: if true, this option must be provided or the command will error out
+    groups: Which "option groups" does this option belong to.  This can either be
+            a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists:
+            so [1, [3,5]] is the same as [1, 3, 4, 5].
+    value_type: one of the lldb.eArgType enum values.  Some of the common arg
+                types also have default completers, which will be applied automatically.
+    completion_type: currently these are values from the lldb.CompletionType enum.  If
+                     you need custom completions, implement handle_option_argument_completion.
+    enum_values: An array of duples: ["element_name", "element_help"].  If provided,
+                 only one of the enum elements is allowed.  The value will be the
+                 element_name for the chosen enum element as a string.
+    """
+```
+
+Similarly, you can add argument types to the command:
+
+```python3
+def make_argument_element(self, arg_type, repeat = "optional", groups = None):
+    """
+    arg_type: The argument type, one of the lldb.eArgType enum values.
+    repeat: Choose from the following options:
+            "plain" - one value
+            "optional" - zero or more values
+            "plus" - one or more values
+    groups: As with add_option.
+    """
+```
+
+Then implement the body of the command by defining:
+
+```python3
+def __call__(self, debugger, args_array, exe_ctx, result):
+    """This is the command callback.  The option values are
+    provided by the 'dest' properties on the parser.
+
+    args_array: This is the list of arguments provided.
+    exe_ctx: Gives the SBExecutionContext on which the
+             command should operate.
+    result:  Any results of the command should be
+             written into this SBCommandReturnObject.
+    """
+```
+
+This differs from the "raw" command's `__call__` in that the arguments are already
+parsed into the args_array, and the option values are set in the parser, and
+can be accessed using their property name.  The LLDBOptionValueParser class has
+a couple of other handy methods:
+
+```python3
+def was_set(self, long_option_name):
+```
+
+returns `True` if the option was specified on the command line.
+
+```python
+def dest_for_option(self, long_option_name):
+"""
+This will return the value of the dest variable you defined for long_option_name.
+Mostly useful for handle_completion where you get passed the long option.
+"""
+```
+
+### Completion
+
+lldb will handle completing your option names, and all your enum values
+automatically.  If your option or argument types have associated built-in completers,
+then lldb will also handle that completion for you.  But if you have a need for
+custom completions, either in your arguments or option values, you can handle
+completion by hand as well.  To handle completion of option value arguments,
+your lldb.ParsedCommand subclass should implement:
+
+```python3
+def handle_option_argument_completion(self, long_option, cursor_pos):
+"""
+long_option: The long option name of the option whose value you are
+             asked to complete.
+cursor_pos: The cursor position in the value for that option - which
+you can get from the option parser.
+"""
+```
+
+And to handle the completion of arguments:
+
+```python3
+def handle_argument_completion(self, args, arg_pos, cursor_pos):
+"""
+args: A list of the arguments to the command
+arg_pos: An index into the args list of the argument with the cursor
+cursor_pos: The cursor position in the arg specified by arg_pos
+"""
+```
+
+When either of these APIs is called, the command line will have been parsed up to
+the word containing the cursor, and any option values set in that part of the command
+string are available from the option value parser.  That's useful for instance
+if you have a --shared-library option that would constrain the completions for,
+say, a symbol name option or argument.
+
+The return value specifies what the completion options are.  You have four
+choices:
+
+- `True`: the completion was handled with no completions.
+
+- `False`: the completion was not handled, forward it to the regular
+completion machinery.
+
+- A dictionary with the key: "completion": there is one candidate,
+whose value is the value of the "completion" key.  Optionally you can pass a
+"mode" key whose value is either "partial" or "complete".  Return "partial" if
+the "completion" string is a common prefix of all the possible completions.
+
+For instance, if the string you are completing is "Test" and the available completions are:
+"Test1", "Test11" and "Test111", you should return the dictionary:
+
+```python3
+return {"completion": "Test1", "mode" : "partial"}
+```
+
+and then lldb will add the "1" at the cursor and advance it after the added string,
+waiting for more completions.  But if "Test1" is the only completion, return:
+
+```python3
+{"completion": "Test1", "mode": "complete"}
+```
+
+and lldb will add "1 " at the cursor, indicating the command string is complete.
+
+The default is "complete", so you don't need to specify a "mode" in that case.
+
+- A dictionary with the key: "values" whose value is a list of candidate completion
+strings.  The command interpreter will present those strings as the available choices.
+You can optionally include a "descriptions" key, whose value is a parallel array
+of description strings, and the completion will show the description next to
+each completion.
+
+### Loading Commands
+
+One other handy convenience when defining lldb command-line commands is the
+command "command script import" which will import a module specified by file
+path, so you don't have to change your PYTHONPATH for temporary scripts. It
+also has another convenience that if your new script module has a function of
+the form:
+
+```python
+def __lldb_init_module(debugger, internal_dict):
+    # Command Initialization code goes here
+```
+
+where debugger and internal_dict are as above, that function will get run when
+the module is loaded allowing you to add whatever commands you want into the
+current debugger. Note that this function will only be run when using the LLDB
+command `command script import`, it will not get run if anyone imports your
+module from another module.
+
+Another way to load custom commands in lldb is to use the
+`@lldb.command(command_name=None, doc=None)` decorator.
+
+```python3
+@lldb.command()
+def goodstuff(debugger, command, ctx, result, internal_dict):
+    """command help string"""
+    # Command Implementation code goes here
+```
+
+### Examples
+
+Now we can create a module called ls.py in the file ~/ls.py that will implement
+a function that can be used by LLDB's python command code:
+
+```python3
+#!/usr/bin/env python3
+
+import lldb
+import subprocess
+
+def ls(debugger, command, result, internal_dict):
+    output = subprocess.check_output(["/bin/ls"] + command.split(), text=True)
+    print(output, file=result)
+
+# And the initialization code to add your commands
+def __lldb_init_module(debugger, internal_dict):
+    debugger.HandleCommand('command script add -f ls.ls ls')
+    print('The "ls" python command has been installed and is ready for use.')
+```
+
+Now we can load the module into LLDB and use it
+
+```shell
+$ lldb
+(lldb) command script import ~/ls.py
+The "ls" python command has been installed and is ready for use.
+(lldb) ls -l /tmp/
+total 365848
+-rw-------   1 someuser  wheel         7331 Jan 19 15:37 crash.log
+```
+
+You can also make "container" commands to organize the commands you are adding to
+lldb.  Most of the lldb built-in commands structure themselves this way, and using
+a tree structure has the benefit of leaving the one-word command space free for user
+aliases.  It can also make it easier to find commands if you are adding more than
+a few of them.  Here's a trivial example of adding two "utility" commands into a
+"my-utilities" container:
+
+```python3
+#!/usr/bin/env python
+
+import lldb
+
+def first_utility(debugger, command, result, internal_dict):
+    print("I am the first utility")
+
+def second_utility(debugger, command, result, internal_dict):
+    print("I am the second utility")
+
+# And the initialization code to add your commands
+def __lldb_init_module(debugger, internal_dict):
+    debugger.HandleCommand('command container add -h "A container for my utilities" my-utilities')
+    debugger.HandleCommand('command script add -f my_utilities.first_utility -h "My first utility" my-utilities first')
+    debugger.HandleCommand('command script add -f my_utilities.second_utility -h "My second utility" my-utilities second')
+    print('The "my-utilities" python command has been installed and its subcommands are ready for use.')
+```
+
+Then your new commands are available under the my-utilities node:
+
+```
+(lldb) help my-utilities
+A container for my utilities
+
+Syntax: my-utilities
+
+The following subcommands are supported:
+
+    first  -- My first utility  Expects 'raw' input (see 'help raw-input'.)
+    second -- My second utility  Expects 'raw' input (see 'help raw-input'.)
+
+For more help on any particular subcommand, type 'help <command> <subcommand>'.
+(lldb) my-utilities first
+I am the first utility
+```
+
+A more interesting [template](https://github.com/llvm/llvm-project/blob/main/lldb/examples/python/cmdtemplate.py)
+has been created in the source repository that can help you to create lldb commands quickly.
+
+A commonly required facility is being able to create a command that does some
+token substitution, and then runs a different debugger command (usually, it
+po'es the result of an expression evaluated on its argument). For instance,
+given the following program:
+
+```objc
+#import <Foundation/Foundation.h>
+NSString*
+ModifyString(NSString* src)
+{
+	return [src stringByAppendingString:@"foobar"];
+}
+
+int main()
+{
+	NSString* aString = @"Hello world";
+	NSString* anotherString = @"Let's be friends";
+	return 1;
+}
+```
+
+you may want a `pofoo X` command that is equivalent to `po [ModifyString(X)
+capitalizedString]`. The following debugger interaction shows how to achieve
+that goal:
+
+```python3
+(lldb) script
+Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D.
+>>> def pofoo_funct(debugger, command, result, internal_dict):
+...	cmd = "po [ModifyString(" + command + ") capitalizedString]"
+...	debugger.HandleCommand(cmd)
+...
+>>> ^D
+(lldb) command script add pofoo -f pofoo_funct
+(lldb) pofoo aString
+$1 = 0x000000010010aa00 Hello Worldfoobar
+(lldb) pofoo anotherString
+$2 = 0x000000010010aba0 Let's Be Friendsfoobar
+```
\ No newline at end of file

>From 0c3cf200f5b918fb5c1114e9f1764c2d54d1779b Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Fri, 12 Sep 2025 21:48:41 -0700
Subject: [PATCH 33/39] [MemProf] Optionally allow transformation of nobuiltin
 operator new (#158396)

For cases where we can guarantee the application does not override
operator new.
---
 .../llvm/Transforms/Utils/SimplifyLibCalls.h  |  2 +-
 .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 43 +++++++++----
 .../InstCombine/simplify-libcalls-new.ll      | 60 ++++++++++++-------
 3 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index deb3d6c44ef09..4e7c97194cc59 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -189,7 +189,7 @@ class LibCallSimplifier {
   Value *optimizeMemSet(CallInst *CI, IRBuilderBase &B);
   Value *optimizeRealloc(CallInst *CI, IRBuilderBase &B);
   Value *optimizeNew(CallInst *CI, IRBuilderBase &B, LibFunc &Func);
-  Value *optimizeExistingHotColdNew(CallInst *CI, IRBuilderBase &B);
+  Value *maybeOptimizeNoBuiltinOperatorNew(CallInst *CI, IRBuilderBase &B);
   Value *optimizeWcslen(CallInst *CI, IRBuilderBase &B);
   Value *optimizeBCopy(CallInst *CI, IRBuilderBase &B);
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 8acebbaa5458b..4a1565977b91c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -61,6 +61,9 @@ static cl::opt<bool> OptimizeExistingHotColdNew(
     "optimize-existing-hot-cold-new", cl::Hidden, cl::init(false),
     cl::desc(
         "Enable optimization of existing hot/cold operator new library calls"));
+static cl::opt<bool> OptimizeNoBuiltinHotColdNew(
+    "optimize-nobuiltin-hot-cold-new-new", cl::Hidden, cl::init(false),
+    cl::desc("Enable transformation of nobuiltin operator new library calls"));
 
 namespace {
 
@@ -1723,13 +1726,11 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
   return nullptr;
 }
 
-// Allow existing calls to operator new() that takes a __hot_cold_t parameter to
-// be updated with a compiler-determined hot cold hint value. This is used in
-// cases where the call is marked nobuiltin (because operator new called
-// explicitly) and therefore cannot be replaced with a different callee.
-Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI,
-                                                     IRBuilderBase &B) {
-  if (!OptimizeHotColdNew || !OptimizeExistingHotColdNew)
+// Optionally allow optimization of nobuiltin calls to operator new and its
+// variants.
+Value *LibCallSimplifier::maybeOptimizeNoBuiltinOperatorNew(CallInst *CI,
+                                                            IRBuilderBase &B) {
+  if (!OptimizeHotColdNew)
     return nullptr;
   Function *Callee = CI->getCalledFunction();
   if (!Callee)
@@ -1738,6 +1739,22 @@ Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI,
   if (!TLI->getLibFunc(*Callee, Func))
     return nullptr;
   switch (Func) {
+  case LibFunc_Znwm:
+  case LibFunc_ZnwmRKSt9nothrow_t:
+  case LibFunc_ZnwmSt11align_val_t:
+  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_Znam:
+  case LibFunc_ZnamRKSt9nothrow_t:
+  case LibFunc_ZnamSt11align_val_t:
+  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
+  case LibFunc_size_returning_new:
+  case LibFunc_size_returning_new_aligned:
+    // By default normal operator new calls (not already passing a hot_cold_t
+    // parameter) are not mutated if the call is not marked builtin. Optionally
+    // enable that in cases where it is known to be safe.
+    if (!OptimizeNoBuiltinHotColdNew)
+      return nullptr;
+    break;
   case LibFunc_Znwm12__hot_cold_t:
   case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
   case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
@@ -1748,10 +1765,15 @@ Value *LibCallSimplifier::optimizeExistingHotColdNew(CallInst *CI,
   case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
   case LibFunc_size_returning_new_hot_cold:
   case LibFunc_size_returning_new_aligned_hot_cold:
-    return optimizeNew(CI, B, Func);
+    // If the nobuiltin call already passes a hot_cold_t parameter, allow update
+    // of that parameter when enabled.
+    if (!OptimizeExistingHotColdNew)
+      return nullptr;
+    break;
   default:
     return nullptr;
   }
+  return optimizeNew(CI, B, Func);
 }
 
 // When enabled, replace operator new() calls marked with a hot or cold memprof
@@ -4121,9 +4143,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
   //       we can all non-FP calls with the StrictFP attribute to be
   //       optimized.
   if (CI->isNoBuiltin()) {
-    // If this is an existing call to a hot cold operator new, we can update the
-    // hint parameter value, which doesn't change the callee.
-    return optimizeExistingHotColdNew(CI, Builder);
+    // Optionally update operator new calls.
+    return maybeOptimizeNoBuiltinOperatorNew(CI, Builder);
   }
 
   LibFunc Func;
diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll
index 41db7f929dfdf..5a4fb04f5f2c0 100644
--- a/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-libcalls-new.ll
@@ -6,14 +6,18 @@
 ; OFF-LABEL: @new_hot_cold()
 
 ;; First check with the default hint values (254 = -2, 128 = -128, 222 = -34).
-; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=1 -DHOT=-2 -DNOTCOLD=-128 -DAMBIG=-34 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-OFF -DCOLD=1 -DHOT=-2 -DNOTCOLD=-128 -DAMBIG=-34 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7
 
 ;; Next check with the non-default cold and hot hint values (200 =-56).
-; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -hot-new-hint-value=200 -notcold-new-hint-value=99 -ambiguous-new-hint-value=44 -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56 -DAMBIG=44 -DNOTCOLD=99 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -hot-new-hint-value=200 -notcold-new-hint-value=99 -ambiguous-new-hint-value=44 -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-OFF -DCOLD=5 -DHOT=-56 -DAMBIG=44 -DNOTCOLD=99 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7
+
+;; Next check with the same non-default cold and hot hint values (200 =-56),
+;; but with transformation of nobuiltin calls enabled.
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -optimize-nobuiltin-hot-cold-new-new -cold-new-hint-value=5 -hot-new-hint-value=200 -notcold-new-hint-value=99 -ambiguous-new-hint-value=44 -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-ON -DCOLD=5 -DHOT=-56 -DAMBIG=44 -DNOTCOLD=99 -DPREVHINTCOLD=7 -DPREVHINTNOTCOLD=7 -DPREVHINTHOT=7 -DPREVHINTAMBIG=7
 
 ;; Try again with the non-default cold and hot hint values (200 =-56), and this
 ;; time specify that existing hints should be updated.
-; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -notcold-new-hint-value=100 -hot-new-hint-value=200 -ambiguous-new-hint-value=44 -optimize-existing-hot-cold-new -S | FileCheck %s --check-prefix=HOTCOLD -DCOLD=5 -DHOT=-56 -DNOTCOLD=100 -DAMBIG=44 -DPREVHINTCOLD=5 -DPREVHINTNOTCOLD=100 -DPREVHINTHOT=-56 -DPREVHINTAMBIG=44
+; RUN: opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=5 -notcold-new-hint-value=100 -hot-new-hint-value=200 -ambiguous-new-hint-value=44 -optimize-existing-hot-cold-new -S | FileCheck %s --check-prefixes=HOTCOLD,NOBUILTIN-OFF -DCOLD=5 -DHOT=-56 -DNOTCOLD=100 -DAMBIG=44 -DPREVHINTCOLD=5 -DPREVHINTNOTCOLD=100 -DPREVHINTHOT=-56 -DPREVHINTAMBIG=44
 
 ;; Make sure that values not in 0..255 are flagged with an error
 ; RUN: not opt < %s -passes=instcombine -optimize-hot-cold-new -cold-new-hint-value=256 -S 2>&1 | FileCheck %s --check-prefix=ERROR
@@ -40,8 +44,9 @@ define void @new() {
   ; HOTCOLD: @_Znwm12__hot_cold_t(i64 10, i8 [[AMBIG]])
   %call4 = call ptr @_Znwm(i64 10) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_Znwm(i64 10)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_Znwm(i64 10)
+  ; NOBUILTIN-ON: @_Znwm12__hot_cold_t(i64 10, i8 [[COLD]])
   %call3 = call ptr @_Znwm(i64 10) #6
   call void @dummy(ptr %call3)
   ret void
@@ -68,8 +73,9 @@ define void @new_align() {
   ; HOTCOLD: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[AMBIG]])
   %call4 = call ptr @_ZnwmSt11align_val_t(i64 10, i64 8) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_ZnwmSt11align_val_t(i64 10, i64 8)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_ZnwmSt11align_val_t(i64 10, i64 8)
+  ; NOBUILTIN-ON: @_ZnwmSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[COLD]])
   %call3 = call ptr @_ZnwmSt11align_val_t(i64 10, i64 8) #6
   call void @dummy(ptr %call3)
   ret void
@@ -97,8 +103,9 @@ define void @new_nothrow() {
   ; HOTCOLD: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[AMBIG]])
   %call4 = call ptr @_ZnwmRKSt9nothrow_t(i64 10, ptr %nt) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_ZnwmRKSt9nothrow_t(i64 10, ptr nonnull %nt)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_ZnwmRKSt9nothrow_t(i64 10, ptr nonnull %nt)
+  ; NOBUILTIN-ON: @_ZnwmRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[COLD]])
   %call3 = call ptr @_ZnwmRKSt9nothrow_t(i64 10, ptr %nt) #6
   call void @dummy(ptr %call3)
   ret void
@@ -127,8 +134,9 @@ define void @new_align_nothrow() {
   ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[AMBIG]])
   %call4 = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt)
+  ; NOBUILTIN-ON: @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[COLD]])
   %call3 = call ptr @_ZnwmSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #6
   call void @dummy(ptr %call3)
   ret void
@@ -154,8 +162,9 @@ define void @array_new() {
   ; HOTCOLD: @_Znam12__hot_cold_t(i64 10, i8 [[AMBIG]])
   %call4 = call ptr @_Znam(i64 10) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_Znam(i64 10)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_Znam(i64 10)
+  ; NOBUILTIN-ON: @_Znam12__hot_cold_t(i64 10, i8 [[COLD]])
   %call3 = call ptr @_Znam(i64 10) #6
   call void @dummy(ptr %call3)
   ret void
@@ -182,8 +191,9 @@ define void @array_new_align() {
   ; HOTCOLD: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[AMBIG]])
   %call4 = call ptr @_ZnamSt11align_val_t(i64 10, i64 8) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_ZnamSt11align_val_t(i64 10, i64 8)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_ZnamSt11align_val_t(i64 10, i64 8)
+  ; NOBUILTIN-ON: @_ZnamSt11align_val_t12__hot_cold_t(i64 10, i64 8, i8 [[COLD]])
   %call3 = call ptr @_ZnamSt11align_val_t(i64 10, i64 8) #6
   call void @dummy(ptr %call3)
   ret void
@@ -211,8 +221,9 @@ define void @array_new_nothrow() {
   ; HOTCOLD: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[AMBIG]])
   %call4 = call ptr @_ZnamRKSt9nothrow_t(i64 10, ptr %nt) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_ZnamRKSt9nothrow_t(i64 10, ptr nonnull %nt)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_ZnamRKSt9nothrow_t(i64 10, ptr nonnull %nt)
+  ; NOBUILTIN-ON: @_ZnamRKSt9nothrow_t12__hot_cold_t(i64 10, ptr nonnull %nt, i8 [[COLD]])
   %call3 = call ptr @_ZnamRKSt9nothrow_t(i64 10, ptr %nt) #6
   call void @dummy(ptr %call3)
   ret void
@@ -241,8 +252,9 @@ define void @array_new_align_nothrow() {
   ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[AMBIG]])
   %call4 = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #7
   call void @dummy(ptr %call4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr nonnull %nt)
+  ; NOBUILTIN-ON: @_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 10, i64 8, ptr nonnull %nt, i8 [[COLD]])
   %call3 = call ptr @_ZnamSt11align_val_tRKSt9nothrow_t(i64 10, i64 8, ptr %nt) #6
   call void @dummy(ptr %call3)
   ret void
@@ -492,8 +504,9 @@ define void @size_returning_test() {
   %call4 = call {ptr, i64} @__size_returning_new(i64 10) #8
   %p4  = extractvalue {ptr, i64} %call4, 0
   call void @dummy(ptr %p4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @__size_returning_new(i64 10)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @__size_returning_new(i64 10)
+  ; NOBUILTIN-ON: @__size_returning_new_hot_cold(i64 10, i8 [[COLD]])
   %call3 = call {ptr, i64} @__size_returning_new(i64 10) #6
   %p3 = extractvalue {ptr, i64} %call3, 0
   call void @dummy(ptr %p3)
@@ -524,8 +537,9 @@ define void @size_returning_aligned_test() {
   %call4 = call {ptr, i64} @__size_returning_new_aligned(i64 10, i64 8) #8
   %p4  = extractvalue {ptr, i64} %call4, 0
   call void @dummy(ptr %p4)
-  ;; Attribute cold on a nobuiltin call has no effect.
-  ; HOTCOLD: @__size_returning_new_aligned(i64 10, i64 8)
+  ;; Attribute cold on a nobuiltin call has no effect, unless optionally enabled.
+  ; NOBUILTIN-OFF: @__size_returning_new_aligned(i64 10, i64 8)
+  ; NOBUILTIN-ON: @__size_returning_new_aligned_hot_cold(i64 10, i64 8, i8 [[COLD]])
   %call3 = call {ptr, i64} @__size_returning_new_aligned(i64 10, i64 8) #6
   %p3 = extractvalue {ptr, i64} %call3, 0
   call void @dummy(ptr %p3)

>From 2bfed5a6884acad5248c5c7b4bb9dabbd2599998 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:19:44 +0530
Subject: [PATCH 34/39] [clang-repl] Adding custom lambda in launchExecutor

---
 clang/include/clang/Interpreter/Interpreter.h | 4 +++-
 clang/lib/Interpreter/IncrementalExecutor.cpp | 6 +++++-
 clang/lib/Interpreter/IncrementalExecutor.h   | 3 ++-
 clang/lib/Interpreter/Interpreter.cpp         | 2 +-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h
index fcc270a17001e..078d70b3b1749 100644
--- a/clang/include/clang/Interpreter/Interpreter.h
+++ b/clang/include/clang/Interpreter/Interpreter.h
@@ -135,13 +135,15 @@ class Interpreter {
     std::string OrcRuntimePath = "";
     /// PID of the out-of-process JIT executor.
     uint32_t ExecutorPID = 0;
+    /// Optional callback run in the child process before the executor starts.
+    std::function<void()> CustomizeFork = nullptr;
     /// An optional code model to provide to the JITTargetMachineBuilder
     std::optional<llvm::CodeModel::Model> CM = std::nullopt;
 
     JITConfig()
         : IsOutOfProcess(false), OOPExecutor(""), OOPExecutorConnect(""),
           UseSharedMemory(false), SlabAllocateSize(0), OrcRuntimePath(""),
-          ExecutorPID(0), CM(std::nullopt) {}
+          ExecutorPID(0), CustomizeFork(nullptr), CM(std::nullopt) {}
   };
 
 protected:
diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index b0eb7d0e9f072..0cf11939fefd1 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -138,7 +138,8 @@ IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
 
 Expected<std::unique_ptr<llvm::jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
-                          unsigned SlabAllocateSize) {
+                          unsigned SlabAllocateSize,
+                                    std::function<void()> CustomizeFork) {
   llvm::orc::SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Instance,
@@ -215,6 +216,9 @@ IncrementalExecutor::launchExecutor(llvm::StringRef ExecutablePath,
     close(ToExecutor[WriteEnd]);
     close(FromExecutor[ReadEnd]);
 
+    if (CustomizeFork)
+      CustomizeFork();
+
     // Execute the child process.
     std::unique_ptr<char[]> ExecutorPath, FDSpecifier;
     {
diff --git a/clang/lib/Interpreter/IncrementalExecutor.h b/clang/lib/Interpreter/IncrementalExecutor.h
index d091535166770..bb1ec33452515 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.h
+++ b/clang/lib/Interpreter/IncrementalExecutor.h
@@ -79,7 +79,8 @@ class IncrementalExecutor {
   static llvm::Expected<
       std::pair<std::unique_ptr<llvm::orc::SimpleRemoteEPC>, uint32_t>>
   launchExecutor(llvm::StringRef ExecutablePath, bool UseSharedMemory,
-                 unsigned SlabAllocateSize);
+                 unsigned SlabAllocateSize,
+                 std::function<void()> CustomizeFork = nullptr);
 
 #if LLVM_ON_UNIX && LLVM_ENABLE_THREADS
   static llvm::Expected<std::unique_ptr<llvm::orc::SimpleRemoteEPC>>
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 84f1c363b5f6f..d8808f9692575 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -355,7 +355,7 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
   if (!Config.OOPExecutor.empty()) {
     // Launch an out-of-process executor locally in a child process.
     auto ResultOrErr = IncrementalExecutor::launchExecutor(
-        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize);
+        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize, Config.CustomizeFork);
     if (!ResultOrErr)
       return ResultOrErr.takeError();
     childPid = ResultOrErr->second;

>From 50e61b70ce685dc8ad2283933c6dd453af7cb586 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:20:09 +0530
Subject: [PATCH 35/39] Formatting changes

---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 2 +-
 clang/lib/Interpreter/Interpreter.cpp         | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 0cf11939fefd1..792ecb08c5f33 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -139,7 +139,7 @@ IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
 Expected<std::unique_ptr<llvm::jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
                           unsigned SlabAllocateSize,
-                                    std::function<void()> CustomizeFork) {
+                          std::function<void()> CustomizeFork) {
   llvm::orc::SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Instance,
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index d8808f9692575..07c170a63ce82 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -355,7 +355,8 @@ Interpreter::outOfProcessJITBuilder(JITConfig Config) {
   if (!Config.OOPExecutor.empty()) {
     // Launch an out-of-process executor locally in a child process.
     auto ResultOrErr = IncrementalExecutor::launchExecutor(
-        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize, Config.CustomizeFork);
+        Config.OOPExecutor, Config.UseSharedMemory, Config.SlabAllocateSize,
+        Config.CustomizeFork);
     if (!ResultOrErr)
       return ResultOrErr.takeError();
     childPid = ResultOrErr->second;

>From d3af99021f4d38b91ef44dd76ef4148f7a61423d Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:35:44 +0530
Subject: [PATCH 36/39] Formatting changes & fixing bug

---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 792ecb08c5f33..5bec3b44a0dc0 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -173,7 +173,8 @@ createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
 llvm::Expected<std::pair<std::unique_ptr<llvm::orc::SimpleRemoteEPC>, uint32_t>>
 IncrementalExecutor::launchExecutor(llvm::StringRef ExecutablePath,
                                     bool UseSharedMemory,
-                                    unsigned SlabAllocateSize) {
+                                    unsigned SlabAllocateSize,
+                                    std::function<void()> CustomizeFork) {
 #ifndef LLVM_ON_UNIX
   // FIXME: Add support for Windows.
   return llvm::make_error<llvm::StringError>(

>From 5ef4a3797d91fdaa4ca8e9276d27120fe7bd7db2 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Mon, 8 Sep 2025 00:49:32 +0530
Subject: [PATCH 37/39] Removing extra arg from sharedMem

---
 clang/lib/Interpreter/IncrementalExecutor.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.cpp b/clang/lib/Interpreter/IncrementalExecutor.cpp
index 5bec3b44a0dc0..45620fcd358c8 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.cpp
+++ b/clang/lib/Interpreter/IncrementalExecutor.cpp
@@ -138,8 +138,7 @@ IncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
 
 Expected<std::unique_ptr<llvm::jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(llvm::orc::SimpleRemoteEPC &SREPC,
-                          unsigned SlabAllocateSize,
-                          std::function<void()> CustomizeFork) {
+                          unsigned SlabAllocateSize) {
   llvm::orc::SharedMemoryMapper::SymbolAddrs SAs;
   if (auto Err = SREPC.getBootstrapSymbols(
           {{SAs.Instance,

>From d3e2c7ad2c17430e38cd4f62e50d686708d45055 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Sat, 13 Sep 2025 03:23:53 +0530
Subject: [PATCH 38/39] ClangRepl Interpreter test for out-of-process

---
 clang/unittests/Interpreter/CMakeLists.txt    |  23 +-
 .../OutOfProcessInterpreterTests.cpp          | 203 ++++++++++++++++++
 2 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp

diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt
index db9f80d9f53fe..7b8dcfc9b0546 100644
--- a/clang/unittests/Interpreter/CMakeLists.txt
+++ b/clang/unittests/Interpreter/CMakeLists.txt
@@ -29,12 +29,25 @@ set(CLANG_LIBS_TO_LINK
   )
 endif()
 
-add_distinct_clang_unittest(ClangReplInterpreterTests
+set(CLANG_REPL_TEST_SOURCES
   IncrementalCompilerBuilderTest.cpp
   IncrementalProcessingTest.cpp
   InterpreterTest.cpp
   InterpreterExtensionsTest.cpp
   CodeCompletionTest.cpp
+)
+
+if(TARGET compiler-rt)
+  list(APPEND CLANG_REPL_TEST_SOURCES
+    OutOfProcessInterpreterTests.cpp
+  )
+  message(STATUS "Compiler-RT found, enabling out of process JIT tests")
+endif()
+
+add_distinct_clang_unittest(ClangReplInterpreterTests
+  ${CLANG_REPL_TEST_SOURCES}
+
+  PARTIAL_SOURCES_INTENDED
 
   EXPORT_SYMBOLS
 
@@ -48,6 +61,14 @@ add_distinct_clang_unittest(ClangReplInterpreterTests
   ${LLVM_COMPONENTS_TO_LINK}
   )
 
+if(TARGET compiler-rt)
+  add_dependencies(ClangReplInterpreterTests 
+    llvm-jitlink-executor 
+    compiler-rt
+  )
+  message(STATUS "Adding dependency on compiler-rt for out of process JIT tests")
+endif()
+
 if(EMSCRIPTEN)
 # Without the above you try to link to LLVMSupport twice, and end
 # up with a duplicate symbol error when creating the main module
diff --git a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
new file mode 100644
index 0000000000000..271820e4e5f25
--- /dev/null
+++ b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
@@ -0,0 +1,203 @@
+//===- unittests/Interpreter/OutOfProcessInterpreterTests.cpp -------------===//
+// Interpreter tests for out-of-process execution.
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Unit tests for Clang's Interpreter library using an out-of-process executor.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InterpreterTestFixture.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclGroup.h"
+#include "clang/AST/Mangle.h"
+#include "clang/Basic/Version.h"
+#include "clang/Config/config.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "clang/Interpreter/Interpreter.h"
+#include "clang/Interpreter/Value.h"
+#include "clang/Sema/Lookup.h"
+#include "clang/Sema/Sema.h"
+#include "llvm/Support/Error.h"
+#include "llvm/TargetParser/Host.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+#include <signal.h>
+#include <sstream>
+#include <unistd.h>
+
+using namespace clang;
+
+llvm::ExitOnError ExitOnError;
+
+namespace {
+
+using Args = std::vector<const char *>;
+
+struct FileDeleter {
+  void operator()(FILE *f) {
+    if (f)
+      fclose(f);
+  }
+};
+
+struct IOContext {
+  std::unique_ptr<FILE, FileDeleter> stdin_file;
+  std::unique_ptr<FILE, FileDeleter> stdout_file;
+  std::unique_ptr<FILE, FileDeleter> stderr_file;
+
+  bool initializeTempFiles() {
+    stdin_file.reset(tmpfile());
+    stdout_file.reset(tmpfile());
+    stderr_file.reset(tmpfile());
+    return stdin_file && stdout_file && stderr_file;
+  }
+
+  std::string readStdoutContent() {
+    if (!stdout_file)
+      return "";
+    rewind(stdout_file.get());
+    std::ostringstream content;
+    char buffer[1024];
+    size_t bytes_read;
+    while ((bytes_read = fread(buffer, 1, sizeof(buffer), stdout_file.get())) >
+           0) {
+      content.write(buffer, bytes_read);
+    }
+    return content.str();
+  }
+
+  std::string readStderrContent() {
+    if (!stderr_file)
+      return "";
+    rewind(stderr_file.get());
+    std::ostringstream content;
+    char buffer[1024];
+    size_t bytes_read;
+    while ((bytes_read = fread(buffer, 1, sizeof(buffer), stderr_file.get())) >
+           0) {
+      content.write(buffer, bytes_read);
+    }
+    return content.str();
+  }
+};
+
+static void removePathComponent(unsigned N, llvm::SmallString<256> &Path) {
+  for (unsigned i = 0; i < N; ++i)
+    llvm::sys::path::remove_filename(Path);
+}
+
+static std::string getExecutorPath() {
+  llvm::SmallString<256> ExecutorPath(llvm::sys::fs::getMainExecutable(
+      nullptr, reinterpret_cast<void *>(&getExecutorPath)));
+  removePathComponent(5, ExecutorPath);
+  llvm::sys::path::append(ExecutorPath, "bin", "llvm-jitlink-executor");
+  return ExecutorPath.str().str();
+}
+
+static std::string getOrcRuntimePath() {
+  llvm::SmallString<256> RuntimePath(llvm::sys::fs::getMainExecutable(
+      nullptr, reinterpret_cast<void *>(&getOrcRuntimePath)));
+  removePathComponent(5, RuntimePath);
+  llvm::sys::path::append(RuntimePath, CLANG_INSTALL_LIBDIR_BASENAME, "clang",
+                          CLANG_VERSION_MAJOR_STRING, "lib");
+
+  llvm::Triple SystemTriple(llvm::sys::getProcessTriple());
+  if (SystemTriple.isOSBinFormatMachO()) {
+    llvm::sys::path::append(RuntimePath, "darwin", "liborc_rt_osx.a");
+  } else if (SystemTriple.isOSBinFormatELF()) {
+    llvm::sys::path::append(RuntimePath, "x86_64-unknown-linux-gnu",
+                            "liborc_rt.a");
+  }
+  return RuntimePath.str().str();
+}
+
+static std::unique_ptr<Interpreter>
+createInterpreterWithRemoteExecution(std::shared_ptr<IOContext> io_ctx,
+                                     const Args &ExtraArgs = {}) {
+  Args ClangArgs = {"-Xclang", "-emit-llvm-only"};
+  llvm::append_range(ClangArgs, ExtraArgs);
+  auto CB = clang::IncrementalCompilerBuilder();
+  CB.SetCompilerArgs(ClangArgs);
+  auto CI = cantFail(CB.CreateCpp());
+
+  clang::Interpreter::JITConfig Config;
+  llvm::Triple SystemTriple(llvm::sys::getProcessTriple());
+
+  if (SystemTriple.isOSBinFormatELF() || SystemTriple.isOSBinFormatMachO()) {
+    Config.IsOutOfProcess = true;
+    Config.OOPExecutor = getExecutorPath();
+    Config.UseSharedMemory = false;
+    Config.SlabAllocateSize = 0;
+    Config.OrcRuntimePath = getOrcRuntimePath();
+
+    int stdin_fd = fileno(io_ctx->stdin_file.get());
+    int stdout_fd = fileno(io_ctx->stdout_file.get());
+    int stderr_fd = fileno(io_ctx->stderr_file.get());
+
+    Config.CustomizeFork = [=] {
+      auto redirect = [](int from, int to) {
+        if (from != to) {
+          dup2(from, to);
+          close(from);
+        }
+      };
+
+      redirect(stdin_fd, STDIN_FILENO);
+      redirect(stdout_fd, STDOUT_FILENO);
+      redirect(stderr_fd, STDERR_FILENO);
+
+      setvbuf(stdout, nullptr, _IONBF, 0);
+      setvbuf(stderr, nullptr, _IONBF, 0);
+
+      printf("CustomizeFork executed\n");
+      fflush(stdout);
+    };
+  }
+
+  return cantFail(clang::Interpreter::create(std::move(CI), Config));
+}
+
+static size_t DeclsSize(TranslationUnitDecl *PTUDecl) {
+  return std::distance(PTUDecl->decls().begin(), PTUDecl->decls().end());
+}
+
+TEST_F(InterpreterTestBase, SanityWithRemoteExecution) {
+  if (!HostSupportsJIT())
+    GTEST_SKIP();
+
+  std::string OrcRuntimePath = getOrcRuntimePath();
+  std::string ExecutorPath = getExecutorPath();
+  
+  if (!llvm::sys::fs::exists(OrcRuntimePath) ||
+      !llvm::sys::fs::exists(ExecutorPath))
+    GTEST_SKIP();
+
+  auto io_ctx = std::make_shared<IOContext>();
+  ASSERT_TRUE(io_ctx->initializeTempFiles());
+
+  std::unique_ptr<Interpreter> Interp =
+      createInterpreterWithRemoteExecution(io_ctx);
+  ASSERT_TRUE(Interp);
+
+  using PTU = PartialTranslationUnit;
+  PTU &R1(cantFail(Interp->Parse("void g(); void g() {}")));
+  EXPECT_EQ(2U, DeclsSize(R1.TUPart));
+
+  PTU &R2(cantFail(Interp->Parse("int i = 42;")));
+  EXPECT_EQ(1U, DeclsSize(R2.TUPart));
+
+  std::string captured_stdout = io_ctx->readStdoutContent();
+  std::string captured_stderr = io_ctx->readStderrContent();
+
+  EXPECT_TRUE(captured_stdout.find("CustomizeFork executed") !=
+              std::string::npos);
+}
+
+} // end anonymous namespace
\ No newline at end of file

>From a59f32cd65d74af67f975bf6a16653549b41a234 Mon Sep 17 00:00:00 2001
From: kr-2003 <kumar.kr.abhinav at gmail.com>
Date: Sat, 13 Sep 2025 07:40:58 +0530
Subject: [PATCH 39/39] Formatting changes

---
 clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
index 271820e4e5f25..704ddc37e642e 100644
--- a/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
+++ b/clang/unittests/Interpreter/OutOfProcessInterpreterTests.cpp
@@ -174,7 +174,7 @@ TEST_F(InterpreterTestBase, SanityWithRemoteExecution) {
 
   std::string OrcRuntimePath = getOrcRuntimePath();
   std::string ExecutorPath = getExecutorPath();
-  
+
   if (!llvm::sys::fs::exists(OrcRuntimePath) ||
       !llvm::sys::fs::exists(ExecutorPath))
     GTEST_SKIP();