[Openmp-commits] [clang] [clang-tools-extra] [flang] [lldb] [llvm] [mlir] [openmp] [Clang] unrecognized html tag causing undesirable comment lexing (PR #152944)
via Openmp-commits
openmp-commits at lists.llvm.org
Fri Aug 15 19:39:07 PDT 2025
Valentin Clement, Zhaoxuan Jiang <jiangzhaoxuan94 at gmail.com>,
Bill Wendling <morbo at google.com>, Morris Hafner <mmha at users.noreply.github.com>,
David Green <david.green at arm.com>, Alex Bradbury <asb at igalia.com>,
DeanSturtevant1 <dsturtevant at google.com>, Craig Topper <craig.topper at sifive.com>,
Andy Kaylor <akaylor at nvidia.com>, Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>,
Daniel Paoliello <danpao at microsoft.com>,
joaosaffran <126493771+joaosaffran at users.noreply.github.com>,
LLVM GN Syncbot <llvmgnsyncbot at gmail.com>, Min-Yih Hsu <min.hsu at sifive.com>,
Kyungwoo Lee <kyulee at meta.com>, Matt Arsenault <Matthew.Arsenault at amd.com>,
Steven Wu <stevenwu at apple.com>, Oliver Hunt <oliver at apple.com>,
Brock Denson <brock.denson at virscient.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/152944 at github.com>
https://github.com/mdenson updated https://github.com/llvm/llvm-project/pull/152944
>From 9cfd8978eca58a584a3fc09fe3e839c44d5b3205 Mon Sep 17 00:00:00 2001
From: Brock Denson <brock.denson at virscient.com>
Date: Thu, 7 Aug 2025 10:09:25 -0500
Subject: [PATCH 01/53] [clang] unrecognized html tag causing undesirable
comment lexing. fixes #32680
---
clang/include/clang/AST/CommentHTMLTags.td | 1 +
clang/test/AST/ast-dump-comment.cpp | 9 +++++++++
2 files changed, 10 insertions(+)
diff --git a/clang/include/clang/AST/CommentHTMLTags.td b/clang/include/clang/AST/CommentHTMLTags.td
index a1ce8c6da96c0..31bfd306867b7 100644
--- a/clang/include/clang/AST/CommentHTMLTags.td
+++ b/clang/include/clang/AST/CommentHTMLTags.td
@@ -51,6 +51,7 @@ def Col : Tag<"col"> { let EndTagForbidden = 1; }
def Tr : Tag<"tr"> { let EndTagOptional = 1; }
def Th : Tag<"th"> { let EndTagOptional = 1; }
def Td : Tag<"td"> { let EndTagOptional = 1; }
+def Summary : Tag<"summary">;
// Define a list of attributes that are not safe to pass through to HTML
// output if the input is untrusted.
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 40c3edb62821b..f4cae32cfa732 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -131,3 +131,12 @@ void Test_TemplatedFunctionVariadic(int arg, ...);
// CHECK: ParamCommandComment{{.*}} [in] implicitly Param="..."
// CHECK-NEXT: ParagraphComment
// CHECK-NEXT: TextComment{{.*}} Text=" More arguments"
+
+/// \param[out] Aaa <summary>Short summary</summary>
+int Test_HTMLSummaryTag(int Aaa);
+// CHECK: FunctionDecl{{.*}}Test_HTMLSummaryTag
+// CHECK: ParamCommandComment{{.*}} [out] explicitly Param="Aaa"
+// CHECK-NEXT: ParagraphComment
+// CHECK: HTMLStartTagComment{{.*}} Name="summary"
+// CHECK-NEXT: TextComment{{.*}} Text="Short summary"
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="summary"
\ No newline at end of file
>From 43b138de3ef72edcbea51b25bee6c1edd2224c3f Mon Sep 17 00:00:00 2001
From: Brock Denson <brock.denson at virscient.com>
Date: Thu, 7 Aug 2025 10:09:25 -0500
Subject: [PATCH 02/53] [clang] unrecognized html tag causing undesirable
comment lexing. fixes #32680
---
clang/docs/ReleaseNotes.rst | 1 +
clang/include/clang/AST/CommentHTMLTags.td | 4 ++++
clang/test/AST/ast-dump-comment.cpp | 2 +-
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a8b7a29933945..414c5cf88db49 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -195,6 +195,7 @@ Bug Fixes to AST Handling
- Fix incorrect name qualifiers applied to alias CTAD. (#GH136624)
- Fixed ElaboratedTypes appearing within NestedNameSpecifier, which was not a
legal representation. This is fixed because ElaboratedTypes don't exist anymore. (#GH43179) (#GH68670) (#GH92757)
+- Fix unrecognized html tag causing undesirable comment lexing (#GH152944)
Miscellaneous Bug Fixes
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/CommentHTMLTags.td b/clang/include/clang/AST/CommentHTMLTags.td
index 31bfd306867b7..9b89bc0c811fc 100644
--- a/clang/include/clang/AST/CommentHTMLTags.td
+++ b/clang/include/clang/AST/CommentHTMLTags.td
@@ -52,6 +52,10 @@ def Tr : Tag<"tr"> { let EndTagOptional = 1; }
def Th : Tag<"th"> { let EndTagOptional = 1; }
def Td : Tag<"td"> { let EndTagOptional = 1; }
def Summary : Tag<"summary">;
+def Details : Tag<"details">;
+def Mark : Tag<"mark">;
+def Figure : Tag<"figure">;
+def FigCaption : Tag<"figcaption">;
// Define a list of attributes that are not safe to pass through to HTML
// output if the input is untrusted.
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index f4cae32cfa732..52786b5f90ddd 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -139,4 +139,4 @@ int Test_HTMLSummaryTag(int Aaa);
// CHECK-NEXT: ParagraphComment
// CHECK: HTMLStartTagComment{{.*}} Name="summary"
// CHECK-NEXT: TextComment{{.*}} Text="Short summary"
-// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="summary"
\ No newline at end of file
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="summary"
>From 645636c98ab8a02d2eede6119819211671ca8cfe Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Thu, 14 Aug 2025 13:47:48 -0500
Subject: [PATCH 03/53] [lldb] Use (only) PyImport_AppendInittab to patch
readline (#153329)
The current implementation tries to (1) patch the existing readline
module definition if it's already present in the inittab and (2) append
our patched readline module to the inittab. The former uses the
non-stable Python API, and I can't find a situation where it is
necessary: we do this work before initialization, so for the readline
module to already be in the inittab, either Python itself would have to
add it (which doesn't appear to be the case) or someone else would have
had to add it before initializing.
---
.../Python/ScriptInterpreterPython.cpp | 12 +-----------
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index 15ea5e995af5c..9330a634489a2 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -98,17 +98,7 @@ struct InitializePythonRAII {
#ifdef LLDB_USE_LIBEDIT_READLINE_COMPAT_MODULE
// Python's readline is incompatible with libedit being linked into lldb.
// Provide a patched version local to the embedded interpreter.
- bool ReadlinePatched = false;
- for (auto *p = PyImport_Inittab; p->name != nullptr; p++) {
- if (strcmp(p->name, "readline") == 0) {
- p->initfunc = initlldb_readline;
- break;
- }
- }
- if (!ReadlinePatched) {
- PyImport_AppendInittab("readline", initlldb_readline);
- ReadlinePatched = true;
- }
+ PyImport_AppendInittab("readline", initlldb_readline);
#endif
// Register _lldb as a built-in module.
>From a4eeb345bcab8e874eec54806d0837a2f0164101 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 14 Aug 2025 11:52:56 -0700
Subject: [PATCH 04/53] [NFC] Use `[[maybe_unused]]` for variable used in
assertion (#153639)
---
llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 6719ce64b96b6..8f5ec782d5ad5 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -149,9 +149,8 @@ expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU,
std::numeric_limits<uint32_t>::max(), TotalCount);
for (const auto &[G, C] : Targets) {
- auto It = GuidToCounter.insert({G, C});
+ [[maybe_unused]] auto It = GuidToCounter.insert({G, C});
assert(It.second);
- (void)It;
}
}
for (auto [Index, Func] : llvm::enumerate(JT.Funcs)) {
>From 7db0a377b7bcab7dc03c3046f65f172b1d4c4eaa Mon Sep 17 00:00:00 2001
From: Kaitlin Peng <kaitlinpeng at microsoft.com>
Date: Thu, 14 Aug 2025 12:02:34 -0700
Subject: [PATCH 05/53] Fix typo in `step` intrinsic comment (#153642)
`y` should be the first argument and `x` should be the second, otherwise
the formula is wrong. This also matches the documentation
[here](https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-step).
---
clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
index cbc518d050583..21a9c30d9f445 100644
--- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -2190,10 +2190,10 @@ float4 sqrt(float4);
// step builtins
//===----------------------------------------------------------------------===//
-/// \fn T step(T x, T y)
+/// \fn T step(T y, T x)
/// \brief Returns 1 if the x parameter is greater than or equal to the y
-/// parameter; otherwise, 0. vector. \param x [in] The first floating-point
-/// value to compare. \param y [in] The first floating-point value to compare.
+/// parameter; otherwise, 0. vector. \param y [in] The first floating-point
+/// value to compare. \param x [in] The second floating-point value to compare.
///
/// Step is based on the following formula: (x >= y) ? 1 : 0
>From 3b3e32b3b0bcda6cd0850004419dd27a9e1495db Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7 at gmail.com>
Date: Thu, 14 Aug 2025 12:21:40 -0700
Subject: [PATCH 06/53] [clang-doc] place HTML/JSON output inside their own
directories (#150655)
Instead of just outputting everything into the designated root folder,
HTML and JSON output will be placed in html/ and json/ directories.
---
.../clang-doc/HTMLMustacheGenerator.cpp | 18 ++++++++++++------
clang-tools-extra/clang-doc/JSONGenerator.cpp | 4 +++-
.../test/clang-doc/basic-project.mustache.test | 8 ++++----
.../test/clang-doc/json/class-requires.cpp | 2 +-
.../clang-doc/json/class-specialization.cpp | 4 ++--
.../test/clang-doc/json/class-template.cpp | 2 +-
.../test/clang-doc/json/class.cpp | 2 +-
.../clang-doc/json/compound-constraints.cpp | 2 +-
.../test/clang-doc/json/concept.cpp | 2 +-
.../test/clang-doc/json/function-requires.cpp | 2 +-
.../clang-doc/json/function-specifiers.cpp | 2 +-
.../test/clang-doc/json/method-template.cpp | 2 +-
.../test/clang-doc/json/namespace.cpp | 2 +-
.../test/clang-doc/json/nested-namespace.cpp | 4 ++--
.../test/clang-doc/mustache-index.cpp | 2 +-
.../clang-doc/mustache-separate-namespace.cpp | 2 +-
16 files changed, 34 insertions(+), 26 deletions(-)
diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
index a64cb5ea26a79..1ab40aacbfe09 100644
--- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
@@ -144,17 +144,22 @@ Error MustacheHTMLGenerator::generateDocs(
} else
return JSONGenerator.takeError();
}
+ SmallString<128> JSONPath;
+ sys::path::native(RootDir.str() + "/json", JSONPath);
StringMap<json::Value> JSONFileMap;
{
llvm::TimeTraceScope TS("Iterate JSON files");
std::error_code EC;
- sys::fs::directory_iterator JSONIter(RootDir, EC);
+ sys::fs::directory_iterator JSONIter(JSONPath, EC);
std::vector<json::Value> JSONFiles;
JSONFiles.reserve(Infos.size());
if (EC)
return createStringError("Failed to create directory iterator.");
+ SmallString<128> HTMLDirPath(RootDir.str() + "/html/");
+ if (auto EC = sys::fs::create_directories(HTMLDirPath))
+ return createFileError(HTMLDirPath, EC);
while (JSONIter != sys::fs::directory_iterator()) {
if (EC)
return createFileError("Failed to iterate: " + JSONIter->path(), EC);
@@ -177,14 +182,15 @@ Error MustacheHTMLGenerator::generateDocs(
return Parsed.takeError();
std::error_code FileErr;
- SmallString<16> HTMLPath(Path.begin(), Path.end());
- sys::path::replace_extension(HTMLPath, "html");
- raw_fd_ostream InfoOS(HTMLPath, FileErr, sys::fs::OF_None);
+ SmallString<128> HTMLFilePath(HTMLDirPath);
+ sys::path::append(HTMLFilePath, sys::path::filename(Path));
+ sys::path::replace_extension(HTMLFilePath, "html");
+ raw_fd_ostream InfoOS(HTMLFilePath, FileErr, sys::fs::OF_None);
if (FileErr)
return createFileOpenError(Path, FileErr);
- if (Error Err = generateDocForJSON(*Parsed, sys::path::stem(HTMLPath),
- HTMLPath, InfoOS, CDCtx))
+ if (Error Err = generateDocForJSON(*Parsed, sys::path::stem(HTMLFilePath),
+ HTMLFilePath, InfoOS, CDCtx))
return Err;
JSONIter.increment(EC);
}
diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 599b381cea60d..26794a5e34d02 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -600,7 +600,9 @@ Error JSONGenerator::generateDocs(
Info *Info = Group.getValue().get();
SmallString<128> Path;
- sys::path::native(RootDir, Path);
+ auto RootDirStr = RootDir.str() + "/json";
+ StringRef JSONDir = StringRef(RootDirStr);
+ sys::path::native(JSONDir, Path);
if (!CreatedDirs.contains(Path)) {
if (std::error_code Err = sys::fs::create_directories(Path);
Err != std::error_code())
diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
index e2d9da60183fa..55099517101f2 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
@@ -2,10 +2,10 @@
// RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
// RUN: clang-doc --format=mustache --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
-// RUN: FileCheck %s -input-file=%t/docs/_ZTV5Shape.html -check-prefix=HTML-SHAPE
-// RUN: FileCheck %s -input-file=%t/docs/_ZTV10Calculator.html -check-prefix=HTML-CALC
-// RUN: FileCheck %s -input-file=%t/docs/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE
-// RUN: FileCheck %s -input-file=%t/docs/_ZTV6Circle.html -check-prefix=HTML-CIRCLE
+// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV5Shape.html -check-prefix=HTML-SHAPE
+// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV10Calculator.html -check-prefix=HTML-CALC
+// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE
+// RUN: FileCheck %s -input-file=%t/docs/html/_ZTV6Circle.html -check-prefix=HTML-CIRCLE
HTML-SHAPE: <html lang="en-US">
HTML-SHAPE: <head>
diff --git a/clang-tools-extra/test/clang-doc/json/class-requires.cpp b/clang-tools-extra/test/clang-doc/json/class-requires.cpp
index bf6c889849a70..513961723990e 100644
--- a/clang-tools-extra/test/clang-doc/json/class-requires.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class-requires.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/_ZTV7MyClass.json
+// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json
template<typename T>
concept Addable = requires(T a, T b) {
diff --git a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp
index e9259edad5cb8..d3ad6957e7851 100644
--- a/clang-tools-extra/test/clang-doc/json/class-specialization.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class-specialization.cpp
@@ -1,7 +1,7 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/_ZTV7MyClass.json --check-prefix=BASE
-// RUN: FileCheck %s < %t/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION
+// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json --check-prefix=BASE
+// RUN: FileCheck %s < %t/json/_ZTV7MyClassIiE.json --check-prefix=SPECIALIZATION
template<typename T> struct MyClass {};
diff --git a/clang-tools-extra/test/clang-doc/json/class-template.cpp b/clang-tools-extra/test/clang-doc/json/class-template.cpp
index 149248c772055..5ef78f54854dd 100644
--- a/clang-tools-extra/test/clang-doc/json/class-template.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class-template.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/_ZTV7MyClass.json
+// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json
template<typename T> struct MyClass {
T MemberTemplate;
diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp
index 79b8fed0a0188..20a9f218b3d79 100644
--- a/clang-tools-extra/test/clang-doc/json/class.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/_ZTV7MyClass.json
+// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json
struct Foo;
diff --git a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp
index bb2b4ca770fc0..1a73a0ddb722f 100644
--- a/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp
+++ b/clang-tools-extra/test/clang-doc/json/compound-constraints.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/index.json
+// RUN: FileCheck %s < %t/json/index.json
template<typename T> concept Incrementable = requires (T a) {
a++;
diff --git a/clang-tools-extra/test/clang-doc/json/concept.cpp b/clang-tools-extra/test/clang-doc/json/concept.cpp
index 4c810244ca41b..e96ec14d7dde4 100644
--- a/clang-tools-extra/test/clang-doc/json/concept.cpp
+++ b/clang-tools-extra/test/clang-doc/json/concept.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/index.json
+// RUN: FileCheck %s < %t/json/index.json
// Requires that T suports post and pre-incrementing.
template<typename T>
diff --git a/clang-tools-extra/test/clang-doc/json/function-requires.cpp b/clang-tools-extra/test/clang-doc/json/function-requires.cpp
index 59ed39ee61fda..94271467cba63 100644
--- a/clang-tools-extra/test/clang-doc/json/function-requires.cpp
+++ b/clang-tools-extra/test/clang-doc/json/function-requires.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --extra-arg -std=c++20 --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/index.json
+// RUN: FileCheck %s < %t/json/index.json
template<typename T>
concept Incrementable = requires(T x) {
diff --git a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp
index b194e3371bf76..faaccb7d4f63f 100644
--- a/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp
+++ b/clang-tools-extra/test/clang-doc/json/function-specifiers.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/index.json
+// RUN: FileCheck %s < %t/json/index.json
static void myFunction() {}
diff --git a/clang-tools-extra/test/clang-doc/json/method-template.cpp b/clang-tools-extra/test/clang-doc/json/method-template.cpp
index 14232d00e277a..87977f891a223 100644
--- a/clang-tools-extra/test/clang-doc/json/method-template.cpp
+++ b/clang-tools-extra/test/clang-doc/json/method-template.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/_ZTV7MyClass.json
+// RUN: FileCheck %s < %t/json/_ZTV7MyClass.json
struct MyClass {
template<class T> T methodTemplate(T param) {
diff --git a/clang-tools-extra/test/clang-doc/json/namespace.cpp b/clang-tools-extra/test/clang-doc/json/namespace.cpp
index 4b6b38869f714..dcf83236bae28 100644
--- a/clang-tools-extra/test/clang-doc/json/namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/json/namespace.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/index.json
+// RUN: FileCheck %s < %t/json/index.json
class MyClass {};
diff --git a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp
index 255e540bd6c7c..b19afc1885104 100644
--- a/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/json/nested-namespace.cpp
@@ -1,7 +1,7 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --output=%t --format=json --executor=standalone %s
-// RUN: FileCheck %s < %t/nested.json --check-prefix=NESTED
-// RUN: FileCheck %s < %t/inner.json --check-prefix=INNER
+// RUN: FileCheck %s < %t/json/nested.json --check-prefix=NESTED
+// RUN: FileCheck %s < %t/json/inner.json --check-prefix=INNER
namespace nested {
int Global;
diff --git a/clang-tools-extra/test/clang-doc/mustache-index.cpp b/clang-tools-extra/test/clang-doc/mustache-index.cpp
index 910233b943666..f9aad193799b3 100644
--- a/clang-tools-extra/test/clang-doc/mustache-index.cpp
+++ b/clang-tools-extra/test/clang-doc/mustache-index.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/index.html
+// RUN: FileCheck %s < %t/html/index.html
enum Color {
RED,
diff --git a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
index 7d7d108e63873..a73a5ab6a843b 100644
--- a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
@@ -1,6 +1,6 @@
// RUN: rm -rf %t && mkdir -p %t
// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/MyNamespace.html
+// RUN: FileCheck %s < %t/html/MyNamespace.html
namespace MyNamespace {
class Foo;
>From 20c272cad3f3b18d4c4d6aa3fd3e13ef158711ca Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba at intel.com>
Date: Thu, 14 Aug 2025 12:22:28 -0700
Subject: [PATCH 07/53] [NFC][Offload] Add missing maps to OpenMP offloading
tests. (#153103)
A few tests mapped only a pointee, e.g. `map(pp[0][0])` on an
`int **pp`, while expecting the pointers `pp` and `pp[0]` to be mapped
as well, which is incorrect.
This change fixes six such tests.
---
offload/test/mapping/data_member_ref.cpp | 3 ++-
.../declare_mapper_nested_default_mappers.cpp | 4 ++--
.../mapping/declare_mapper_nested_mappers.cpp | 4 ++--
offload/test/mapping/ptr_and_obj_motion.c | 2 +-
.../target_derefence_array_pointrs.cpp | 20 ++++++++++---------
offload/test/mapping/target_has_device_addr.c | 5 +++--
6 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/offload/test/mapping/data_member_ref.cpp b/offload/test/mapping/data_member_ref.cpp
index fdb8abcaa6506..7947a62c169f4 100644
--- a/offload/test/mapping/data_member_ref.cpp
+++ b/offload/test/mapping/data_member_ref.cpp
@@ -60,7 +60,8 @@ int main() {
printf("Host %d %d.\n", Bar.VRef.Data, V.Data);
// CHECK: Host 123456.
printf("Host %d.\n", *Baz.VRef.Data);
-#pragma omp target map(*Baz.VRef.Data) map(from : D1, D2)
+#pragma omp target map(Baz.VRef.Data) map(*Baz.VRef.Data) map(V1.Data[0 : 0]) \
+ map(from : D1, D2)
{
// CHECK: Device 123456.
D1 = *Baz.VRef.Data;
diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
index c6c5657ae6166..45fd042aedb0e 100644
--- a/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ b/offload/test/mapping/declare_mapper_nested_default_mappers.cpp
@@ -44,8 +44,8 @@ int main() {
int spp00fa = -1, spp00fca = -1, spp00fb_r = -1;
__intptr_t p = reinterpret_cast<__intptr_t>(&x[0]);
-#pragma omp target map(tofrom: spp[0][0]) firstprivate(p) \
- map(from: spp00fa, spp00fca, spp00fb_r)
+#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) firstprivate(p) \
+ map(from : spp00fa, spp00fca, spp00fb_r)
{
spp00fa = spp[0][0].f.a;
spp00fca = spp[0][0].f.c.a;
diff --git a/offload/test/mapping/declare_mapper_nested_mappers.cpp b/offload/test/mapping/declare_mapper_nested_mappers.cpp
index a9e3f05e0f5fd..a59ed6980ec4c 100644
--- a/offload/test/mapping/declare_mapper_nested_mappers.cpp
+++ b/offload/test/mapping/declare_mapper_nested_mappers.cpp
@@ -42,8 +42,8 @@ int main() {
int spp00fa = -1, spp00fb_r = -1, spp00fg1 = -1, spp00fg_r = -1;
__intptr_t p = reinterpret_cast<__intptr_t>(&x[0]),
p1 = reinterpret_cast<__intptr_t>(&y[0]);
-#pragma omp target map(tofrom : spp[0][0]) firstprivate(p, p1) \
- map(from: spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
+#pragma omp target map(tofrom : spp[0][0]) map(alloc : spp[0]) \
+ firstprivate(p, p1) map(from : spp00fa, spp00fb_r, spp00fg1, spp00fg_r)
{
spp00fa = spp[0][0].f.a;
spp00fb_r = spp[0][0].f.b == reinterpret_cast<void *>(p) ? 1 : 0;
diff --git a/offload/test/mapping/ptr_and_obj_motion.c b/offload/test/mapping/ptr_and_obj_motion.c
index 8fa2c9865b4ac..a94c07aadc1bc 100644
--- a/offload/test/mapping/ptr_and_obj_motion.c
+++ b/offload/test/mapping/ptr_and_obj_motion.c
@@ -17,7 +17,7 @@ void init(double vertexx[]) {
}
void change(DV *dvptr) {
-#pragma omp target map(dvptr->dataptr[0 : 100])
+#pragma omp target map(dvptr->dataptr[0 : 100]) map(alloc : dvptr -> dataptr)
{
printf("In change: %lf, expected 77.0\n", dvptr->dataptr[77]);
dvptr->dataptr[77] += 1.0;
diff --git a/offload/test/mapping/target_derefence_array_pointrs.cpp b/offload/test/mapping/target_derefence_array_pointrs.cpp
index a6dd4069a8f58..d213c87443634 100644
--- a/offload/test/mapping/target_derefence_array_pointrs.cpp
+++ b/offload/test/mapping/target_derefence_array_pointrs.cpp
@@ -18,23 +18,24 @@ void foo(int **t1d) {
for (j = 0; j < 3; j++)
(*t1d)[j] = 0;
-#pragma omp target map(tofrom : (*t1d)[0 : 3])
+#pragma omp target map(tofrom : (*t1d)[0 : 3]) map(alloc : *t1d)
{ (*t1d)[1] = 1; }
// CHECK: 1
printf("%d\n", (*t1d)[1]);
-#pragma omp target map(tofrom : (**t2d)[0 : 3])
+#pragma omp target map(tofrom : (**t2d)[0 : 3]) map(alloc : **t2d, *t2d)
{ (**t2d)[1] = 2; }
// CHECK: 2
printf("%d\n", (**t2d)[1]);
-#pragma omp target map(tofrom : (***t3d)[0 : 3])
+#pragma omp target map(tofrom : (***t3d)[0 : 3]) \
+ map(alloc : ***t3d, **t3d, *t3d)
{ (***t3d)[1] = 3; }
// CHECK: 3
printf("%d\n", (***t3d)[1]);
-#pragma omp target map(tofrom : (**t1d))
+#pragma omp target map(tofrom : (**t1d)) map(alloc : *t1d)
{ (*t1d)[0] = 4; }
// CHECK: 4
printf("%d\n", (*t1d)[0]);
-#pragma omp target map(tofrom : (*(*(t1d + a) + b)))
+#pragma omp target map(tofrom : (*(*(t1d + a) + b))) map(to : *(t1d + a))
{ *(*(t1d + a) + b) = 5; }
// CHECK: 5
printf("%d\n", *(*(t1d + a) + b));
@@ -49,7 +50,7 @@ void bar() {
for (int i = 0; i < 3; i++) {
(**a)[1] = i;
}
-#pragma omp target map((**a)[ : 3])
+#pragma omp target map((**a)[ : 3]) map(alloc : **a, *a)
{
(**a)[1] = 6;
// CHECK: 6
@@ -73,7 +74,8 @@ void zoo(int **f, SSA *sa) {
*(f + sa->i + 1) = t;
*(sa->sa->i + *(f + sa->i + 1)) = 4;
printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
-#pragma omp target map(sa, *(sa->sa->i + *(1 + sa->i + f)))
+#pragma omp target map(*(sa->sa->i + *(1 + sa->i + f))) map(alloc : sa->sa) \
+ map(to : sa->i) map(to : sa->sa->i) map(to : *(1 + sa->i + f))
{ *(sa->sa->i + *(1 + sa->i + f)) = 7; }
// CHECK: 7
printf("%d\n", *(sa->sa->i + *(1 + sa->i + f)));
@@ -87,13 +89,13 @@ void xoo() {
void yoo(int **x) {
*x = (int *)malloc(2 * sizeof(int));
-#pragma omp target map(**x)
+#pragma omp target map(**x) map(alloc : *x)
{
**x = 8;
// CHECK: 8
printf("%d\n", **x);
}
-#pragma omp target map(*(*x + 1))
+#pragma omp target map(*(*x + 1)) map(alloc : *x)
{
*(*x + 1) = 9;
// CHECK: 9
diff --git a/offload/test/mapping/target_has_device_addr.c b/offload/test/mapping/target_has_device_addr.c
index e8bfff868c7ed..f238832c44054 100644
--- a/offload/test/mapping/target_has_device_addr.c
+++ b/offload/test/mapping/target_has_device_addr.c
@@ -66,8 +66,9 @@ void zoo() {
short **xpp = &xp[0];
x[1] = 111;
-#pragma omp target data map(tofrom : xpp[1][1]) use_device_addr(xpp[1][1])
-#pragma omp target has_device_addr(xpp[1][1])
+#pragma omp target data map(tofrom : xpp[1][1]) map(xpp[1]) \
+ use_device_addr(xpp[1])
+#pragma omp target has_device_addr(xpp[1])
{
xpp[1][1] = 222;
// CHECK: 222
>From bc3af88311a1aa1021144ef17ac828c1eac3151d Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 14 Aug 2025 12:37:17 -0700
Subject: [PATCH 08/53] Add dependency on `ProfileData` from ScalarOpts
(#153651)
Fixing buildbot failures after PR #153305, e.g.
https://lab.llvm.org/buildbot/#/builders/203/builds/19861
Analysis already depends on `ProfileData`, so the transitive closure of
the dependencies of `ScalarOpts` doesn't change.
This also avoids an extra (and unnecessary) dependency on
`Instrumentation`. The API previously used doesn't need to live in
Instrumentation to begin with, but that's something to address in a
follow-up.
---
llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 +
llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp | 11 ++++++++---
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 84a5b02043d01..765059d0c3b20 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -95,6 +95,7 @@ add_llvm_component_library(LLVMScalarOpts
Analysis
Core
InstCombine
+ ProfileData
Support
TransformUtils
)
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 8f5ec782d5ad5..9dde131185cf8 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -22,7 +22,6 @@
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <limits>
@@ -181,8 +180,14 @@ expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU,
if (HadProfile && !ProfcheckDisableMetadataFixes) {
// At least one of the targets must've been taken.
assert(llvm::any_of(BranchWeights, [](uint64_t V) { return V != 0; }));
- setProfMetadata(F.getParent(), Switch, BranchWeights,
- *llvm::max_element(BranchWeights));
+ // FIXME: this duplicates logic in instrumentation. Note: since there's at
+ // least a nonzero and these are unsigned values, it follows MaxBW != 0.
+ uint64_t MaxBW = *llvm::max_element(BranchWeights);
+ SmallVector<uint32_t> ScaledBranchWeights(
+ llvm::map_range(BranchWeights, [MaxBW](uint64_t V) {
+ return static_cast<uint32_t>(V / MaxBW);
+ }));
+ setBranchWeights(*Switch, ScaledBranchWeights, /*IsExpected=*/false);
} else
setExplicitlyUnknownBranchWeights(*Switch);
if (PHI)
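In Python terms, the scaling the hunk above performs (divide every 64-bit weight by the maximum so the hottest target maps to 1 and the rest truncate toward 0) can be sketched as follows. This is an illustrative model only, not LLVM API; the function name is made up.

```python
def scale_branch_weights(branch_weights):
    """Scale 64-bit branch weights down by the maximum weight, mirroring
    the integer division in the hunk above (illustrative, not LLVM API)."""
    max_bw = max(branch_weights)
    # The pass asserts at least one target was taken, so max_bw != 0.
    assert max_bw != 0, "at least one branch weight must be nonzero"
    # Integer division truncates, like the uint64_t -> uint32_t cast.
    return [w // max_bw for w in branch_weights]
```

Note the truncation: every weight strictly smaller than the maximum scales to 0, and only targets tied for the maximum keep a weight of 1.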
>From 60495c22b9e4be410116f7c8d9123e68033ac59b Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov at gmail.com>
Date: Thu, 14 Aug 2025 16:42:34 -0300
Subject: [PATCH 09/53] [clang] fix source range computation for
DeducedTemplateSpecializationType (#153646)
This was a regression introduced in
https://github.com/llvm/llvm-project/pull/147835
Since this regression was never released, there are no release notes.
Fixes https://github.com/llvm/llvm-project/issues/153540
---
clang/include/clang/AST/TypeLoc.h | 9 +
clang/test/AST/ast-dump-templates.cpp | 969 ++++++++++++++++++++++++++
2 files changed, 978 insertions(+)
diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h
index 934aa14c4c1b6..6389bdea6d122 100644
--- a/clang/include/clang/AST/TypeLoc.h
+++ b/clang/include/clang/AST/TypeLoc.h
@@ -2519,6 +2519,15 @@ class DeducedTemplateSpecializationTypeLoc
getLocalData()->QualifierData = QualifierLoc.getOpaqueData();
}
+ SourceRange getLocalSourceRange() const {
+ SourceLocation BeginLoc = getElaboratedKeywordLoc();
+ if (BeginLoc.isInvalid())
+ BeginLoc = getQualifierLoc().getBeginLoc();
+ if (BeginLoc.isInvalid())
+ BeginLoc = getNameLoc();
+ return {BeginLoc, getNameLoc()};
+ }
+
void initializeLocal(ASTContext &Context, SourceLocation Loc);
};
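The new `getLocalSourceRange` picks the first valid begin location from a fixed priority order: elaborated keyword, then nested-name qualifier, then the name itself. A minimal sketch of that fallback chain, modeling an invalid `SourceLocation` as `None` (names are hypothetical, not Clang API):

```python
def local_source_range(elaborated_keyword_loc, qualifier_begin_loc, name_loc):
    """Choose the begin location with the same priority as the hunk above:
    elaborated keyword, then qualifier, then the name location itself."""
    begin = elaborated_keyword_loc
    if begin is None:
        begin = qualifier_begin_loc
    if begin is None:
        begin = name_loc
    # The range always ends at the name location.
    return (begin, name_loc)
```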
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index b42aa6220e9fc..a842eabf6962f 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -221,6 +221,22 @@ namespace TestPartialSpecNTTP {
// DUMP-NEXT: `-CXXRecordDecl {{.+}} implicit struct Template2
} // namespace TestPartialSpecNTTP
+namespace GH153540 {
+// DUMP-LABEL: NamespaceDecl {{.*}} GH153540{{$}}
+
+ namespace N {
+ template<typename T> struct S { S(T); };
+ }
+ void f() {
+ N::S(0);
+ }
+
+// DUMP: FunctionDecl {{.*}} f 'void ()'
+// DUMP-NEXT: CompoundStmt
+// DUMP-NEXT: CXXFunctionalCastExpr {{.*}} 'N::S<int>':'GH153540::N::S<int>'
+// DUMP-NEXT: CXXConstructExpr {{.*}} <col:5, col:11> 'N::S<int>':'GH153540::N::S<int>' 'void (int)'
+} // namespace GH153540
+
// NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py
@@ -7946,6 +7962,959 @@ namespace TestPartialSpecNTTP {
// JSON-NEXT: ]
// JSON-NEXT: }
// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "NamespaceDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9382,
+// JSON-NEXT: "line": 224,
+// JSON-NEXT: "col": 11,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9372,
+// JSON-NEXT: "col": 1,
+// JSON-NEXT: "tokLen": 9
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9791,
+// JSON-NEXT: "line": 238,
+// JSON-NEXT: "col": 1,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "GH153540",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "NamespaceDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9456,
+// JSON-NEXT: "line": 227,
+// JSON-NEXT: "col": 13,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9446,
+// JSON-NEXT: "col": 3,
+// JSON-NEXT: "tokLen": 9
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9507,
+// JSON-NEXT: "line": 229,
+// JSON-NEXT: "col": 3,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "N",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ClassTemplateDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "line": 228,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9464,
+// JSON-NEXT: "col": 5,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9502,
+// JSON-NEXT: "col": 43,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "TemplateTypeParmDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9482,
+// JSON-NEXT: "col": 23,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9473,
+// JSON-NEXT: "col": 14,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9482,
+// JSON-NEXT: "col": 23,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isReferenced": true,
+// JSON-NEXT: "name": "T",
+// JSON-NEXT: "tagUsed": "typename",
+// JSON-NEXT: "depth": 0,
+// JSON-NEXT: "index": 0
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXRecordDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9485,
+// JSON-NEXT: "col": 26,
+// JSON-NEXT: "tokLen": 6
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9502,
+// JSON-NEXT: "col": 43,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "tagUsed": "struct",
+// JSON-NEXT: "completeDefinition": true,
+// JSON-NEXT: "definitionData": {
+// JSON-NEXT: "canConstDefaultInit": true,
+// JSON-NEXT: "copyAssign": {
+// JSON-NEXT: "hasConstParam": true,
+// JSON-NEXT: "implicitHasConstParam": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "copyCtor": {
+// JSON-NEXT: "hasConstParam": true,
+// JSON-NEXT: "implicitHasConstParam": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "defaultCtor": {
+// JSON-NEXT: "defaultedIsConstexpr": true
+// JSON-NEXT: },
+// JSON-NEXT: "dtor": {
+// JSON-NEXT: "irrelevant": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "hasUserDeclaredConstructor": true,
+// JSON-NEXT: "isEmpty": true,
+// JSON-NEXT: "isStandardLayout": true,
+// JSON-NEXT: "isTriviallyCopyable": true,
+// JSON-NEXT: "moveAssign": {
+// JSON-NEXT: "exists": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "moveCtor": {
+// JSON-NEXT: "exists": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXRecordDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9485,
+// JSON-NEXT: "col": 26,
+// JSON-NEXT: "tokLen": 6
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "tagUsed": "struct"
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXConstructorDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "GH153540::N::S<T>",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void (T)"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "T"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ClassTemplateSpecializationDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9464,
+// JSON-NEXT: "col": 5,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9502,
+// JSON-NEXT: "col": 43,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "tagUsed": "struct",
+// JSON-NEXT: "completeDefinition": true,
+// JSON-NEXT: "definitionData": {
+// JSON-NEXT: "canConstDefaultInit": true,
+// JSON-NEXT: "canPassInRegisters": true,
+// JSON-NEXT: "copyAssign": {
+// JSON-NEXT: "hasConstParam": true,
+// JSON-NEXT: "implicitHasConstParam": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "copyCtor": {
+// JSON-NEXT: "hasConstParam": true,
+// JSON-NEXT: "implicitHasConstParam": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "defaultCtor": {
+// JSON-NEXT: "defaultedIsConstexpr": true
+// JSON-NEXT: },
+// JSON-NEXT: "dtor": {
+// JSON-NEXT: "irrelevant": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "hasUserDeclaredConstructor": true,
+// JSON-NEXT: "isEmpty": true,
+// JSON-NEXT: "isStandardLayout": true,
+// JSON-NEXT: "isTriviallyCopyable": true,
+// JSON-NEXT: "moveAssign": {
+// JSON-NEXT: "exists": true,
+// JSON-NEXT: "needsImplicit": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: },
+// JSON-NEXT: "moveCtor": {
+// JSON-NEXT: "exists": true,
+// JSON-NEXT: "simple": true,
+// JSON-NEXT: "trivial": true
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "kind": "TemplateArgument",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "BuiltinType",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXRecordDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9485,
+// JSON-NEXT: "col": 26,
+// JSON-NEXT: "tokLen": 6
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "tagUsed": "struct"
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXConstructorDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isUsed": true,
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "mangledName": "_ZN8GH1535401N1SIiEC1Ei",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void (int)"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXConstructorDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "mangledName": "_ZN8GH1535401N1SIiEC1ERKS2_",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void (const S<int> &)"
+// JSON-NEXT: },
+// JSON-NEXT: "inline": true,
+// JSON-NEXT: "constexpr": true,
+// JSON-NEXT: "explicitlyDefaulted": "default",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "const S<int> &"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXConstructorDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "mangledName": "_ZN8GH1535401N1SIiEC1EOS2_",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void (S<int> &&)"
+// JSON-NEXT: },
+// JSON-NEXT: "inline": true,
+// JSON-NEXT: "constexpr": true,
+// JSON-NEXT: "explicitlyDefaulted": "default",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "S<int> &&"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXDestructorDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "isReferenced": true,
+// JSON-NEXT: "name": "~S",
+// JSON-NEXT: "mangledName": "_ZN8GH1535401N1SIiED1Ev",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void () noexcept"
+// JSON-NEXT: },
+// JSON-NEXT: "inline": true,
+// JSON-NEXT: "constexpr": true,
+// JSON-NEXT: "explicitlyDefaulted": "default"
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "FunctionTemplateDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9464,
+// JSON-NEXT: "col": 5,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "<deduction guide for S>",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "TemplateTypeParmDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9482,
+// JSON-NEXT: "col": 23,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9473,
+// JSON-NEXT: "col": 14,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9482,
+// JSON-NEXT: "col": 23,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isReferenced": true,
+// JSON-NEXT: "name": "T",
+// JSON-NEXT: "tagUsed": "typename",
+// JSON-NEXT: "depth": 0,
+// JSON-NEXT: "index": 0
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXDeductionGuideDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "<deduction guide for S>",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "auto (T) -> GH153540::N::S<T>"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "T"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXDeductionGuideDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9496,
+// JSON-NEXT: "col": 37,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "isUsed": true,
+// JSON-NEXT: "name": "<deduction guide for S>",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "auto (int) -> GH153540::N::S<int>"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "kind": "TemplateArgument",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "BuiltinType",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9499,
+// JSON-NEXT: "col": 40,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9498,
+// JSON-NEXT: "col": 39,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "FunctionTemplateDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9464,
+// JSON-NEXT: "col": 5,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "<deduction guide for S>",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "TemplateTypeParmDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9482,
+// JSON-NEXT: "col": 23,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9473,
+// JSON-NEXT: "col": 14,
+// JSON-NEXT: "tokLen": 8
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9482,
+// JSON-NEXT: "col": 23,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isReferenced": true,
+// JSON-NEXT: "name": "T",
+// JSON-NEXT: "tagUsed": "typename",
+// JSON-NEXT: "depth": 0,
+// JSON-NEXT: "index": 0
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXDeductionGuideDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "<deduction guide for S>",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "auto (GH153540::N::S<T>) -> GH153540::N::S<T>"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "ParmVarDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9492,
+// JSON-NEXT: "col": 33,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "GH153540::N::S<T>"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "FunctionDecl",
+// JSON-NEXT: "loc": {
+// JSON-NEXT: "offset": 9516,
+// JSON-NEXT: "line": 230,
+// JSON-NEXT: "col": 8,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9511,
+// JSON-NEXT: "col": 3,
+// JSON-NEXT: "tokLen": 4
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9537,
+// JSON-NEXT: "line": 232,
+// JSON-NEXT: "col": 3,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "name": "f",
+// JSON-NEXT: "mangledName": "_ZN8GH1535401fEv",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void ()"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CompoundStmt",
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9520,
+// JSON-NEXT: "line": 230,
+// JSON-NEXT: "col": 12,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9537,
+// JSON-NEXT: "line": 232,
+// JSON-NEXT: "col": 3,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXFunctionalCastExpr",
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9526,
+// JSON-NEXT: "line": 231,
+// JSON-NEXT: "col": 5,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9532,
+// JSON-NEXT: "col": 11,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "desugaredQualType": "GH153540::N::S<int>",
+// JSON-NEXT: "qualType": "N::S<int>"
+// JSON-NEXT: },
+// JSON-NEXT: "valueCategory": "prvalue",
+// JSON-NEXT: "castKind": "ConstructorConversion",
+// JSON-NEXT: "conversionFunc": {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXConstructorDecl",
+// JSON-NEXT: "name": "S",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "void (int)"
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "CXXConstructExpr",
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9526,
+// JSON-NEXT: "col": 5,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9532,
+// JSON-NEXT: "col": 11,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "desugaredQualType": "GH153540::N::S<int>",
+// JSON-NEXT: "qualType": "N::S<int>"
+// JSON-NEXT: },
+// JSON-NEXT: "valueCategory": "prvalue",
+// JSON-NEXT: "ctorType": {
+// JSON-NEXT: "qualType": "void (int)"
+// JSON-NEXT: },
+// JSON-NEXT: "hadMultipleCandidates": true,
+// JSON-NEXT: "constructionKind": "complete",
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "IntegerLiteral",
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {
+// JSON-NEXT: "offset": 9531,
+// JSON-NEXT: "col": 10,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: },
+// JSON-NEXT: "end": {
+// JSON-NEXT: "offset": 9531,
+// JSON-NEXT: "col": 10,
+// JSON-NEXT: "tokLen": 1
+// JSON-NEXT: }
+// JSON-NEXT: },
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "int"
+// JSON-NEXT: },
+// JSON-NEXT: "valueCategory": "prvalue",
+// JSON-NEXT: "value": "0"
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: }
+// JSON-NEXT: ]
// JSON-NEXT: }
// JSON-NEXT: ]
// JSON-NEXT: }
>From c720d302931ed7fa5588cfd93dd34e305b5e6bc6 Mon Sep 17 00:00:00 2001
From: Michael Berg <93234525+mcberg2021 at users.noreply.github.com>
Date: Thu, 14 Aug 2025 12:50:17 -0700
Subject: [PATCH 10/53] [LoopDist] Consider reads and writes together for
runtime checks (#145623)
Emit safety guards for pointer accesses when cross-partition loads exist
that have a corresponding store to the same address in a different
partition. This emits the necessary pointer checks for these accesses.
The test case was obtained from SuperTest, which SiFive runs regularly.
We enabled LoopDistribution by default in our downstream compiler, and
this change was part of that enablement.
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 11 +-
llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 6 +-
.../LoopDistribute/cross-partition-access.ll | 159 ++++++++++++++++++
3 files changed, 169 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/Transforms/LoopDistribute/cross-partition-access.ll
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a5535339a714f..62baf9b632bc7 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2404,12 +2404,13 @@ bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
SmallVector<Instruction *, 4>
MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
MemAccessInfo Access(Ptr, IsWrite);
- auto &IndexVector = Accesses.find(Access)->second;
-
+ auto I = Accesses.find(Access);
SmallVector<Instruction *, 4> Insts;
- transform(IndexVector,
- std::back_inserter(Insts),
- [&](unsigned Idx) { return this->InstMap[Idx]; });
+ if (I != Accesses.end()) {
+ transform(I->second, std::back_inserter(Insts),
+ [&](unsigned Idx) { return this->InstMap[Idx]; });
+ }
+
return Insts;
}
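The fix above replaces an unconditional dereference of `Accesses.find(Access)->second` with a guarded lookup that returns an empty list when the access was never recorded. In Python terms (container and function names are hypothetical stand-ins for the C++ members):

```python
def instructions_for_access(accesses, inst_map, ptr, is_write):
    """Return the instructions recorded for (ptr, is_write), or an empty
    list when the access is absent -- the guarded behaviour the hunk above
    introduces (previously a missing key was dereferenced unchecked)."""
    index_vector = accesses.get((ptr, is_write))
    if index_vector is None:
        return []
    return [inst_map[idx] for idx in index_vector]
```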
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0ac1a15981770..27d3004d81947 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -502,8 +502,10 @@ class InstPartitionContainer {
SmallVector<int, 8> PtrToPartitions(N);
for (unsigned I = 0; I < N; ++I) {
Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
- auto Instructions =
- LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+ auto Instructions = LAI.getInstructionsForAccess(Ptr, /* IsWrite */ true);
+ auto ReadInstructions =
+ LAI.getInstructionsForAccess(Ptr, /* IsWrite */ false);
+ Instructions.append(ReadInstructions.begin(), ReadInstructions.end());
int &Partition = PtrToPartitions[I];
// First set it to uninitialized.
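Instead of querying only the recorded read-or-write kind for each pointer, the hunk above now collects both kinds and concatenates them, so a load in one partition is checked against a store to the same address in another. A sketch under that reading, with a tiny stand-in for `LoopAccessInfo` (all names hypothetical):

```python
class StubLAI:
    """Tiny stand-in for LoopAccessInfo, just enough for the sketch."""
    def __init__(self, table):
        # table maps (ptr, is_write) -> list of instructions
        self.table = table

    def get_instructions_for_access(self, ptr, is_write):
        return list(self.table.get((ptr, is_write), []))

def cross_partition_instructions(lai, ptr):
    """Gather writes first, then reads, mirroring the append order in the
    hunk above, so runtime checks cover load/store pairs that were split
    across partitions."""
    insts = lai.get_instructions_for_access(ptr, is_write=True)
    insts += lai.get_instructions_for_access(ptr, is_write=False)
    return insts
```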
diff --git a/llvm/test/Transforms/LoopDistribute/cross-partition-access.ll b/llvm/test/Transforms/LoopDistribute/cross-partition-access.ll
new file mode 100644
index 0000000000000..6e1106c3277a7
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/cross-partition-access.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S %s | FileCheck %s
+
+; Test that we emit safety guards for ptr accesses of %a and %c on a cross-partition
+; load which has a corresponding store to the same address. This ensures that if %a
+; and %c overlap in some scenarios, we execute the original loop for safety reasons.
+
+define dso_local void @_Z13distribution3PiS_S_S_i(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, ptr nocapture noundef %c, ptr nocapture noundef writeonly %d, i64 noundef signext %len) {
+; CHECK-LABEL: define dso_local void @_Z13distribution3PiS_S_S_i(
+; CHECK-SAME: ptr noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef captures(none) [[C:%.*]], ptr noundef writeonly captures(none) [[D:%.*]], i64 noundef signext [[LEN:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[LEN]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[END:.*]], label %[[FOR_BODY_LVER_CHECK:.*]]
+; CHECK: [[FOR_BODY_LVER_CHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[LEN]], 2
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 -4
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP5]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND1]], [[BOUND0]]
+; CHECK-NEXT: [[BOUND07:%.*]] = icmp ult ptr [[A]], [[SCEVGEP3]]
+; CHECK-NEXT: [[BOUND18:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
+; CHECK-NEXT: [[FOUND_CONFLICT9:%.*]] = and i1 [[BOUND07]], [[BOUND18]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT9]]
+; CHECK-NEXT: [[BOUND010:%.*]] = icmp ult ptr [[A]], [[SCEVGEP4]]
+; CHECK-NEXT: [[BOUND111:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
+; CHECK-NEXT: [[CONFLICT_RDX13:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT12]]
+; CHECK-NEXT: [[BOUND014:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-NEXT: [[BOUND115:%.*]] = icmp ult ptr [[D]], [[SCEVGEP5]]
+; CHECK-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; CHECK-NEXT: [[CONFLICT_RDX17:%.*]] = or i1 [[CONFLICT_RDX13]], [[FOUND_CONFLICT16]]
+; CHECK-NEXT: [[BOUND018:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
+; CHECK-NEXT: [[BOUND119:%.*]] = icmp ult ptr [[B]], [[SCEVGEP3]]
+; CHECK-NEXT: [[FOUND_CONFLICT20:%.*]] = and i1 [[BOUND018]], [[BOUND119]]
+; CHECK-NEXT: [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX17]], [[FOUND_CONFLICT20]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX21]], label %[[FOR_BODY_PH_LVER_ORIG:.*]], label %[[FOR_BODY_PH_LDIST1:.*]]
+; CHECK: [[FOR_BODY_PH_LVER_ORIG]]:
+; CHECK-NEXT: br label %[[FOR_BODY_LVER_ORIG:.*]]
+; CHECK: [[FOR_BODY_LVER_ORIG]]:
+; CHECK-NEXT: [[IDXPROM_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LVER_ORIG]] ], [ [[I6_LVER_ORIG:%.*]], %[[FOR_BODY_LVER_ORIG]] ]
+; CHECK-NEXT: [[ARRAYIDX_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM_LVER_ORIG]]
+; CHECK-NEXT: [[I2_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDX_LVER_ORIG]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT: [[ADD4_LVER_ORIG:%.*]] = add nsw i32 [[I2_LVER_ORIG]], 1
+; CHECK-NEXT: [[ARRAYIDX8_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM_LVER_ORIG]]
+; CHECK-NEXT: store i32 [[ADD4_LVER_ORIG]], ptr [[ARRAYIDX8_LVER_ORIG]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[I3_LVER_ORIG:%.*]] = getelementptr i32, ptr [[C]], i64 [[IDXPROM_LVER_ORIG]]
+; CHECK-NEXT: [[ARRAYIDX17_LVER_ORIG:%.*]] = getelementptr i8, ptr [[I3_LVER_ORIG]], i64 -4
+; CHECK-NEXT: [[I4_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDX17_LVER_ORIG]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[SUB18_LVER_ORIG:%.*]] = sub nsw i32 [[ADD4_LVER_ORIG]], [[I4_LVER_ORIG]]
+; CHECK-NEXT: store i32 [[SUB18_LVER_ORIG]], ptr [[I3_LVER_ORIG]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[I5_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDX8_LVER_ORIG]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[ADD27_LVER_ORIG:%.*]] = add nsw i32 [[I5_LVER_ORIG]], 2
+; CHECK-NEXT: [[ARRAYIDX31_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IDXPROM_LVER_ORIG]]
+; CHECK-NEXT: store i32 [[ADD27_LVER_ORIG]], ptr [[ARRAYIDX31_LVER_ORIG]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[I6_LVER_ORIG]] = add i64 [[IDXPROM_LVER_ORIG]], 1
+; CHECK-NEXT: [[CMP1_NOT_LVER_ORIG:%.*]] = icmp eq i64 [[I6_LVER_ORIG]], [[LEN]]
+; CHECK-NEXT: br i1 [[CMP1_NOT_LVER_ORIG]], label %[[END_LOOPEXIT_LOOPEXIT:.*]], label %[[FOR_BODY_LVER_ORIG]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[FOR_BODY_PH_LDIST1]]:
+; CHECK-NEXT: br label %[[FOR_BODY_LDIST1:.*]]
+; CHECK: [[FOR_BODY_LDIST1]]:
+; CHECK-NEXT: [[IDXPROM_LDIST1:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LDIST1]] ], [ [[I6_LDIST1:%.*]], %[[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT: [[ARRAYIDX_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM_LDIST1]]
+; CHECK-NEXT: [[I2_LDIST1:%.*]] = load i32, ptr [[ARRAYIDX_LDIST1]], align 4, !tbaa [[TBAA0]], !alias.scope [[META6:![0-9]+]]
+; CHECK-NEXT: [[ADD4_LDIST1:%.*]] = add nsw i32 [[I2_LDIST1]], 1
+; CHECK-NEXT: [[ARRAYIDX8_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM_LDIST1]]
+; CHECK-NEXT: store i32 [[ADD4_LDIST1]], ptr [[ARRAYIDX8_LDIST1]], align 4, !tbaa [[TBAA0]], !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]]
+; CHECK-NEXT: [[I3_LDIST1:%.*]] = getelementptr i32, ptr [[C]], i64 [[IDXPROM_LDIST1]]
+; CHECK-NEXT: [[ARRAYIDX17_LDIST1:%.*]] = getelementptr i8, ptr [[I3_LDIST1]], i64 -4
+; CHECK-NEXT: [[I4_LDIST1:%.*]] = load i32, ptr [[ARRAYIDX17_LDIST1]], align 4, !tbaa [[TBAA0]], !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]]
+; CHECK-NEXT: [[SUB18_LDIST1:%.*]] = sub nsw i32 [[ADD4_LDIST1]], [[I4_LDIST1]]
+; CHECK-NEXT: store i32 [[SUB18_LDIST1]], ptr [[I3_LDIST1]], align 4, !tbaa [[TBAA0]], !alias.scope [[META14]], !noalias [[META15]]
+; CHECK-NEXT: [[I6_LDIST1]] = add i64 [[IDXPROM_LDIST1]], 1
+; CHECK-NEXT: [[CMP1_NOT_LDIST1:%.*]] = icmp eq i64 [[I6_LDIST1]], [[LEN]]
+; CHECK-NEXT: br i1 [[CMP1_NOT_LDIST1]], label %[[FOR_BODY_PH:.*]], label %[[FOR_BODY_LDIST1]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: [[FOR_BODY_PH]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IDXPROM:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH]] ], [ [[I6:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[I5:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA0]], !alias.scope [[META9]], !noalias [[META11]]
+; CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 [[I5]], 2
+; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IDXPROM]]
+; CHECK-NEXT: store i32 [[ADD27]], ptr [[ARRAYIDX31]], align 4, !tbaa [[TBAA0]], !alias.scope [[META15]], !noalias [[META6]]
+; CHECK-NEXT: [[I6]] = add i64 [[IDXPROM]], 1
+; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[I6]], [[LEN]]
+; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[END_LOOPEXIT_LOOPEXIT20:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP16]]
+; CHECK: [[END_LOOPEXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[END_LOOPEXIT:.*]]
+; CHECK: [[END_LOOPEXIT_LOOPEXIT20]]:
+; CHECK-NEXT: br label %[[END_LOOPEXIT]]
+; CHECK: [[END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp sgt i64 %len, 0
+ br i1 %cmp, label %end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %i2 = load i32, ptr %arrayidx, align 4, !tbaa !0
+ %add4 = add nsw i32 %i2, 1
+ %arrayidx8 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %add4, ptr %arrayidx8, align 4, !tbaa !0
+ %i3 = getelementptr i32, ptr %c, i64 %indvars.iv
+ %arrayidx17 = getelementptr i8, ptr %i3, i64 -4
+ %i4 = load i32, ptr %arrayidx17, align 4, !tbaa !0
+ %sub18 = sub nsw i32 %add4, %i4
+ store i32 %sub18, ptr %i3, align 4, !tbaa !0
+ %i5 = load i32, ptr %arrayidx8, align 4, !tbaa !0
+ %add27 = add nsw i32 %i5, 2
+ %arrayidx31 = getelementptr inbounds i32, ptr %d, i64 %indvars.iv
+ store i32 %add27, ptr %arrayidx31, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %cmp1.not = icmp eq i64 %indvars.iv.next, %len
+ br i1 %cmp1.not, label %end.loopexit, label %for.body, !llvm.loop !4
+
+end.loopexit: ; preds = %for.body
+ br label %end
+
+end: ; preds = %end.loopexit, %entry
+ ret void
+}
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META6]] = !{[[META7:![0-9]+]]}
+; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]}
+; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"}
+; CHECK: [[META9]] = !{[[META10:![0-9]+]]}
+; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]}
+; CHECK: [[META11]] = !{[[META12:![0-9]+]], [[META13:![0-9]+]], [[META7]]}
+; CHECK: [[META12]] = distinct !{[[META12]], [[META8]]}
+; CHECK: [[META13]] = distinct !{[[META13]], [[META8]]}
+; CHECK: [[META14]] = !{[[META12]]}
+; CHECK: [[META15]] = !{[[META13]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META5]]}
+;.
From e7730fe71e5426442f2f6d1e2a89b1033de35d5a Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 12:52:00 -0700
Subject: [PATCH 11/53] [AMDGPU] Increase LDS to 320K on gfx1250 (#153645)
---
llvm/docs/AMDGPUUsage.rst | 2 +
llvm/lib/Target/AMDGPU/AMDGPU.td | 4 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6 +-
llvm/lib/Target/AMDGPU/AMDGPUFeatures.td | 1 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 +-
llvm/test/CodeGen/AMDGPU/extra-lds-size.ll | 7 ++
.../AMDGPU/lds-limit-diagnostics-gfx1250.ll | 13 ++++
.../CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll | 72 +++++++++++++++++++
.../CodeGen/AMDGPU/lds-size-pal-gfx1250.ll | 61 ++++++++++++++++
9 files changed, 168 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5343d66b083c7..8d0786ab0440d 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in
roundup(lds-size / (128 * 4))
GFX950
roundup(lds-size / (320 * 4))
+ GFX125*
+ roundup(lds-size / (256 * 4))
24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
_INVALID_OPERATION with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f26639847be75..8e4b6365dc06b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
"gfx12",
- [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128,
+ [FeatureFP64, FeatureMIMG_R128,
FeatureFlatAddressSpace, Feature16BitInsts,
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
@@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
+ FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureDot7Insts,
@@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
FeatureCUStores,
+ FeatureAddressableLocalMemorySize327680,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 626734a4752f3..c7d2d268a2707 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1103,7 +1103,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
- if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+ if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
+ // LDS is allocated in 256 dword blocks.
+ LDSAlignShift = 10;
+ } else if (STM.getFeatureBits().test(
+ FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 74d1faeb6f545..d14b5ce80d28e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
+def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e0ac040bdd226..ec9f1abdd8467 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
+ if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
+ return 327680;
return 0;
}
@@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
}
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
- // Currently this is 128 for all subtargets
- return 128;
+ return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
+ : 128;
}
bool isPackedFP32Inst(unsigned Opc) {
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e1ce5341efdd1..4349b18fd394c 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
@@ -29,6 +31,11 @@
; GFX1200-MESA: .long 45100
; GFX1200-MESA-NEXT: .long 1024
+; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX1250-MESA: .long 45100
+; GFX1250-MESA-NEXT: .long 512
+
@lds = internal addrspace(3) global [4096 x i8] poison
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
new file mode 100644
index 0000000000000..da92dcdd7104e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX1250 supports up to 320 KB of LDS memory.
+; This is a negative test checking the diagnostic when the LDS size exceeds the maximum usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
+@dst = addrspace(3) global [81921 x i32] undef
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+ %gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+ store i32 %val, ptr addrspace(3) %gep
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
new file mode 100644
index 0000000000000..3db0fa8f21759
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
+
+; GFX1250 supports up to 320 KB of configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+; GCN-LABEL: test_lds_i8:
+; GCN: .amdhsa_group_segment_fixed_size 1
+; GCN: ; LDSByteSize: 1 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_i16:
+; GCN: .amdhsa_group_segment_fixed_size 2
+; GCN: ; LDSByteSize: 2 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_i32:
+; GCN: .amdhsa_group_segment_fixed_size 4
+; GCN: ; LDSByteSize: 4 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i8:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i16:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i32:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
new file mode 100644
index 0000000000000..bfa7d37ce63a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
+
+; GFX1250 supports up to 320 KB of configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i16:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i32:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i8:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_i16:
+; PAL: .lds_size: 0x2
+; PAL: test_lds_i32:
+; PAL: .lds_size: 0x4
+; PAL: test_lds_i8:
+; PAL: .lds_size: 0x1
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+define amdgpu_gfx void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
From 855fe3bbf5514dcf3cbdbe70e24b166f379326ce Mon Sep 17 00:00:00 2001
From: CatherineMoore <catmoore at amd.com>
Date: Thu, 14 Aug 2025 16:04:03 -0400
Subject: [PATCH 12/53] [OpenMP] Update printf stmt in kmp_settings.cpp
(#152800)
Remove the extraneous `name` argument from two __kmp_str_buf_print calls; the single conversion in each format string consumed `name` instead of the intended `ct_name`/`eff` value.
---------
Co-authored-by: Joachim <protze at rz.rwth-aachen.de>
---
openmp/runtime/src/kmp_settings.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 31342c8c6203d..b9d615f43b570 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -3278,9 +3278,9 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
int eff = __kmp_affinity.core_attr_gran.core_eff;
if (ct != KMP_HW_CORE_TYPE_UNKNOWN) {
const char *ct_name = __kmp_hw_get_core_type_keyword(ct);
- __kmp_str_buf_print(buffer, ":%s", name, ct_name);
+ __kmp_str_buf_print(buffer, ":%s", ct_name);
} else if (eff >= 0 && eff < KMP_HW_MAX_NUM_CORE_EFFS) {
- __kmp_str_buf_print(buffer, ":eff%d", name, eff);
+ __kmp_str_buf_print(buffer, ":eff%d", eff);
}
}
From 9c0722d52a6d6730c3b9577afd5c1f3cb10345d1 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 14 Aug 2025 13:07:38 -0700
Subject: [PATCH 13/53] [Clang][attr] Add 'cfi_salt' attribute (#141846)
The 'cfi_salt' attribute specifies a string literal that is used as a
"salt" for Control-Flow Integrity (CFI) checks to distinguish between
functions with the same type signature. This attribute can be applied
to function declarations, function definitions, and function pointer
typedefs.
This attribute prevents function pointers from being replaced with
pointers to functions that have a compatible type, which can be a CFI
bypass vector.
The attribute affects type compatibility during compilation and CFI
hash generation during code generation.
Attribute syntax: [[clang::cfi_salt("<salt_string>")]]
GNU-style syntax: __attribute__((cfi_salt("<salt_string>")))
- The attribute takes a single string of non-NUL ASCII characters.
- It only applies to function types; using it on a non-function type
will generate an error.
- All function declarations and the function definition must include
the attribute and use identical salt values.
Example usage:
  // Header file:
  #define __cfi_salt(S) __attribute__((cfi_salt(S)))

  // Convenient typedefs to avoid nested declarator syntax.
  typedef int (*fp_unsalted_t)(void);
  typedef int (*fp_salted_t)(void) __cfi_salt("pepper");

  struct widget_ops {
    fp_unsalted_t init;     // Regular CFI.
    fp_salted_t exec;       // Salted CFI.
    fp_unsalted_t teardown; // Regular CFI.
  };

  // bar.c file:
  static int bar_init(void) { ... }
  static int bar_salted_exec(void) __cfi_salt("pepper") { ... }
  static int bar_teardown(void) { ... }

  static struct widget_ops _generator = {
    .init = bar_init,
    .exec = bar_salted_exec,
    .teardown = bar_teardown,
  };
  struct widget_ops *widget_gen = &_generator;

  // 2nd .c file:
  int generate_a_widget(void) {
    int ret;

    // Called with non-salted CFI.
    ret = widget_gen->init();
    if (ret)
      return ret;

    // Called with salted CFI.
    ret = widget_gen->exec();
    if (ret)
      return ret;

    // Called with non-salted CFI.
    return widget_gen->teardown();
  }
Link: https://github.com/ClangBuiltLinux/linux/issues/1736
Link: https://github.com/KSPP/linux/issues/365
---------
Signed-off-by: Bill Wendling <morbo at google.com>
Co-authored-by: Aaron Ballman <aaron at aaronballman.com>
---
clang/include/clang/AST/Type.h | 66 +++++-
clang/include/clang/Basic/Attr.td | 8 +
clang/include/clang/Basic/AttrDocs.td | 93 +++++++++
clang/lib/AST/ASTContext.cpp | 7 +
clang/lib/AST/Type.cpp | 14 +-
clang/lib/AST/TypePrinter.cpp | 3 +
clang/lib/CodeGen/CodeGenFunction.cpp | 14 +-
clang/lib/CodeGen/CodeGenModule.cpp | 15 +-
clang/lib/CodeGen/CodeGenModule.h | 2 +-
clang/lib/Sema/SemaType.cpp | 31 +++
clang/test/CodeGen/cfi-salt.c | 188 ++++++++++++++++++
...a-attribute-supported-attributes-list.test | 1 +
clang/test/Sema/attr-cfi-salt.c | 60 ++++++
13 files changed, 482 insertions(+), 20 deletions(-)
create mode 100644 clang/test/CodeGen/cfi-salt.c
create mode 100644 clang/test/Sema/attr-cfi-salt.c
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index c4c23c835ebc2..cfbb9de4f2a06 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -4687,6 +4687,9 @@ class FunctionType : public Type {
/// [implimits] 8 bits would be enough here.
unsigned NumExceptionType : 10;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned HasExtraAttributeInfo : 1;
+
LLVM_PREFERRED_TYPE(bool)
unsigned HasArmTypeAttributes : 1;
@@ -4695,14 +4698,26 @@ class FunctionType : public Type {
unsigned NumFunctionEffects : 4;
FunctionTypeExtraBitfields()
- : NumExceptionType(0), HasArmTypeAttributes(false),
- EffectsHaveConditions(false), NumFunctionEffects(0) {}
+ : NumExceptionType(0), HasExtraAttributeInfo(false),
+ HasArmTypeAttributes(false), EffectsHaveConditions(false),
+ NumFunctionEffects(0) {}
+ };
+
+ /// A holder for extra information from attributes which aren't part of an
+ /// \p AttributedType.
+ struct alignas(void *) FunctionTypeExtraAttributeInfo {
+ /// A CFI "salt" that differentiates functions with the same prototype.
+ StringRef CFISalt;
+
+ operator bool() const { return !CFISalt.empty(); }
+
+ void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddString(CFISalt); }
};
/// The AArch64 SME ACLE (Arm C/C++ Language Extensions) define a number
/// of function type attributes that can be set on function types, including
/// function pointers.
- enum AArch64SMETypeAttributes : unsigned {
+ enum AArch64SMETypeAttributes : uint16_t {
SME_NormalFunction = 0,
SME_PStateSMEnabledMask = 1 << 0,
SME_PStateSMCompatibleMask = 1 << 1,
@@ -4732,11 +4747,11 @@ class FunctionType : public Type {
};
static ArmStateValue getArmZAState(unsigned AttrBits) {
- return (ArmStateValue)((AttrBits & SME_ZAMask) >> SME_ZAShift);
+ return static_cast<ArmStateValue>((AttrBits & SME_ZAMask) >> SME_ZAShift);
}
static ArmStateValue getArmZT0State(unsigned AttrBits) {
- return (ArmStateValue)((AttrBits & SME_ZT0Mask) >> SME_ZT0Shift);
+ return static_cast<ArmStateValue>((AttrBits & SME_ZT0Mask) >> SME_ZT0Shift);
}
/// A holder for Arm type attributes as described in the Arm C/C++
@@ -4745,6 +4760,7 @@ class FunctionType : public Type {
struct alignas(void *) FunctionTypeArmAttributes {
/// Any AArch64 SME ACLE type attributes that need to be propagated
/// on declarations and function pointers.
+ LLVM_PREFERRED_TYPE(AArch64SMETypeAttributes)
unsigned AArch64SMEAttributes : 9;
FunctionTypeArmAttributes() : AArch64SMEAttributes(SME_NormalFunction) {}
@@ -5226,6 +5242,7 @@ class FunctionProtoType final
private llvm::TrailingObjects<
FunctionProtoType, QualType, SourceLocation,
FunctionType::FunctionTypeExtraBitfields,
+ FunctionType::FunctionTypeExtraAttributeInfo,
FunctionType::FunctionTypeArmAttributes, FunctionType::ExceptionType,
Expr *, FunctionDecl *, FunctionType::ExtParameterInfo, Qualifiers,
FunctionEffect, EffectConditionExpr> {
@@ -5315,19 +5332,22 @@ class FunctionProtoType final
/// the various bits of extra information about a function prototype.
struct ExtProtoInfo {
FunctionType::ExtInfo ExtInfo;
+ Qualifiers TypeQuals;
+ RefQualifierKind RefQualifier = RQ_None;
+ ExceptionSpecInfo ExceptionSpec;
+ const ExtParameterInfo *ExtParameterInfos = nullptr;
+ SourceLocation EllipsisLoc;
+ FunctionEffectsRef FunctionEffects;
+ FunctionTypeExtraAttributeInfo ExtraAttributeInfo;
+
LLVM_PREFERRED_TYPE(bool)
unsigned Variadic : 1;
LLVM_PREFERRED_TYPE(bool)
unsigned HasTrailingReturn : 1;
LLVM_PREFERRED_TYPE(bool)
unsigned CFIUncheckedCallee : 1;
+ LLVM_PREFERRED_TYPE(AArch64SMETypeAttributes)
unsigned AArch64SMEAttributes : 9;
- Qualifiers TypeQuals;
- RefQualifierKind RefQualifier = RQ_None;
- ExceptionSpecInfo ExceptionSpec;
- const ExtParameterInfo *ExtParameterInfos = nullptr;
- SourceLocation EllipsisLoc;
- FunctionEffectsRef FunctionEffects;
ExtProtoInfo()
: Variadic(false), HasTrailingReturn(false), CFIUncheckedCallee(false),
@@ -5352,6 +5372,7 @@ class FunctionProtoType final
bool requiresFunctionProtoTypeExtraBitfields() const {
return ExceptionSpec.Type == EST_Dynamic ||
requiresFunctionProtoTypeArmAttributes() ||
+ requiresFunctionProtoTypeExtraAttributeInfo() ||
!FunctionEffects.empty();
}
@@ -5359,6 +5380,10 @@ class FunctionProtoType final
return AArch64SMEAttributes != SME_NormalFunction;
}
+ bool requiresFunctionProtoTypeExtraAttributeInfo() const {
+ return static_cast<bool>(ExtraAttributeInfo);
+ }
+
void setArmSMEAttribute(AArch64SMETypeAttributes Kind, bool Enable = true) {
if (Enable)
AArch64SMEAttributes |= Kind;
@@ -5384,6 +5409,11 @@ class FunctionProtoType final
return hasExtraBitfields();
}
+ unsigned
+ numTrailingObjects(OverloadToken<FunctionTypeExtraAttributeInfo>) const {
+ return hasExtraAttributeInfo();
+ }
+
unsigned numTrailingObjects(OverloadToken<ExceptionType>) const {
return getExceptionSpecSize().NumExceptionType;
}
@@ -5476,6 +5506,12 @@ class FunctionProtoType final
}
+ bool hasExtraAttributeInfo() const {
+ return FunctionTypeBits.HasExtraBitfields &&
+ getTrailingObjects<FunctionTypeExtraBitfields>()
+ ->HasExtraAttributeInfo;
+ }
+
bool hasArmTypeAttributes() const {
return FunctionTypeBits.HasExtraBitfields &&
getTrailingObjects<FunctionTypeExtraBitfields>()
@@ -5509,6 +5545,7 @@ class FunctionProtoType final
EPI.TypeQuals = getMethodQuals();
EPI.RefQualifier = getRefQualifier();
EPI.ExtParameterInfos = getExtParameterInfosOrNull();
+ EPI.ExtraAttributeInfo = getExtraAttributeInfo();
EPI.AArch64SMEAttributes = getAArch64SMEAttributes();
EPI.FunctionEffects = getFunctionEffects();
return EPI;
@@ -5696,6 +5733,13 @@ class FunctionProtoType final
return getTrailingObjects<ExtParameterInfo>();
}
+ /// Return the extra attribute information.
+ FunctionTypeExtraAttributeInfo getExtraAttributeInfo() const {
+ if (hasExtraAttributeInfo())
+ return *getTrailingObjects<FunctionTypeExtraAttributeInfo>();
+ return FunctionTypeExtraAttributeInfo();
+ }
+
/// Return a bitmask describing the SME attributes on the function type, see
/// AArch64SMETypeAttributes for their values.
unsigned getAArch64SMEAttributes() const {
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index a9fa4a8f07454..8c8e0b3bca46c 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3922,6 +3922,14 @@ def CFICanonicalJumpTable : InheritableAttr {
let SimpleHandler = 1;
}
+def CFISalt : TypeAttr {
+ let Spellings = [Clang<"cfi_salt">];
+ let Args = [StringArgument<"Salt">];
+ let Subjects = SubjectList<[FunctionLike], ErrorDiag>;
+ let Documentation = [CFISaltDocs];
+ let LangOpts = [COnly];
+}
+
// C/C++ Thread safety attributes (e.g. for deadlock, data race checking)
// Not all of these attributes will be given a [[]] spelling. The attributes
// which require access to function parameter names cannot use the [[]] spelling
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2b095ab975202..00e8fc0787884 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3646,6 +3646,99 @@ make the function's CFI jump table canonical. See :ref:`the CFI documentation
}];
}
+def CFISaltDocs : Documentation {
+ let Category = DocCatFunction;
+ let Heading = "cfi_salt";
+ let Label = "langext-cfi_salt";
+ let Content = [{
+The ``cfi_salt`` attribute specifies a string literal that is used as a salt
+for Control-Flow Integrity (CFI) checks to distinguish between functions with
+the same type signature. This attribute can be applied to function declarations,
+function definitions, and function pointer typedefs.
+
+The attribute prevents a function pointer from being replaced with a pointer to
+a different function that merely has a compatible type, which would otherwise
+be a CFI bypass vector.
+
+**Syntax:**
+
+* GNU-style: ``__attribute__((cfi_salt("<salt_string>")))``
+* C++11-style: ``[[clang::cfi_salt("<salt_string>")]]``
+
+**Usage:**
+
+The attribute takes a single string literal argument that serves as the salt.
+Functions or function types with different salt values will have different CFI
+hashes, even if they have identical type signatures.
+
+**Motivation:**
+
+In large codebases like the Linux kernel, there are often hundreds of functions
+with identical type signatures that are called indirectly:
+
+.. code-block::
+
+ 1662 functions with void (*)(void)
+ 1179 functions with int (*)(void)
+ ...
+
+By salting the CFI hashes, you can make CFI more robust by ensuring that
+functions intended for different purposes have distinct CFI identities.
+
+**Type Compatibility:**
+
+* Functions with different salt values are considered to have incompatible types
+* Function pointers with different salt values cannot be assigned to each other
+* All declarations of the same function must use the same salt value
+
+**Example:**
+
+.. code-block:: c
+
+ // Header file - define convenience macros
+ #define __cfi_salt(s) __attribute__((cfi_salt(s)))
+
+ // Typedef for regular function pointers
+ typedef int (*fptr_t)(void);
+
+ // Typedef for salted function pointers
+ typedef int (*fptr_salted_t)(void) __cfi_salt("pepper");
+
+ struct widget_ops {
+ fptr_t init; // Regular CFI
+ fptr_salted_t exec; // Salted CFI
+ fptr_t cleanup; // Regular CFI
+ };
+
+ // Function implementations
+ static int widget_init(void) { return 0; }
+ static int widget_exec(void) __cfi_salt("pepper") { return 1; }
+ static int widget_cleanup(void) { return 0; }
+
+ static struct widget_ops ops = {
+ .init = widget_init, // OK - compatible types
+ .exec = widget_exec, // OK - both use "pepper" salt
+ .cleanup = widget_cleanup // OK - compatible types
+ };
+
+ // Using C++11 attribute syntax
+ void secure_callback(void) [[clang::cfi_salt("secure")]];
+
+ // This would cause a compilation error:
+ // fptr_t bad_ptr = widget_exec; // Error: incompatible types
+
+**Notes:**
+
+* The salt string can contain non-NUL ASCII characters, including spaces and
+  quotes
+* This attribute only applies to function types; using it on non-function
+ types will generate a warning
+* All declarations and definitions of the same function must use identical
+ salt values
+* The attribute affects type compatibility during compilation and CFI hash
+ generation during code generation
+ }];
+}
+
def DocCatTypeSafety : DocumentationCategory<"Type Safety Checking"> {
let Content = [{
Clang supports additional attributes to enable checking type safety properties
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 95dd42681d870..2f2685495a8f1 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -5128,10 +5128,12 @@ QualType ASTContext::getFunctionTypeInternal(
EPI.ExceptionSpec.Type, EPI.ExceptionSpec.Exceptions.size());
size_t Size = FunctionProtoType::totalSizeToAlloc<
QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields,
+ FunctionType::FunctionTypeExtraAttributeInfo,
FunctionType::FunctionTypeArmAttributes, FunctionType::ExceptionType,
Expr *, FunctionDecl *, FunctionProtoType::ExtParameterInfo, Qualifiers,
FunctionEffect, EffectConditionExpr>(
NumArgs, EPI.Variadic, EPI.requiresFunctionProtoTypeExtraBitfields(),
+ EPI.requiresFunctionProtoTypeExtraAttributeInfo(),
EPI.requiresFunctionProtoTypeArmAttributes(), ESH.NumExceptionType,
ESH.NumExprPtr, ESH.NumFunctionDeclPtr,
EPI.ExtParameterInfos ? NumArgs : 0,
@@ -11552,6 +11554,11 @@ QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs,
if (lproto->getMethodQuals() != rproto->getMethodQuals())
return {};
+ // Function protos with different 'cfi_salt' values aren't compatible.
+ if (lproto->getExtraAttributeInfo().CFISalt !=
+ rproto->getExtraAttributeInfo().CFISalt)
+ return {};
+
// Function effects are handled similarly to noreturn, see above.
FunctionEffectsRef LHSFX = lproto->getFunctionEffects();
FunctionEffectsRef RHSFX = rproto->getFunctionEffects();
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index c382e58cb07c4..f7949e94d227e 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3751,6 +3751,16 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef<QualType> params,
FunctionTypeBits.HasExtraBitfields = false;
}
+ // Propagate any extra attribute information.
+ if (epi.requiresFunctionProtoTypeExtraAttributeInfo()) {
+ auto &ExtraAttrInfo = *getTrailingObjects<FunctionTypeExtraAttributeInfo>();
+ ExtraAttrInfo.CFISalt = epi.ExtraAttributeInfo.CFISalt;
+
+ // Also set the bit in FunctionTypeExtraBitfields.
+ auto &ExtraBits = *getTrailingObjects<FunctionTypeExtraBitfields>();
+ ExtraBits.HasExtraAttributeInfo = true;
+ }
+
if (epi.requiresFunctionProtoTypeArmAttributes()) {
auto &ArmTypeAttrs = *getTrailingObjects<FunctionTypeArmAttributes>();
ArmTypeAttrs = FunctionTypeArmAttributes();
@@ -3968,7 +3978,8 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result,
// This is followed by the ext info:
// int
// Finally we have a trailing return type flag (bool)
- // combined with AArch64 SME Attributes, to save space:
+ // combined with AArch64 SME Attributes and extra attribute info, to save
+ // space:
// int
// combined with any FunctionEffects
//
@@ -4003,6 +4014,7 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result,
}
epi.ExtInfo.Profile(ID);
+ epi.ExtraAttributeInfo.Profile(ID);
unsigned EffectCount = epi.FunctionEffects.size();
bool HasConds = !epi.FunctionEffects.Conditions.empty();
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index ce5870e2da690..85242b69f0679 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -2154,6 +2154,9 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
case attr::ExtVectorType:
OS << "ext_vector_type";
break;
+ case attr::CFISalt:
+ OS << "cfi_salt(\"" << cast<CFISaltAttr>(T->getAttr())->getSalt() << "\")";
+ break;
}
OS << "))";
}
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index d077ee50856b7..652fe672f15e3 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -2903,10 +2903,16 @@ void CodeGenFunction::EmitSanitizerStatReport(llvm::SanitizerStatKind SSK) {
void CodeGenFunction::EmitKCFIOperandBundle(
const CGCallee &Callee, SmallVectorImpl<llvm::OperandBundleDef> &Bundles) {
- const FunctionProtoType *FP =
- Callee.getAbstractInfo().getCalleeFunctionProtoType();
- if (FP)
- Bundles.emplace_back("kcfi", CGM.CreateKCFITypeId(FP->desugar()));
+ const CGCalleeInfo &CI = Callee.getAbstractInfo();
+ const FunctionProtoType *FP = CI.getCalleeFunctionProtoType();
+ if (!FP)
+ return;
+
+ StringRef Salt;
+ if (const auto &Info = FP->getExtraAttributeInfo())
+ Salt = Info.CFISalt;
+
+ Bundles.emplace_back("kcfi", CGM.CreateKCFITypeId(FP->desugar(), Salt));
}
llvm::Value *
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 2d37e0f13199b..414687640bf2d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2366,7 +2366,7 @@ static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty) {
llvm_unreachable("Encountered unknown FunctionType");
}
-llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T) {
+llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T, StringRef Salt) {
if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
T = GeneralizeFunctionType(getContext(), T);
if (auto *FnType = T->getAs<FunctionProtoType>())
@@ -2379,6 +2379,9 @@ llvm::ConstantInt *CodeGenModule::CreateKCFITypeId(QualType T) {
getCXXABI().getMangleContext().mangleCanonicalTypeName(
T, Out, getCodeGenOpts().SanitizeCfiICallNormalizeIntegers);
+ if (!Salt.empty())
+ Out << "." << Salt;
+
if (getCodeGenOpts().SanitizeCfiICallNormalizeIntegers)
Out << ".normalized";
if (getCodeGenOpts().SanitizeCfiICallGeneralizePointers)
@@ -3047,9 +3050,15 @@ void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD,
void CodeGenModule::setKCFIType(const FunctionDecl *FD, llvm::Function *F) {
llvm::LLVMContext &Ctx = F->getContext();
llvm::MDBuilder MDB(Ctx);
+ llvm::StringRef Salt;
+
+ if (const auto *FP = FD->getType()->getAs<FunctionProtoType>())
+ if (const auto &Info = FP->getExtraAttributeInfo())
+ Salt = Info.CFISalt;
+
F->setMetadata(llvm::LLVMContext::MD_kcfi_type,
- llvm::MDNode::get(
- Ctx, MDB.createConstant(CreateKCFITypeId(FD->getType()))));
+ llvm::MDNode::get(Ctx, MDB.createConstant(CreateKCFITypeId(
+ FD->getType(), Salt))));
}
static bool allowKCFIIdentifier(StringRef Name) {
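The salting scheme in `CreateKCFITypeId` above can be illustrated outside the compiler: the salt is appended to the mangled type name (separated by `.`) before hashing, so two functions with identical signatures but different salts receive distinct KCFI type ids. A minimal sketch, using SHA-256 as a stand-in for the hash Clang actually uses and a hypothetical mangled name:

```python
import hashlib

def kcfi_type_id(mangled_type: str, salt: str = "") -> int:
    """Sketch of a salted type-id: append "." + salt to the mangled
    name before hashing, mirroring the patch's `Out << "." << Salt`.
    (SHA-256 is a stand-in; this is not Clang's real hash.)"""
    if salt:
        mangled_type += "." + salt
    digest = hashlib.sha256(mangled_type.encode()).digest()
    # KCFI type ids are 32-bit, so keep only four bytes.
    return int.from_bytes(digest[:4], "little")

plain  = kcfi_type_id("_ZTSFivE")            # hypothetical int (*)(void)
salted = kcfi_type_id("_ZTSFivE", "pepper")  # same signature, salted
assert plain != salted
```

Note that an empty salt leaves the id unchanged, matching the `if (!Salt.empty())` guard in the patch.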
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index cb013feb769fc..705d9a3cb9de3 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1621,7 +1621,7 @@ class CodeGenModule : public CodeGenTypeCache {
llvm::ConstantInt *CreateCrossDsoCfiTypeId(llvm::Metadata *MD);
/// Generate a KCFI type identifier for T.
- llvm::ConstantInt *CreateKCFITypeId(QualType T);
+ llvm::ConstantInt *CreateKCFITypeId(QualType T, StringRef Salt);
/// Create a metadata identifier for the given type. This may either be an
/// MDString (for external identifiers) or a distinct unnamed MDNode (for
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 0985b5b565dab..d745cdbf0526f 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -156,6 +156,7 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
case ParsedAttr::AT_Allocating: \
case ParsedAttr::AT_Regparm: \
case ParsedAttr::AT_CFIUncheckedCallee: \
+ case ParsedAttr::AT_CFISalt: \
case ParsedAttr::AT_CmseNSCall: \
case ParsedAttr::AT_ArmStreaming: \
case ParsedAttr::AT_ArmStreamingCompatible: \
@@ -7986,6 +7987,36 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
return true;
}
+ if (attr.getKind() == ParsedAttr::AT_CFISalt) {
+ if (attr.getNumArgs() != 1)
+ return true;
+
+ StringRef Argument;
+ if (!S.checkStringLiteralArgumentAttr(attr, 0, Argument))
+ return true;
+
+ // Delay if this is not a function type.
+ if (!unwrapped.isFunctionType())
+ return false;
+
+ const auto *FnTy = unwrapped.get()->getAs<FunctionProtoType>();
+ if (!FnTy) {
+ S.Diag(attr.getLoc(), diag::err_attribute_wrong_decl_type)
+ << attr << attr.isRegularKeywordAttribute()
+ << ExpectedFunctionWithProtoType;
+ attr.setInvalid();
+ return true;
+ }
+
+ FunctionProtoType::ExtProtoInfo EPI = FnTy->getExtProtoInfo();
+ EPI.ExtraAttributeInfo.CFISalt = Argument;
+
+ QualType newtype = S.Context.getFunctionType(FnTy->getReturnType(),
+ FnTy->getParamTypes(), EPI);
+ type = unwrapped.wrap(S, newtype->getAs<FunctionType>());
+ return true;
+ }
+
if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible ||
attr.getKind() == ParsedAttr::AT_ArmPreserves ||
diff --git a/clang/test/CodeGen/cfi-salt.c b/clang/test/CodeGen/cfi-salt.c
new file mode 100644
index 0000000000000..7ba1e2fc14daa
--- /dev/null
+++ b/clang/test/CodeGen/cfi-salt.c
@@ -0,0 +1,188 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -DORIG_ATTR_SYN -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fpatchable-function-entry-offset=3 -DORIG_ATTR_SYN -o - %s | FileCheck %s --check-prefixes=CHECK,OFFSET
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fsanitize=kcfi -fpatchable-function-entry-offset=3 -o - %s | FileCheck %s --check-prefixes=CHECK,OFFSET
+
+// Note that the interleaving of functions, which would normally appear in
+// sequence, is because Clang emits them in a non-sequential order.
+
+#if !__has_feature(kcfi)
+#error Missing kcfi?
+#endif
+
+#ifdef ORIG_ATTR_SYN
+#define __cfi_salt __attribute__((cfi_salt("pepper")))
+#define __cfi_salt_empty __attribute__((cfi_salt("")))
+#else
+#define __cfi_salt [[clang::cfi_salt("pepper")]]
+#define __cfi_salt_empty [[clang::cfi_salt("")]]
+#endif
+
+typedef int (*fn_t)(void);
+typedef int (* __cfi_salt fn_salt_t)(void);
+typedef int (* __cfi_salt_empty fn_salt_empty_t)(void);
+
+typedef unsigned int (*ufn_t)(void);
+typedef unsigned int (* __cfi_salt ufn_salt_t)(void);
+
+/// Must emit __kcfi_typeid symbols for address-taken function declarations
+// CHECK: module asm ".weak __kcfi_typeid_[[F4:[a-zA-Z0-9_]+]]"
+// CHECK: module asm ".set __kcfi_typeid_[[F4]], [[#%d,LOW_SODIUM_HASH:]]"
+// CHECK: module asm ".weak __kcfi_typeid_[[F4_SALT:[a-zA-Z0-9_]+]]"
+// CHECK: module asm ".set __kcfi_typeid_[[F4_SALT]], [[#%d,ASM_SALTY_HASH:]]"
+
+/// Must not emit __kcfi_typeid symbols for non-address-taken declarations
+// CHECK-NOT: module asm ".weak __kcfi_typeid_f6"
+
+int f1(void);
+int f1_salt(void) __cfi_salt;
+
+unsigned int f2(void);
+unsigned int f2_salt(void) __cfi_salt;
+
+static int f3(void);
+static int f3_salt(void) __cfi_salt;
+
+extern int f4(void);
+extern int f4_salt(void) __cfi_salt;
+
+static int f5(void);
+static int f5_salt(void) __cfi_salt;
+
+extern int f6(void);
+extern int f6_salt(void) __cfi_salt;
+
+int f8(void);
+int f8_salt_empty(void) __cfi_salt_empty;
+
+struct cfi_struct {
+ fn_t __cfi_salt fptr;
+ fn_salt_t td_fptr;
+ fn_salt_empty_t td_empty_fptr;
+};
+
+int f7_salt(struct cfi_struct *ptr);
+int f7_typedef_salt(struct cfi_struct *ptr);
+
+// CHECK-LABEL: @__call
+// CHECK: call{{.*}} i32
+// CHECK-NOT: "kcfi"
+// CHECK-SAME: ()
+__attribute__((__no_sanitize__("kcfi")))
+int __call(fn_t f) {
+ return f();
+}
+
+// CHECK-LABEL: @call
+// CHECK: call{{.*}} i32 %{{.}}(){{.*}} [ "kcfi"(i32 [[#LOW_SODIUM_HASH]]) ]
+// CHECK-LABEL: @call_salt
+// CHECK: call{{.*}} i32 %{{.}}(){{.*}} [ "kcfi"(i32 [[#%d,SALTY_HASH:]]) ]
+// CHECK-LABEL: @call_salt_ty
+// CHECK: call{{.*}} i32 %{{.}}(){{.*}} [ "kcfi"(i32 [[#SALTY_HASH]]) ]
+int call(fn_t f) { return f(); }
+int call_salt(fn_t __cfi_salt f) { return f(); }
+int call_salt_ty(fn_salt_t f) { return f(); }
+int call_salt_empty_ty(fn_salt_empty_t f) { return f(); }
+
+// CHECK-LABEL: @ucall
+// CHECK: call{{.*}} i32 %{{.}}(){{.*}} [ "kcfi"(i32 [[#%d,LOW_SODIUM_UHASH:]]) ]
+// CHECK-LABEL: @ucall_salt
+// CHECK: call{{.*}} i32 %{{.}}(){{.*}} [ "kcfi"(i32 [[#%d,SALTY_UHASH:]]) ]
+// CHECK-LABEL: @ucall_salt_ty
+// CHECK: call{{.*}} i32 %{{.}}(){{.*}} [ "kcfi"(i32 [[#SALTY_UHASH]]) ]
+unsigned int ucall(ufn_t f) { return f(); }
+unsigned int ucall_salt(ufn_t __cfi_salt f) { return f(); }
+unsigned int ucall_salt_ty(ufn_salt_t f) { return f(); }
+
+int test1(struct cfi_struct *ptr) {
+ return call(f1) +
+ call_salt(f1_salt) +
+ call_salt_ty(f1_salt) +
+
+ __call((fn_t)f2) +
+ __call((fn_t)f2_salt) +
+
+ ucall(f2) +
+ ucall_salt(f2_salt) +
+ ucall_salt_ty(f2_salt) +
+
+ call(f3) +
+ call_salt(f3_salt) +
+ call_salt_ty(f3_salt) +
+
+ call(f4) +
+ call_salt(f4_salt) +
+ call_salt_ty(f4_salt) +
+
+ f5() +
+ f5_salt() +
+
+ f6() +
+ f6_salt() +
+
+ f7_salt(ptr) +
+ f7_typedef_salt(ptr) +
+
+ f8() +
+ f8_salt_empty();
+}
+
+// CHECK-LABEL: define dso_local{{.*}} i32 @f1(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#LOW_SODIUM_TYPE:]]
+// CHECK-LABEL: define dso_local{{.*}} i32 @f1_salt(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#SALTY_TYPE:]]
+int f1(void) { return 0; }
+int f1_salt(void) __cfi_salt { return 0; }
+
+// CHECK-LABEL: define dso_local{{.*}} i32 @f2(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#LOW_SODIUM_UTYPE:]]
+// CHECK: define dso_local{{.*}} i32 @f2_salt(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#SALTY_UTYPE:]]
+unsigned int f2(void) { return 2; }
+unsigned int f2_salt(void) __cfi_salt { return 2; }
+
+// CHECK-LABEL: define internal{{.*}} i32 @f3(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#LOW_SODIUM_TYPE]]
+// CHECK-LABEL: define internal{{.*}} i32 @f3_salt(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#SALTY_TYPE]]
+static int f3(void) { return 1; }
+static int f3_salt(void) __cfi_salt { return 1; }
+
+// CHECK: declare !kcfi_type ![[#LOW_SODIUM_TYPE]]{{.*}} i32 @[[F4]]()
+// CHECK: declare !kcfi_type ![[#SALTY_TYPE]]{{.*}} i32 @[[F4_SALT]]()
+
+/// Must not emit !kcfi_type for non-address-taken local functions
+// CHECK-LABEL: define internal{{.*}} i32 @f5()
+// CHECK-NOT: !kcfi_type
+// CHECK-SAME: {
+// CHECK-LABEL: define internal{{.*}} i32 @f5_salt()
+// CHECK-NOT: !kcfi_type
+// CHECK-SAME: {
+static int f5(void) { return 2; }
+static int f5_salt(void) __cfi_salt { return 2; }
+
+// CHECK: declare !kcfi_type ![[#LOW_SODIUM_TYPE]]{{.*}} i32 @f6()
+// CHECK: declare !kcfi_type ![[#SALTY_TYPE]]{{.*}} i32 @f6_salt()
+
+// CHECK-LABEL: @f7_salt
+// CHECK: call{{.*}} i32 %{{.*}}() [ "kcfi"(i32 [[#SALTY_HASH]]) ]
+// CHECK-LABEL: @f7_typedef_salt
+// CHECK: call{{.*}} i32 %{{.*}}() [ "kcfi"(i32 [[#SALTY_HASH]]) ]
+int f7_salt(struct cfi_struct *ptr) { return ptr->fptr(); }
+int f7_typedef_salt(struct cfi_struct *ptr) { return ptr->td_fptr(); }
+
+// CHECK-LABEL: define dso_local{{.*}} i32 @f8(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#LOW_SODIUM_TYPE:]]
+// CHECK-LABEL: define dso_local{{.*}} i32 @f8_salt_empty(){{.*}} !kcfi_type
+// CHECK-SAME: ![[#LOW_SODIUM_TYPE:]]
+int f8(void) { return 0; }
+int f8_salt_empty(void) __cfi_salt_empty { return 0; }
+
+// CHECK: ![[#]] = !{i32 4, !"kcfi", i32 1}
+// OFFSET: ![[#]] = !{i32 4, !"kcfi-offset", i32 3}
+//
+// CHECK: ![[#LOW_SODIUM_TYPE]] = !{i32 [[#LOW_SODIUM_HASH]]}
+// CHECK: ![[#SALTY_TYPE]] = !{i32 [[#SALTY_HASH]]}
+//
+// CHECK: ![[#LOW_SODIUM_UTYPE]] = !{i32 [[#LOW_SODIUM_UHASH]]}
+// CHECK: ![[#SALTY_UTYPE]] = !{i32 [[#SALTY_UHASH]]}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 05693538252aa..b9cf7cf9462fe 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -31,6 +31,7 @@
// CHECK-NEXT: CFConsumed (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CFGuard (SubjectMatchRule_function)
// CHECK-NEXT: CFICanonicalJumpTable (SubjectMatchRule_function)
+// CHECK-NEXT: CFISalt (SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
diff --git a/clang/test/Sema/attr-cfi-salt.c b/clang/test/Sema/attr-cfi-salt.c
new file mode 100644
index 0000000000000..687f54dc499d8
--- /dev/null
+++ b/clang/test/Sema/attr-cfi-salt.c
@@ -0,0 +1,60 @@
+// RUN: %clang_cc1 -fsyntax-only -fsanitize=kcfi -verify %s
+// RUN: %clang_cc1 -std=c89 -DKNR -fsyntax-only -fsanitize=kcfi -verify %s
+
+#define __cfi_salt(S) __attribute__((cfi_salt(S)))
+
+int bad1(void) __cfi_salt(); // expected-error{{'cfi_salt' attribute takes one argument}}
+int bad2(void) __cfi_salt(42); // expected-error{{expected string literal as argument of 'cfi_salt' attribute}}
+int bad3(void) __attribute__((cfi_salt("a", "b", "c"))); // expected-error{{'cfi_salt' attribute takes one argument}}
+
+
+int foo(int a, int b) __cfi_salt("pepper"); // ok
+int foo(int a, int b) __cfi_salt("pepper"); // ok
+
+#ifndef KNR
+typedef int (*bar_t)(void) __cfi_salt("pepper"); // ok
+typedef int (*bar_t)(void) __cfi_salt("pepper"); // ok
+#endif
+
+// FIXME: Should we allow this?
+// int b(void) __cfi_salt("salt 'n") __cfi_salt("pepper");
+// bar_t bar_fn __cfi_salt("salt 'n");
+
+int baz __cfi_salt("salt"); // expected-warning{{'cfi_salt' only applies to function types}}
+
+int baz_fn(int a, int b) __cfi_salt("salt 'n"); // expected-note{{previous declaration is here}}
+int baz_fn(int a, int b) __cfi_salt("pepper"); // expected-error{{conflicting types for 'baz_fn'}}
+
+int mux_fn(int a, int b) __cfi_salt("salt 'n"); // expected-note{{previous declaration is here}}
+int mux_fn(int a, int b) __cfi_salt("pepper") { // expected-error{{conflicting types for 'mux_fn'}}
+ return a * b;
+}
+
+typedef int qux_t __cfi_salt("salt"); // expected-warning{{'cfi_salt' only applies to function types}}
+
+typedef int (*quux_t)(void) __cfi_salt("salt 'n"); // expected-note{{previous definition is here}}
+typedef int (*quux_t)(void) __cfi_salt("pepper"); // expected-error{{typedef redefinition with different type}}
+
+void func1(int a) __cfi_salt("pepper"); // expected-note{{previous declaration is here}}
+void func1(int a) { } // expected-error{{conflicting types for 'func1'}}
+void (*fp1)(int) = func1; // expected-error{{incompatible function pointer types initializing 'void (*)(int)' with an expression of type 'void (int)'}}
+
+void func2(int) [[clang::cfi_salt("test")]]; // expected-note{{previous declaration is here}}
+void func2(int a) { } // expected-error{{conflicting types for 'func2'}}
+void (*fp2)(int) = func2; // expected-error{{incompatible function pointer types initializing 'void (*)(int)' with an expression of type 'void (int)'}}
+
+void func3(int) __cfi_salt("pepper"); // ok
+void func3(int a) __cfi_salt("pepper") { } // ok
+void (* __cfi_salt("pepper") fp3)(int) = func3; // ok
+void (*fp3_noattr)(int) = func3; // expected-error{{incompatible function pointer types initializing 'void (*)(int)' with an expression of type 'void (int)'}}
+
+void func4(int) [[clang::cfi_salt("test")]]; // ok
+void func4(int a) [[clang::cfi_salt("test")]] { } // ok
+void (* [[clang::cfi_salt("test")]] fp4)(int) = func4; // ok
+void (*fp4_noattr)(int) = func4; // expected-error{{incompatible function pointer types initializing 'void (*)(int)' with an expression of type 'void (int)'}}
+
+#ifdef KNR
+// K&R C function without a prototype
+void func() __attribute__((cfi_salt("pepper"))); // expected-error {{attribute only applies to non-K&R-style functions}}
+void (*fp)() __attribute__((cfi_salt("pepper"))); // expected-error {{attribute only applies to non-K&R-style functions}}
+#endif
>From 43fa7acc335040b96d39c5ef5e4b6e101c3e11fc Mon Sep 17 00:00:00 2001
From: Erich Keane <ekeane at nvidia.com>
Date: Thu, 14 Aug 2025 13:07:59 -0700
Subject: [PATCH 14/53] [OpenACC] Add firstprivate recipe helper methods to ACC
dialect (#153604)
Like we did for the 'private' clause, this adds an easier-to-use helper
function to add the 'firstprivate' clause + recipe to the Parallel and
Serial ops.
---
.../mlir/Dialect/OpenACC/OpenACCOps.td | 8 +++++
mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 29 +++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 18d5f2db12e9c..47646b3b8fec9 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -1536,6 +1536,10 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
/// Adds a private clause variable to this operation, including its recipe.
void addPrivatization(MLIRContext *, mlir::acc::PrivateOp op,
mlir::acc::PrivateRecipeOp recipe);
+ /// Adds a firstprivate clause variable to this operation, including its
+ /// recipe.
+ void addFirstPrivatization(MLIRContext *, mlir::acc::FirstprivateOp op,
+ mlir::acc::FirstprivateRecipeOp recipe);
}];
let assemblyFormat = [{
@@ -1681,6 +1685,10 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
/// Adds a private clause variable to this operation, including its recipe.
void addPrivatization(MLIRContext *, mlir::acc::PrivateOp op,
mlir::acc::PrivateRecipeOp recipe);
+ /// Adds a firstprivate clause variable to this operation, including its
+ /// recipe.
+ void addFirstPrivatization(MLIRContext *, mlir::acc::FirstprivateOp op,
+ mlir::acc::FirstprivateRecipeOp recipe);
}];
let assemblyFormat = [{
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 485bb73672398..d7c8916f43a2c 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -1390,6 +1390,20 @@ void acc::ParallelOp::addPrivatization(MLIRContext *context,
setPrivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
}
+void acc::ParallelOp::addFirstPrivatization(
+ MLIRContext *context, mlir::acc::FirstprivateOp op,
+ mlir::acc::FirstprivateRecipeOp recipe) {
+ getFirstprivateOperandsMutable().append(op.getResult());
+
+ llvm::SmallVector<mlir::Attribute> recipes;
+
+ if (getFirstprivatizationRecipesAttr())
+ llvm::copy(getFirstprivatizationRecipesAttr(), std::back_inserter(recipes));
+
+ recipes.push_back(
+ mlir::SymbolRefAttr::get(context, recipe.getSymName().str()));
+ setFirstprivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
+}
static ParseResult parseNumGangs(
mlir::OpAsmParser &parser,
llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands,
@@ -2041,6 +2055,21 @@ void acc::SerialOp::addPrivatization(MLIRContext *context,
setPrivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
}
+void acc::SerialOp::addFirstPrivatization(
+ MLIRContext *context, mlir::acc::FirstprivateOp op,
+ mlir::acc::FirstprivateRecipeOp recipe) {
+ getFirstprivateOperandsMutable().append(op.getResult());
+
+ llvm::SmallVector<mlir::Attribute> recipes;
+
+ if (getFirstprivatizationRecipesAttr())
+ llvm::copy(getFirstprivatizationRecipesAttr(), std::back_inserter(recipes));
+
+ recipes.push_back(
+ mlir::SymbolRefAttr::get(context, recipe.getSymName().str()));
+ setFirstprivatizationRecipesAttr(mlir::ArrayAttr::get(context, recipes));
+}
+
//===----------------------------------------------------------------------===//
// KernelsOp
//===----------------------------------------------------------------------===//
>From b73a794f60056261125fc21f9cffd90a75ded3d2 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 13:19:38 -0700
Subject: [PATCH 15/53] [AMDGPU] Encode NV bit in VIMAGE/VSAMPLE. NFC (#153654)
This is NFC as this target does not have it.
---
llvm/lib/Target/AMDGPU/SIInstrFormats.td | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 89d9b0d32b25b..50964a94d6e58 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -473,6 +473,7 @@ class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 {
let Inst{4} = r128;
let Inst{5} = d16;
let Inst{6} = a16;
+ let Inst{7} = cpol{5}; // nv
let Inst{21-14} = op;
let Inst{25-22} = dmask;
let Inst{39-32} = vdata;
>From ab635a416d1380181a61aa3ab4c900e3e7796866 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 14 Aug 2025 21:21:03 +0100
Subject: [PATCH 16/53] [LV] Regenerate some more tests.
---
.../X86/consecutive-ptr-uniforms.ll | 187 ++++++++++++++----
.../LoopVectorize/dbg-outer-loop-vect.ll | 128 ++++++------
llvm/test/Transforms/LoopVectorize/pr45525.ll | 31 ++-
.../LoopVectorize/runtime-check-readonly.ll | 44 ++---
4 files changed, 255 insertions(+), 135 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e26221eeefd2d..ab8eae6823963 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 5
; REQUIRES: asserts
; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-vectorize,instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE
@@ -18,32 +19,119 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, ptr %d, i64 0, i32 0, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
-; CHECK: define void @PR31671(
-; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> poison, float %x, i64 0
+
+
+%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
+
+define void @PR31671(float %x, ptr %d) #0 {
+; CHECK-LABEL: define void @PR31671(
+; CHECK-SAME: float [[X:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> poison, float [[X]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: br label %vector.body
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, ptr %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[DATA:%.*]], ptr [[D]], i64 0, i32 3, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
-; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, ptr %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x ptr> [[TMP3]], i64 0
-; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x ptr> [[TMP2]], i64 0
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP5]], <16 x ptr> [[TMP3]], i32 4, <16 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP1]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP4]], <16 x ptr> [[TMP2]], i32 4, <16 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 80)
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
-
-%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
-
-define void @PR31671(float %x, ptr %d) #0 {
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6384
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+; FORCE-LABEL: define void @PR31671(
+; FORCE-SAME: float [[X:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; FORCE-NEXT: [[ENTRY:.*:]]
+; FORCE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; FORCE: [[VECTOR_PH]]:
+; FORCE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[X]], i64 0
+; FORCE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; FORCE-NEXT: br label %[[VECTOR_BODY:.*]]
+; FORCE: [[VECTOR_BODY]]:
+; FORCE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; FORCE-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; FORCE-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; FORCE-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; FORCE-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 10
+; FORCE-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 15
+; FORCE-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 20
+; FORCE-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 25
+; FORCE-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 30
+; FORCE-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 35
+; FORCE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[DATA:%.*]], ptr [[D]], i64 0, i32 3, i64 [[TMP0]]
+; FORCE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 3, i64 [[TMP2]]
+; FORCE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 3, i64 [[TMP4]]
+; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 3, i64 [[TMP6]]
+; FORCE-NEXT: [[WIDE_VEC:%.*]] = load <10 x float>, ptr [[TMP8]], align 4
+; FORCE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <10 x float> [[WIDE_VEC]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[WIDE_VEC1:%.*]] = load <10 x float>, ptr [[TMP9]], align 4
+; FORCE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <10 x float> [[WIDE_VEC1]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[WIDE_VEC3:%.*]] = load <10 x float>, ptr [[TMP10]], align 4
+; FORCE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <10 x float> [[WIDE_VEC3]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[WIDE_VEC5:%.*]] = load <10 x float>, ptr [[TMP11]], align 4
+; FORCE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <10 x float> [[WIDE_VEC5]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[TMP12:%.*]] = fmul <2 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; FORCE-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]]
+; FORCE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC4]]
+; FORCE-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC6]]
+; FORCE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP0]]
+; FORCE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP1]]
+; FORCE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP2]]
+; FORCE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP3]]
+; FORCE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP4]]
+; FORCE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP5]]
+; FORCE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP6]]
+; FORCE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[DATA]], ptr [[D]], i64 0, i32 0, i64 [[TMP7]]
+; FORCE-NEXT: [[WIDE_VEC7:%.*]] = load <10 x float>, ptr [[TMP16]], align 4
+; FORCE-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <10 x float> [[WIDE_VEC7]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[WIDE_VEC9:%.*]] = load <10 x float>, ptr [[TMP18]], align 4
+; FORCE-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <10 x float> [[WIDE_VEC9]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[WIDE_VEC11:%.*]] = load <10 x float>, ptr [[TMP20]], align 4
+; FORCE-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <10 x float> [[WIDE_VEC11]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[WIDE_VEC13:%.*]] = load <10 x float>, ptr [[TMP22]], align 4
+; FORCE-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <10 x float> [[WIDE_VEC13]], <10 x float> poison, <2 x i32> <i32 0, i32 5>
+; FORCE-NEXT: [[TMP24:%.*]] = fadd <2 x float> [[STRIDED_VEC8]], [[TMP12]]
+; FORCE-NEXT: [[TMP25:%.*]] = fadd <2 x float> [[STRIDED_VEC10]], [[TMP13]]
+; FORCE-NEXT: [[TMP26:%.*]] = fadd <2 x float> [[STRIDED_VEC12]], [[TMP14]]
+; FORCE-NEXT: [[TMP27:%.*]] = fadd <2 x float> [[STRIDED_VEC14]], [[TMP15]]
+; FORCE-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
+; FORCE-NEXT: store float [[TMP28]], ptr [[TMP16]], align 4
+; FORCE-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
+; FORCE-NEXT: store float [[TMP29]], ptr [[TMP17]], align 4
+; FORCE-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP25]], i32 0
+; FORCE-NEXT: store float [[TMP30]], ptr [[TMP18]], align 4
+; FORCE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[TMP25]], i32 1
+; FORCE-NEXT: store float [[TMP31]], ptr [[TMP19]], align 4
+; FORCE-NEXT: [[TMP32:%.*]] = extractelement <2 x float> [[TMP26]], i32 0
+; FORCE-NEXT: store float [[TMP32]], ptr [[TMP20]], align 4
+; FORCE-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[TMP26]], i32 1
+; FORCE-NEXT: store float [[TMP33]], ptr [[TMP21]], align 4
+; FORCE-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP27]], i32 0
+; FORCE-NEXT: store float [[TMP34]], ptr [[TMP22]], align 4
+; FORCE-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[TMP27]], i32 1
+; FORCE-NEXT: store float [[TMP35]], ptr [[TMP23]], align 4
+; FORCE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FORCE-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6392
+; FORCE-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FORCE: [[MIDDLE_BLOCK]]:
+; FORCE-NEXT: br label %[[SCALAR_PH]]
+; FORCE: [[SCALAR_PH]]:
+;
entry:
br label %for.body
@@ -79,39 +167,56 @@ attributes #0 = { "target-cpu"="knl" }
; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1
; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}}
;
-; FORCE-LABEL: @PR40816(
-; FORCE-NEXT: entry:
-; FORCE-NEXT: br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
-; FORCE: vector.ph:
-; FORCE-NEXT: br label [[VECTOR_BODY:%.*]]
-; FORCE: vector.body:
-; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
+;
+@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
+@b = external global i32, align 1
+
+define void @PR40816() #1 {
+; CHECK-LABEL: define void @PR40816(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: store i32 [[TMP0]], ptr @b, align 1
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 2
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[CMP2]], label %[[RETURN:.*]], label %[[FOR_BODY]]
+; CHECK: [[RETURN]]:
+; CHECK-NEXT: ret void
+;
+; FORCE-LABEL: define void @PR40816(
+; FORCE-SAME: ) #[[ATTR1:[0-9]+]] {
+; FORCE-NEXT: [[ENTRY:.*:]]
+; FORCE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; FORCE: [[VECTOR_PH]]:
+; FORCE-NEXT: br label %[[VECTOR_BODY:.*]]
+; FORCE: [[VECTOR_BODY]]:
+; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ]
+; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE4]] ]
; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i8> [[VEC_IND]], splat (i8 2)
; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; FORCE: pred.store.if:
+; FORCE-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; FORCE: [[PRED_STORE_IF]]:
; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; FORCE-NEXT: store i32 [[TMP0]], ptr @b, align 1
-; FORCE-NEXT: br label [[PRED_STORE_CONTINUE]]
-; FORCE: pred.store.continue:
+; FORCE-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; FORCE: [[PRED_STORE_CONTINUE]]:
; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
-; FORCE: pred.store.if1:
+; FORCE-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE4]]
+; FORCE: [[PRED_STORE_IF1]]:
; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
; FORCE-NEXT: store i32 [[TMP1]], ptr @b, align 1
-; FORCE-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; FORCE: pred.store.continue2:
+; FORCE-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; FORCE: [[PRED_STORE_CONTINUE4]]:
; FORCE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
-; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
+; FORCE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; FORCE: [[MIDDLE_BLOCK]]:
+; FORCE-NEXT: br [[RETURN:label %.*]]
+; FORCE: [[SCALAR_PH]]:
;
-@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
-@b = external global i32, align 1
-
-define void @PR40816() #1 {
-
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
index 53de2523826fb..aa1b6cee09879 100644
--- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -passes=loop-vectorize -enable-vplan-native-path -force-vector-interleave=1 -force-vector-width=4 | FileCheck %s
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
@@ -6,60 +6,60 @@ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
define void @foo(ptr %h) !dbg !4 {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: ptr [[H:%.*]]) !dbg [[DBG4:![0-9]+]] {
-; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: #dbg_value(i64 0, [[META11:![0-9]+]], !DIExpression(), [[META20:![0-9]+]])
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG21:![0-9]+]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG21]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND_CLEANUP32:%.*]] ], !dbg [[DBG222:![0-9]+]]
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER1:%.*]]
-; CHECK: for.cond5.preheader1:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR_COND5_PREHEADER1]] ], !dbg [[DBG22:![0-9]+]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !dbg [[DBG21:![0-9]+]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG21]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ], !dbg [[DBG22:![0-9]+]]
+; CHECK-NEXT: br label %[[FOR_COND5_PREHEADER1:.*]]
+; CHECK: [[FOR_COND5_PREHEADER1]]:
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP5:%.*]], %[[FOR_COND5_PREHEADER1]] ], !dbg [[DBG23:![0-9]+]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[H]], <4 x i64> [[VEC_PHI]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> zeroinitializer, <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 1, !dbg [[DBG25:![0-9]+]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 1), <4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 2, !dbg [[DBG25]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 2), <4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 3, !dbg [[DBG25]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 3), <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23]]
-; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1), !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 5), !dbg [[DBG27:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0, !dbg [[DBG28:![0-9]+]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]], !dbg [[DBG28]]
-; CHECK: vector.latch:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG222]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20, !dbg [[DBG21]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG21]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG21]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]], !dbg [[DBG21]]
-; CHECK: for.cond1.preheader:
-; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC13:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> zeroinitializer, <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 1, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 1), <4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 2, !dbg [[DBG26]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 2), <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 3, !dbg [[DBG26]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 3), <4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24]]
+; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1), !dbg [[DBG27:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], splat (i64 5), !dbg [[DBG28:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0, !dbg [[DBG29:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[VECTOR_LATCH]], label %[[FOR_COND5_PREHEADER1]], !dbg [[DBG29]]
+; CHECK: [[VECTOR_LATCH]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG22]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20, !dbg [[DBG21]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG21]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG21]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]], !dbg [[DBG21]]
+; CHECK: [[FOR_COND1_PREHEADER]]:
+; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC13:%.*]], %[[FOR_COND_CLEANUP3:.*]] ]
; CHECK-NEXT: #dbg_value(i64 [[I_023]], [[META11]], !DIExpression(), [[META20]])
-; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG28]]
-; CHECK: for.cond5.preheader:
-; CHECK-NEXT: [[L_022:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INC10:%.*]], [[FOR_COND5_PREHEADER]] ], !dbg [[DBG22]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[H]], i64 [[L_022]]
-; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !dbg [[DBG23]]
-; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr i32, ptr [[TMP10]], i64 1, !dbg [[DBG25]]
-; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX_1]], align 4, !dbg [[DBG23]]
-; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr i32, ptr [[TMP10]], i64 2, !dbg [[DBG25]]
-; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX_2]], align 4, !dbg [[DBG23]]
-; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr i32, ptr [[TMP10]], i64 3, !dbg [[DBG25]]
-; CHECK-NEXT: store i32 3, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG23]]
-; CHECK-NEXT: [[INC10]] = add nuw nsw i64 [[L_022]], 1, !dbg [[DBG26]]
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC10]], 5, !dbg [[DBG27]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG28]]
-; CHECK: for.cond.cleanup3:
-; CHECK-NEXT: [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG33:![0-9]+]]
+; CHECK-NEXT: br label %[[FOR_COND5_PREHEADER:.*]], !dbg [[DBG29]]
+; CHECK: [[FOR_COND5_PREHEADER]]:
+; CHECK-NEXT: [[L_022:%.*]] = phi i64 [ 0, %[[FOR_COND1_PREHEADER]] ], [ [[INC10:%.*]], %[[FOR_COND5_PREHEADER]] ], !dbg [[DBG23]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[H]], i64 [[L_022]]
+; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4, !dbg [[DBG24]]
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr i32, ptr [[TMP9]], i64 1, !dbg [[DBG26]]
+; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX_1]], align 4, !dbg [[DBG24]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr i32, ptr [[TMP9]], i64 2, !dbg [[DBG26]]
+; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX_2]], align 4, !dbg [[DBG24]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr i32, ptr [[TMP9]], i64 3, !dbg [[DBG26]]
+; CHECK-NEXT: store i32 3, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG24]]
+; CHECK-NEXT: [[INC10]] = add nuw nsw i64 [[L_022]], 1, !dbg [[DBG27]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC10]], 5, !dbg [[DBG28]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP3]], label %[[FOR_COND5_PREHEADER]], !dbg [[DBG29]]
+; CHECK: [[FOR_COND_CLEANUP3]]:
+; CHECK-NEXT: [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG22]]
; CHECK-NEXT: #dbg_value(i64 [[INC13]], [[META11]], !DIExpression(), [[META20]])
; CHECK-NEXT: [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG34:![0-9]+]]
-; CHECK-NEXT: br i1 [[EXITCOND24_NOT]], label [[EXIT]], label [[FOR_COND1_PREHEADER]], !dbg [[DBG21]], !llvm.loop [[LOOP35:![0-9]+]]
-; CHECK: exit:
+; CHECK-NEXT: br i1 [[EXITCOND24_NOT]], label %[[EXIT]], label %[[FOR_COND1_PREHEADER]], !dbg [[DBG21]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void, !dbg [[DBG36:![0-9]+]]
;
entry:
@@ -137,7 +137,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
!34 = !DILocation(line: 10, column: 5, scope: !12)
;.
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
-; CHECK: [[META1]] = !DIFile(filename: "outer-loop-vect.c", directory: {{.*}})
+; CHECK: [[META1]] = !DIFile(filename: "{{.*}}outer-loop-vect.c", directory: {{.*}})
; CHECK: [[DBG4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 8, type: [[META5:![0-9]+]], scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META9:![0-9]+]])
; CHECK: [[META5]] = !DISubroutineType(types: [[META6:![0-9]+]])
; CHECK: [[META6]] = !{null, [[META7:![0-9]+]]}
@@ -156,19 +156,19 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
; CHECK: [[META19]] = distinct !DILexicalBlock(scope: [[META15]], file: [[META1]], line: 11, column: 5)
; CHECK: [[META20]] = !DILocation(line: 0, scope: [[META12]])
; CHECK: [[DBG21]] = !DILocation(line: 10, column: 3, scope: [[META12]])
-; CHECK: [[DBG222]] = !DILocation(line: 10, column: 30, scope: [[META16]])
-; CHECK: [[DBG22]] = !DILocation(line: 10, column: 5, scope: [[META12]])
-; CHECK: [[DBG23]] = !DILocation(line: 13, column: 11, scope: [[META24:![0-9]+]])
-; CHECK: [[META24]] = distinct !DILexicalBlock(scope: [[META18]], file: [[META1]], line: 12, column: 7)
-; CHECK: [[DBG25]] = !DILocation(line: 13, column: 2, scope: [[META24]])
-; CHECK: [[DBG26]] = !DILocation(line: 11, column: 32, scope: [[META19]])
-; CHECK: [[DBG27]] = !DILocation(line: 11, column: 26, scope: [[META19]])
-; CHECK: [[DBG28]] = !DILocation(line: 11, column: 5, scope: [[META15]])
-; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[DBG21]], [[META30:![0-9]+]], [[META31:![0-9]+]], [[META32:![0-9]+]]}
-; CHECK: [[META30]] = !DILocation(line: 13, column: 13, scope: [[META12]])
-; CHECK: [[META31]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META32]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[DBG22]] = !DILocation(line: 10, column: 30, scope: [[META16]])
+; CHECK: [[DBG23]] = !DILocation(line: 10, column: 5, scope: [[META12]])
+; CHECK: [[DBG24]] = !DILocation(line: 13, column: 11, scope: [[META25:![0-9]+]])
+; CHECK: [[META25]] = distinct !DILexicalBlock(scope: [[META18]], file: [[META1]], line: 12, column: 7)
+; CHECK: [[DBG26]] = !DILocation(line: 13, column: 2, scope: [[META25]])
+; CHECK: [[DBG27]] = !DILocation(line: 11, column: 32, scope: [[META19]])
+; CHECK: [[DBG28]] = !DILocation(line: 11, column: 26, scope: [[META19]])
+; CHECK: [[DBG29]] = !DILocation(line: 11, column: 5, scope: [[META15]])
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[DBG21]], [[META31:![0-9]+]], [[META32:![0-9]+]], [[META33:![0-9]+]]}
+; CHECK: [[META31]] = !DILocation(line: 13, column: 13, scope: [[META12]])
+; CHECK: [[META32]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META33]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[DBG34]] = !DILocation(line: 10, column: 24, scope: [[META16]])
-; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[DBG21]], [[META30]], [[META31]]}
+; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[DBG21]], [[META31]], [[META32]]}
; CHECK: [[DBG36]] = !DILocation(line: 14, column: 1, scope: [[DBG4]])
;.
diff --git a/llvm/test/Transforms/LoopVectorize/pr45525.ll b/llvm/test/Transforms/LoopVectorize/pr45525.ll
index 25a32d9e3e32e..5606a76d85cec 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45525.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45525.ll
@@ -1,16 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph\:" --version 5
; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s
; Test case for PR45525. Checks that phi's with a single predecessor and a mask are supported.
define void @main(i1 %cond, ptr %arr) {
-; CHECK-LABEL: @main(
-; CHECK-NEXT: bb.0:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK: [[TMP5:%.*]] = mul <4 x i32> [[VEC_IND]], splat (i32 3)
+; CHECK-LABEL: define void @main(
+; CHECK-SAME: i1 [[COND:%.*]], ptr [[ARR:%.*]]) {
+; CHECK-NEXT: [[BB_0:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[VEC_IND]], splat (i32 3)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> splat (i32 7), <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i32 [[INDEX]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[BB_4:label %.*]]
+; CHECK: [[SCALAR_PH]]:
;
bb.0:
br label %bb.1
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
index 26bc36df94103..e6dc2cfd4a120 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -1,42 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "vector.ph\:" --version 5
; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
define void @add_ints(ptr nocapture %A, ptr nocapture %B, ptr nocapture %C) {
-; CHECK-LABEL: @add_ints(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK-LABEL: vector.memcheck:
-; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
-; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64
-; CHECK-NEXT: [[C3:%.*]] = ptrtoint ptr [[C:%.*]] to i64
+; CHECK-LABEL: define void @add_ints(
+; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[C3]]
; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 16
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label %vector.body
-; CHECK: vector.body:
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], [[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
;
entry:
- br label %for.body
+ br label %loop
-for.body:
- %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, ptr %B, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
- %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %iv
store i32 %add, ptr %arrayidx4, align 4
- %indvars.iv.next = add i64 %indvars.iv, 1
- %lftr.wideiv = trunc i64 %indvars.iv.next to i32
- %exitcond = icmp eq i32 %lftr.wideiv, 200
- br i1 %exitcond, label %for.end, label %for.body
+ %iv.next = add i64 %iv, 1
+ %iv.trunc = trunc i64 %iv.next to i32
+ %exitcond = icmp eq i32 %iv.trunc, 200
+ br i1 %exitcond, label %exit, label %loop
-for.end:
+exit:
ret void
}
>From 30e74db5a8bda526e3645575d7dcc66931c83aa3 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 14 Aug 2025 15:28:02 -0500
Subject: [PATCH 17/53] [lldb] Use the Python limited API with SWIG 4.2 or
later (#153119) (#153472)
Use the Python limited API when building with SWIG 4.2 or later.
---
lldb/cmake/modules/LLDBConfig.cmake | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index a9679d63e010d..c65c7e24badd9 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -68,7 +68,6 @@ add_optional_dependency(LLDB_ENABLE_FBSDVMCORE "Enable libfbsdvmcore support in
option(LLDB_USE_ENTITLEMENTS "When codesigning, use entitlements if available" ON)
option(LLDB_BUILD_FRAMEWORK "Build LLDB.framework (Darwin only)" OFF)
option(LLDB_ENABLE_PROTOCOL_SERVERS "Enable protocol servers (e.g. MCP) in LLDB" ON)
-option(LLDB_ENABLE_PYTHON_LIMITED_API "Force LLDB to only use the Python Limited API (requires SWIG 4.2 or later)" OFF)
option(LLDB_NO_INSTALL_DEFAULT_RPATH "Disable default RPATH settings in binaries" OFF)
option(LLDB_USE_SYSTEM_DEBUGSERVER "Use the system's debugserver for testing (Darwin only)." OFF)
option(LLDB_SKIP_STRIP "Whether to skip stripping of binaries when installing lldb." OFF)
@@ -174,11 +173,20 @@ if (LLDB_ENABLE_PYTHON)
${default_embed_python_home})
include_directories(${Python3_INCLUDE_DIRS})
+
if (LLDB_EMBED_PYTHON_HOME)
get_filename_component(PYTHON_HOME "${Python3_EXECUTABLE}" DIRECTORY)
set(LLDB_PYTHON_HOME "${PYTHON_HOME}" CACHE STRING
"Path to use as PYTHONHOME in lldb. If a relative path is specified, it will be resolved at runtime relative to liblldb directory.")
endif()
+
+ if (SWIG_VERSION VERSION_GREATER_EQUAL "4.2" AND NOT LLDB_EMBED_PYTHON_HOME)
+ set(default_enable_python_limited_api ON)
+ else()
+ set(default_enable_python_limited_api OFF)
+ endif()
+ option(LLDB_ENABLE_PYTHON_LIMITED_API "Force LLDB to only use the Python Limited API (requires SWIG 4.2 or later)"
+ ${default_enable_python_limited_api})
endif()
if (LLVM_EXTERNAL_CLANG_SOURCE_DIR)
>From f10f84a98cfe74f56bde5c07cc68d3e6532f5c01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
=?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Thu, 14 Aug 2025 13:34:25 -0700
Subject: [PATCH 18/53] [flang][cuda] Add bind names for __double2ll_rX
interfaces (#153660)
---
flang/module/cudadevice.f90 | 14 +++++++-------
flang/test/Lower/CUDA/cuda-libdevice.cuf | 15 +++++++++++++++
2 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 432125097e9d2..1e403b1ffdb91 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -625,29 +625,29 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
- interface __double2ll_rn
- attributes(device) integer(8) function __double2ll_rn(r) bind(c)
+ interface __double2ll_rd
+ attributes(device) integer(8) function __double2ll_rd(r) bind(c, name='__nv_double2ll_rd')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
end interface
- interface __double2ll_rz
- attributes(device) integer(8) function __double2ll_rz(r) bind(c)
+ interface __double2ll_rn
+ attributes(device) integer(8) function __double2ll_rn(r) bind(c, name='__nv_double2ll_rn')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
end interface
interface __double2ll_ru
- attributes(device) integer(8) function __double2ll_ru(r) bind(c)
+ attributes(device) integer(8) function __double2ll_ru(r) bind(c, name='__nv_double2ll_ru')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
end interface
- interface __double2ll_rd
- attributes(device) integer(8) function __double2ll_rd(r) bind(c)
+ interface __double2ll_rz
+ attributes(device) integer(8) function __double2ll_rz(r) bind(c, name='__nv_double2ll_rz')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index 6b80fb2e74146..844bdb954924a 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -100,3 +100,18 @@ end subroutine
! CHECK-LABEL: _QPtest_exp
! CHECK: %{{.*}} = fir.call @__nv_expf(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> f32
! CHECK: %{{.*}} = fir.call @__nv_exp10f(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> f32
+
+attributes(global) subroutine test_double2ll_rX()
+ integer(8) :: res
+ double precision :: r
+ res = __double2ll_rd(r)
+ res = __double2ll_rn(r)
+ res = __double2ll_ru(r)
+ res = __double2ll_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_double2ll_rx
+! CHECK: %{{.*}} = fir.call @__nv_double2ll_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ll_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ll_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ll_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
>From d0648c2e80514abd6a597d41f261f7c521cac89c Mon Sep 17 00:00:00 2001
From: Zhaoxuan Jiang <jiangzhaoxuan94 at gmail.com>
Date: Fri, 15 Aug 2025 04:49:09 +0800
Subject: [PATCH 19/53] [CGData] Lazy loading support for stable function map
(#151660)
The stable function map can be huge for a large application. Fully
loading it is slow and consumes a significant amount of memory, much of
which is unnecessary; this drastically slows down compilation,
especially for non-LTO and distributed-ThinLTO setups. This patch
introduces opt-in lazy loading support for the stable function map. The
detailed changes are:
- `StableFunctionMap`
- The map now stores entries in an `EntryStorage` struct, which includes
offsets for serialized entries and a `std::once_flag` for thread-safe
lazy loading.
- The underlying map type is changed from `DenseMap` to
`std::unordered_map` for compatibility with `std::once_flag`.
- `contains()`, `size()`, and `at()` are implemented to load only the
requested entries on demand.
- Lazy Loading Mechanism
- When reading indexed codegen data, if the newly-introduced
`-indexed-codegen-data-lazy-loading` flag is set, the stable function
map is not fully deserialized up front. The binary format for the stable
function map now includes offsets and sizes to support lazy loading.
- The safety of lazy loading is guarded by the once flag per function
hash. This guarantees that even in a multi-threaded environment, the
deserialization for a given function hash will happen exactly once. The
first thread to request it performs the load, and subsequent threads
will wait for it to complete before using the data. For single-threaded
builds, the overhead is negligible (a single check on the once flag).
For multi-threaded scenarios, users can omit the flag to retain the
previous eager-loading behavior.
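The per-hash once-flag scheme described above can be sketched as follows. This is a minimal, self-contained illustration of the pattern (std::once_flag guarding per-key deserialization), not the actual LLVM implementation; the `LazyMap` type, the `int` payloads, and the `LoadCount` counter are placeholders invented for the example.

```cpp
#include <cassert>
#include <cstdint>
#include <mutex>
#include <unordered_map>
#include <vector>

// Sketch of per-key lazy deserialization guarded by std::once_flag.
// Each storage slot holds the offsets of its serialized entries plus a
// once_flag; the first access deserializes, later accesses reuse the result.
struct LazyMap {
  struct EntryStorage {
    std::vector<int> Entries;      // deserialized entries
    std::vector<uint64_t> Offsets; // offsets into the serialized buffer
    std::once_flag LazyLoadFlag;   // non-copyable, hence std::unordered_map
  };

  std::unordered_map<uint64_t, EntryStorage> HashToFuncs;
  std::vector<int> Buffer; // stands in for the serialized MemoryBuffer
  int LoadCount = 0;       // counts actual deserializations (for testing)

  const std::vector<int> &at(uint64_t Hash) {
    auto It = HashToFuncs.find(Hash);
    assert(It != HashToFuncs.end() && "unknown hash");
    // Exactly one thread runs the lambda; concurrent callers block until
    // it finishes, then all see the populated Entries.
    std::call_once(It->second.LazyLoadFlag, [&] {
      for (uint64_t Off : It->second.Offsets) // "deserialize" each entry
        It->second.Entries.push_back(Buffer[Off]);
      ++LoadCount;
    });
    return It->second.Entries;
  }
};
```

Repeated lookups of the same hash perform the load only once, which is the property the patch relies on for thread safety.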
---
llvm/include/llvm/CGData/CodeGenData.h | 3 +
llvm/include/llvm/CGData/CodeGenData.inc | 2 +-
llvm/include/llvm/CGData/StableFunctionMap.h | 65 +++++++-
.../llvm/CGData/StableFunctionMapRecord.h | 48 +++++-
llvm/lib/CGData/CodeGenData.cpp | 2 +-
llvm/lib/CGData/CodeGenDataReader.cpp | 17 +-
llvm/lib/CGData/StableFunctionMap.cpp | 70 +++++++--
llvm/lib/CGData/StableFunctionMapRecord.cpp | 147 +++++++++++++-----
llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 10 +-
.../ThinLTO/AArch64/cgdata-merge-write.ll | 4 +-
llvm/test/tools/llvm-cgdata/empty.test | 4 +-
llvm/test/tools/llvm-cgdata/error.test | 4 +-
.../merge-combined-funcmap-hashtree.test | 4 +-
.../llvm-cgdata/merge-funcmap-archive.test | 8 +-
.../llvm-cgdata/merge-funcmap-concat.test | 6 +-
.../llvm-cgdata/merge-funcmap-double.test | 7 +-
.../llvm-cgdata/merge-funcmap-single.test | 4 +-
llvm/tools/llvm-cgdata/Opts.td | 1 +
llvm/tools/llvm-cgdata/llvm-cgdata.cpp | 5 +
.../CGData/StableFunctionMapTest.cpp | 2 +-
20 files changed, 327 insertions(+), 86 deletions(-)
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 38b96b72ccac6..e44497a408245 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -285,6 +285,9 @@ enum CGDataVersion {
// Version 3 adds the total size of the Names in the stable function map so
// we can skip reading them into the memory for non-assertion builds.
Version3 = 3,
+ // Version 4 adjusts the structure of the stable function merging map to
+ // support efficient lazy loading.
+ Version4 = 4,
CurrentVersion = CG_DATA_INDEX_VERSION
};
const uint64_t Version = CGDataVersion::CurrentVersion;
diff --git a/llvm/include/llvm/CGData/CodeGenData.inc b/llvm/include/llvm/CGData/CodeGenData.inc
index 94de4c0b017a2..d5fbe2fb97718 100644
--- a/llvm/include/llvm/CGData/CodeGenData.inc
+++ b/llvm/include/llvm/CGData/CodeGenData.inc
@@ -49,4 +49,4 @@ CG_DATA_SECT_ENTRY(CG_merge, CG_DATA_QUOTE(CG_DATA_MERGE_COMMON),
#endif
/* Indexed codegen data format version (start from 1). */
-#define CG_DATA_INDEX_VERSION 3
+#define CG_DATA_INDEX_VERSION 4
diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h
index bcb72e8216973..ea3523c3a3299 100644
--- a/llvm/include/llvm/CGData/StableFunctionMap.h
+++ b/llvm/include/llvm/CGData/StableFunctionMap.h
@@ -20,6 +20,8 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/StructuralHash.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <mutex>
namespace llvm {
@@ -72,11 +74,37 @@ struct StableFunctionMap {
IndexOperandHashMap(std::move(IndexOperandHashMap)) {}
};
- using HashFuncsMapType =
- DenseMap<stable_hash, SmallVector<std::unique_ptr<StableFunctionEntry>>>;
+ using StableFunctionEntries =
+ SmallVector<std::unique_ptr<StableFunctionEntry>>;
+
+ /// In addition to the deserialized StableFunctionEntry, the struct stores
+ /// the offsets of corresponding serialized stable function entries, and a
+ /// once flag for safe lazy loading in a multithreaded environment.
+ struct EntryStorage {
+ /// The actual storage of deserialized stable function entries. If the map
+ /// is lazily loaded, this will be empty until the first access by the
+ /// corresponding function hash.
+ StableFunctionEntries Entries;
+
+ private:
+ /// This is used to deserialize the entry lazily. Each element is the
+ /// corresponding serialized stable function entry's offset in the memory
+ /// buffer (StableFunctionMap::Buffer).
+ /// The offsets are populated only when the map is loaded lazily; otherwise
+ /// the vector is empty.
+ SmallVector<uint64_t> Offsets;
+ std::once_flag LazyLoadFlag;
+ friend struct StableFunctionMap;
+ friend struct StableFunctionMapRecord;
+ };
+
+ // Note: DenseMap requires the value type to be copyable even when using
+ // only in-place insertion, so use the STL map instead. This also affects
+ // the deletion-while-iterating logic in finalize().
+ using HashFuncsMapType = std::unordered_map<stable_hash, EntryStorage>;
/// Get the HashToFuncs map for serialization.
- const HashFuncsMapType &getFunctionMap() const { return HashToFuncs; }
+ const HashFuncsMapType &getFunctionMap() const;
/// Get the NameToId vector for serialization.
ArrayRef<std::string> getNames() const { return IdToName; }
@@ -99,6 +127,19 @@ struct StableFunctionMap {
/// \returns true if there is no stable function entry.
bool empty() const { return size() == 0; }
+ /// \returns true if there is an entry for the given function hash.
+ /// This does not trigger lazy loading.
+ bool contains(HashFuncsMapType::key_type FunctionHash) const {
+ return HashToFuncs.count(FunctionHash) > 0;
+ }
+
+ /// \returns the stable function entries for the given function hash. If the
+ /// map is lazily loaded, the entries are deserialized on first access;
+ /// concurrent requests for the same hash block until the entries are
+ /// deserialized.
+ const StableFunctionEntries &
+ at(HashFuncsMapType::key_type FunctionHash) const;
+
enum SizeType {
UniqueHashCount, // The number of unique hashes in HashToFuncs.
TotalFunctionCount, // The number of total functions in HashToFuncs.
@@ -119,17 +160,31 @@ struct StableFunctionMap {
/// `StableFunctionEntry` is ready for insertion.
void insert(std::unique_ptr<StableFunctionEntry> FuncEntry) {
assert(!Finalized && "Cannot insert after finalization");
- HashToFuncs[FuncEntry->Hash].emplace_back(std::move(FuncEntry));
+ HashToFuncs[FuncEntry->Hash].Entries.emplace_back(std::move(FuncEntry));
}
+ void deserializeLazyLoadingEntry(HashFuncsMapType::iterator It) const;
+
+ /// Eagerly deserialize all the unloaded entries in the lazy loading map.
+ void deserializeLazyLoadingEntries() const;
+
+ bool isLazilyLoaded() const { return (bool)Buffer; }
+
/// A map from a stable_hash to a vector of functions with that hash.
- HashFuncsMapType HashToFuncs;
+ mutable HashFuncsMapType HashToFuncs;
/// A vector of strings to hold names.
SmallVector<std::string> IdToName;
/// A map from StringRef (name) to an ID.
StringMap<unsigned> NameToId;
/// True if the function map is finalized with minimal content.
bool Finalized = false;
+ /// The memory buffer that contains the serialized stable function map for
+ /// lazy loading.
+ /// Non-empty only if this StableFunctionMap is created from a MemoryBuffer
+ /// (i.e. by IndexedCodeGenDataReader::read()) and lazily deserialized.
+ std::shared_ptr<MemoryBuffer> Buffer;
+ /// Whether to read stable function names from the buffer.
+ bool ReadStableFunctionMapNames = true;
friend struct StableFunctionMapRecord;
};
diff --git a/llvm/include/llvm/CGData/StableFunctionMapRecord.h b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
index a75cb12a70ba6..2d8b573a3cb46 100644
--- a/llvm/include/llvm/CGData/StableFunctionMapRecord.h
+++ b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
@@ -24,6 +24,26 @@
namespace llvm {
+/// The structure of the serialized stable function map is as follows:
+/// - Number of unique function/module names
+/// - Total size of unique function/module names for opt-in skipping
+/// - Unique function/module names
+/// - Padding to align to 4 bytes
+/// - Number of StableFunctionEntries
+/// - Hashes of each StableFunctionEntry
+/// - Fixed-size fields for each StableFunctionEntry (the order is consistent
+/// with the hashes above):
+/// - FunctionNameId
+/// - ModuleNameId
+/// - InstCount
+/// - Relative offset to the beginning of IndexOperandHashes for this entry
+/// - Total size of variable-sized IndexOperandHashes for lazy-loading support
+/// - Variable-sized IndexOperandHashes for each StableFunctionEntry:
+/// - Number of IndexOperandHashes
+/// - Contents of each IndexOperandHashes
+/// - InstIndex
+/// - OpndIndex
+/// - OpndHash
struct StableFunctionMapRecord {
std::unique_ptr<StableFunctionMap> FunctionMap;
@@ -40,13 +60,25 @@ struct StableFunctionMapRecord {
const StableFunctionMap *FunctionMap,
std::vector<CGDataPatchItem> &PatchItems);
+ /// A static helper function to deserialize the stable function map entry.
+ /// Ptr must point to the start of the entry's fixed-size fields when
+ /// passed in.
+ LLVM_ABI static void deserializeEntry(const unsigned char *Ptr,
+ stable_hash Hash,
+ StableFunctionMap *FunctionMap);
+
/// Serialize the stable function map to a raw_ostream.
LLVM_ABI void serialize(raw_ostream &OS,
std::vector<CGDataPatchItem> &PatchItems) const;
/// Deserialize the stable function map from a raw_ostream.
- LLVM_ABI void deserialize(const unsigned char *&Ptr,
- bool ReadStableFunctionMapNames = true);
+ LLVM_ABI void deserialize(const unsigned char *&Ptr);
+
+ /// Lazily deserialize the stable function map from `Buffer` starting at
+ /// `Offset`. Individual stable function entries are read lazily from
+ /// `Buffer` when the function map is accessed.
+ LLVM_ABI void lazyDeserialize(std::shared_ptr<MemoryBuffer> Buffer,
+ uint64_t Offset);
/// Serialize the stable function map to a YAML stream.
LLVM_ABI void serializeYAML(yaml::Output &YOS) const;
@@ -70,6 +102,18 @@ struct StableFunctionMapRecord {
yaml::Output YOS(OS);
serializeYAML(YOS);
}
+
+ /// Set whether to read stable function names from the buffer.
+ /// Has no effect if the function map is read from a YAML stream.
+ void setReadStableFunctionMapNames(bool Read) {
+ assert(
+ FunctionMap->empty() &&
+ "Cannot change ReadStableFunctionMapNames after the map is populated");
+ FunctionMap->ReadStableFunctionMapNames = Read;
+ }
+
+private:
+ void deserialize(const unsigned char *&Ptr, bool Lazy);
};
} // namespace llvm
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index cd012342e1958..b4f08c3d13b0d 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -186,7 +186,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
return make_error<CGDataError>(cgdata_error::unsupported_version);
H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
- static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version3,
+ static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version4,
"Please update the offset computation below if a new field has "
"been added to the header.");
H.OutlinedHashTreeOffset =
diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index 0ab35499c8986..fc59be8df525a 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -26,6 +26,12 @@ static cl::opt<bool> IndexedCodeGenDataReadFunctionMapNames(
"disabled to save memory and time for final consumption of the "
"indexed CodeGenData in production."));
+cl::opt<bool> IndexedCodeGenDataLazyLoading(
+ "indexed-codegen-data-lazy-loading", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Lazily load indexed CodeGenData. Enable to save memory and time "
+ "for final consumption of the indexed CodeGenData in production."));
+
namespace llvm {
static Expected<std::unique_ptr<MemoryBuffer>>
@@ -109,11 +115,20 @@ Error IndexedCodeGenDataReader::read() {
return error(cgdata_error::eof);
HashTreeRecord.deserialize(Ptr);
}
+
+ // TODO: lazy loading support for outlined hash tree.
+ std::shared_ptr<MemoryBuffer> SharedDataBuffer = std::move(DataBuffer);
if (hasStableFunctionMap()) {
const unsigned char *Ptr = Start + Header.StableFunctionMapOffset;
if (Ptr >= End)
return error(cgdata_error::eof);
- FunctionMapRecord.deserialize(Ptr, IndexedCodeGenDataReadFunctionMapNames);
+ FunctionMapRecord.setReadStableFunctionMapNames(
+ IndexedCodeGenDataReadFunctionMapNames);
+ if (IndexedCodeGenDataLazyLoading)
+ FunctionMapRecord.lazyDeserialize(SharedDataBuffer,
+ Header.StableFunctionMapOffset);
+ else
+ FunctionMapRecord.deserialize(Ptr);
}
return success();
diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp
index 87f1e76afb60b..2f54fad0aa084 100644
--- a/llvm/lib/CGData/StableFunctionMap.cpp
+++ b/llvm/lib/CGData/StableFunctionMap.cpp
@@ -15,8 +15,10 @@
#include "llvm/CGData/StableFunctionMap.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CGData/StableFunctionMapRecord.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include <mutex>
#define DEBUG_TYPE "stable-function-map"
@@ -93,9 +95,10 @@ void StableFunctionMap::insert(const StableFunction &Func) {
void StableFunctionMap::merge(const StableFunctionMap &OtherMap) {
assert(!Finalized && "Cannot merge after finalization");
+ deserializeLazyLoadingEntries();
for (auto &[Hash, Funcs] : OtherMap.HashToFuncs) {
- auto &ThisFuncs = HashToFuncs[Hash];
- for (auto &Func : Funcs) {
+ auto &ThisFuncs = HashToFuncs[Hash].Entries;
+ for (auto &Func : Funcs.Entries) {
auto FuncNameId =
getIdOrCreateForName(*OtherMap.getNameForId(Func->FunctionNameId));
auto ModuleNameId =
@@ -114,25 +117,63 @@ size_t StableFunctionMap::size(SizeType Type) const {
case UniqueHashCount:
return HashToFuncs.size();
case TotalFunctionCount: {
+ deserializeLazyLoadingEntries();
size_t Count = 0;
for (auto &Funcs : HashToFuncs)
- Count += Funcs.second.size();
+ Count += Funcs.second.Entries.size();
return Count;
}
case MergeableFunctionCount: {
+ deserializeLazyLoadingEntries();
size_t Count = 0;
for (auto &[Hash, Funcs] : HashToFuncs)
- if (Funcs.size() >= 2)
- Count += Funcs.size();
+ if (Funcs.Entries.size() >= 2)
+ Count += Funcs.Entries.size();
return Count;
}
}
llvm_unreachable("Unhandled size type");
}
+const StableFunctionMap::StableFunctionEntries &
+StableFunctionMap::at(HashFuncsMapType::key_type FunctionHash) const {
+ auto It = HashToFuncs.find(FunctionHash);
+ if (isLazilyLoaded())
+ deserializeLazyLoadingEntry(It);
+ return It->second.Entries;
+}
+
+void StableFunctionMap::deserializeLazyLoadingEntry(
+ HashFuncsMapType::iterator It) const {
+ assert(isLazilyLoaded() && "Cannot deserialize non-lazily-loaded map");
+ auto &[Hash, Storage] = *It;
+ std::call_once(Storage.LazyLoadFlag,
+ [this, HashArg = Hash, &StorageArg = Storage]() {
+ for (auto Offset : StorageArg.Offsets)
+ StableFunctionMapRecord::deserializeEntry(
+ reinterpret_cast<const unsigned char *>(Offset),
+ HashArg, const_cast<StableFunctionMap *>(this));
+ });
+}
+
+void StableFunctionMap::deserializeLazyLoadingEntries() const {
+ if (!isLazilyLoaded())
+ return;
+ for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It)
+ deserializeLazyLoadingEntry(It);
+}
+
+const StableFunctionMap::HashFuncsMapType &
+StableFunctionMap::getFunctionMap() const {
+ // Ensure all entries are deserialized before returning the raw map.
+ if (isLazilyLoaded())
+ deserializeLazyLoadingEntries();
+ return HashToFuncs;
+}
+
using ParamLocs = SmallVector<IndexPair>;
-static void removeIdenticalIndexPair(
- SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>> &SFS) {
+static void
+removeIdenticalIndexPair(StableFunctionMap::StableFunctionEntries &SFS) {
auto &RSF = SFS[0];
unsigned StableFunctionCount = SFS.size();
@@ -159,9 +200,7 @@ static void removeIdenticalIndexPair(
SF->IndexOperandHashMap->erase(Pair);
}
-static bool isProfitable(
- const SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>>
- &SFS) {
+static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS) {
unsigned StableFunctionCount = SFS.size();
if (StableFunctionCount < GlobalMergingMinMerges)
return false;
@@ -202,8 +241,11 @@ static bool isProfitable(
}
void StableFunctionMap::finalize(bool SkipTrim) {
+ deserializeLazyLoadingEntries();
+ SmallVector<HashFuncsMapType::iterator> ToDelete;
for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It) {
- auto &[StableHash, SFS] = *It;
+ auto &[StableHash, Storage] = *It;
+ auto &SFS = Storage.Entries;
// Group stable functions by ModuleIdentifier.
llvm::stable_sort(SFS, [&](const std::unique_ptr<StableFunctionEntry> &L,
@@ -236,7 +278,7 @@ void StableFunctionMap::finalize(bool SkipTrim) {
}
}
if (Invalid) {
- HashToFuncs.erase(It);
+ ToDelete.push_back(It);
continue;
}
@@ -248,8 +290,10 @@ void StableFunctionMap::finalize(bool SkipTrim) {
removeIdenticalIndexPair(SFS);
if (!isProfitable(SFS))
- HashToFuncs.erase(It);
+ ToDelete.push_back(It);
}
+ for (auto It : ToDelete)
+ HashToFuncs.erase(It);
Finalized = true;
}
diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp
index 423e068023088..e585995ba6a31 100644
--- a/llvm/lib/CGData/StableFunctionMapRecord.cpp
+++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp
@@ -53,7 +53,7 @@ static SmallVector<const StableFunctionMap::StableFunctionEntry *>
getStableFunctionEntries(const StableFunctionMap &SFM) {
SmallVector<const StableFunctionMap::StableFunctionEntry *> FuncEntries;
for (const auto &P : SFM.getFunctionMap())
- for (auto &Func : P.second)
+ for (auto &Func : P.second.Entries)
FuncEntries.emplace_back(Func.get());
llvm::stable_sort(
@@ -107,14 +107,25 @@ void StableFunctionMapRecord::serialize(
// Write StableFunctionEntries whose pointers are sorted.
auto FuncEntries = getStableFunctionEntries(*FunctionMap);
Writer.write<uint32_t>(FuncEntries.size());
-
- for (const auto *FuncRef : FuncEntries) {
+ for (const auto *FuncRef : FuncEntries)
Writer.write<stable_hash>(FuncRef->Hash);
+ std::vector<uint64_t> IndexOperandHashesOffsets;
+ IndexOperandHashesOffsets.reserve(FuncEntries.size());
+ for (const auto *FuncRef : FuncEntries) {
Writer.write<uint32_t>(FuncRef->FunctionNameId);
Writer.write<uint32_t>(FuncRef->ModuleNameId);
Writer.write<uint32_t>(FuncRef->InstCount);
-
+ const uint64_t Offset = Writer.OS.tell();
+ IndexOperandHashesOffsets.push_back(Offset);
+ Writer.write<uint64_t>(0);
+ }
+ const uint64_t IndexOperandHashesByteSizeOffset = Writer.OS.tell();
+ Writer.write<uint64_t>(0);
+ for (size_t I = 0; I < FuncEntries.size(); ++I) {
+ const uint64_t Offset = Writer.OS.tell() - IndexOperandHashesOffsets[I];
+ PatchItems.emplace_back(IndexOperandHashesOffsets[I], &Offset, 1);
// Emit IndexOperandHashes sorted from IndexOperandHashMap.
+ const auto *FuncRef = FuncEntries[I];
IndexOperandHashVecType IndexOperandHashes =
getStableIndexOperandHashes(FuncRef);
Writer.write<uint32_t>(IndexOperandHashes.size());
@@ -124,10 +135,62 @@ void StableFunctionMapRecord::serialize(
Writer.write<stable_hash>(IndexOperandHash.second);
}
}
+ // Write the total size of IndexOperandHashes.
+ const uint64_t IndexOperandHashesByteSize =
+ Writer.OS.tell() - IndexOperandHashesByteSizeOffset - sizeof(uint64_t);
+ PatchItems.emplace_back(IndexOperandHashesByteSizeOffset,
+ &IndexOperandHashesByteSize, 1);
+}
+
+void StableFunctionMapRecord::deserializeEntry(const unsigned char *Ptr,
+ stable_hash Hash,
+ StableFunctionMap *FunctionMap) {
+ auto FunctionNameId =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ if (FunctionMap->ReadStableFunctionMapNames)
+ assert(FunctionMap->getNameForId(FunctionNameId) &&
+ "FunctionNameId out of range");
+ auto ModuleNameId =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ if (FunctionMap->ReadStableFunctionMapNames)
+ assert(FunctionMap->getNameForId(ModuleNameId) &&
+ "ModuleNameId out of range");
+ auto InstCount =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+
+ // Read IndexOperandHashes to build IndexOperandHashMap
+ auto CurrentPosition = reinterpret_cast<uintptr_t>(Ptr);
+ auto IndexOperandHashesOffset =
+ endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
+ auto *IndexOperandHashesPtr = reinterpret_cast<const unsigned char *>(
+ CurrentPosition + IndexOperandHashesOffset);
+ auto NumIndexOperandHashes =
+ endian::readNext<uint32_t, endianness::little, unaligned>(
+ IndexOperandHashesPtr);
+ auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+ for (unsigned J = 0; J < NumIndexOperandHashes; ++J) {
+ auto InstIndex = endian::readNext<uint32_t, endianness::little, unaligned>(
+ IndexOperandHashesPtr);
+ auto OpndIndex = endian::readNext<uint32_t, endianness::little, unaligned>(
+ IndexOperandHashesPtr);
+ auto OpndHash =
+ endian::readNext<stable_hash, endianness::little, unaligned>(
+ IndexOperandHashesPtr);
+ assert(InstIndex < InstCount && "InstIndex out of range");
+
+ IndexOperandHashMap->try_emplace({InstIndex, OpndIndex}, OpndHash);
+ }
+
+ // Insert a new StableFunctionEntry into the map.
+ auto FuncEntry = std::make_unique<StableFunctionMap::StableFunctionEntry>(
+ Hash, FunctionNameId, ModuleNameId, InstCount,
+ std::move(IndexOperandHashMap));
+
+ FunctionMap->insert(std::move(FuncEntry));
}
void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr,
- bool ReadStableFunctionMapNames) {
+ bool Lazy) {
// Assert that Ptr is 4-byte aligned
assert(((uintptr_t)Ptr % 4) == 0);
// Read Names.
@@ -139,7 +202,7 @@ void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr,
const auto NamesByteSize =
endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
const auto NamesOffset = reinterpret_cast<uintptr_t>(Ptr);
- if (ReadStableFunctionMapNames) {
+ if (FunctionMap->ReadStableFunctionMapNames) {
for (unsigned I = 0; I < NumNames; ++I) {
StringRef Name(reinterpret_cast<const char *>(Ptr));
Ptr += Name.size() + 1;
@@ -157,47 +220,51 @@ void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr,
// Read StableFunctionEntries.
auto NumFuncs =
endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ auto FixedSizeFieldsOffset =
+ reinterpret_cast<uintptr_t>(Ptr) + NumFuncs * sizeof(stable_hash);
+ constexpr uint32_t FixedSizeFieldsSizePerEntry =
+ // FunctionNameId
+ sizeof(uint32_t) +
+ // ModuleNameId
+ sizeof(uint32_t) +
+ // InstCount
+ sizeof(uint32_t) +
+ // Relative offset to IndexOperandHashes
+ sizeof(uint64_t);
for (unsigned I = 0; I < NumFuncs; ++I) {
auto Hash =
endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
- [[maybe_unused]] auto FunctionNameId =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- [[maybe_unused]] auto ModuleNameId =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- // Only validate IDs if we've read the names
- if (ReadStableFunctionMapNames) {
- assert(FunctionMap->getNameForId(FunctionNameId) &&
- "FunctionNameId out of range");
- assert(FunctionMap->getNameForId(ModuleNameId) &&
- "ModuleNameId out of range");
+ if (Lazy) {
+ auto It = FunctionMap->HashToFuncs.try_emplace(Hash).first;
+ StableFunctionMap::EntryStorage &Storage = It->second;
+ Storage.Offsets.push_back(FixedSizeFieldsOffset);
+ } else {
+ deserializeEntry(
+ reinterpret_cast<const unsigned char *>(FixedSizeFieldsOffset), Hash,
+ FunctionMap.get());
}
+ FixedSizeFieldsOffset += FixedSizeFieldsSizePerEntry;
+ }
- auto InstCount =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
-
- // Read IndexOperandHashes to build IndexOperandHashMap
- auto NumIndexOperandHashes =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
- for (unsigned J = 0; J < NumIndexOperandHashes; ++J) {
- auto InstIndex =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- auto OpndIndex =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- auto OpndHash =
- endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
- assert(InstIndex < InstCount && "InstIndex out of range");
-
- IndexOperandHashMap->try_emplace({InstIndex, OpndIndex}, OpndHash);
- }
+ // Update Ptr to the end of the serialized map to meet the expectation of
+ // CodeGenDataReader.
+ Ptr = reinterpret_cast<const unsigned char *>(FixedSizeFieldsOffset);
+ auto IndexOperandHashesByteSize =
+ endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
+ Ptr = reinterpret_cast<const unsigned char *>(
+ reinterpret_cast<uintptr_t>(Ptr) + IndexOperandHashesByteSize);
+}
- // Insert a new StableFunctionEntry into the map.
- auto FuncEntry = std::make_unique<StableFunctionMap::StableFunctionEntry>(
- Hash, FunctionNameId, ModuleNameId, InstCount,
- std::move(IndexOperandHashMap));
+void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr) {
+ deserialize(Ptr, /*Lazy=*/false);
+}
- FunctionMap->insert(std::move(FuncEntry));
- }
+void StableFunctionMapRecord::lazyDeserialize(
+ std::shared_ptr<MemoryBuffer> Buffer, uint64_t Offset) {
+ const auto *Ptr = reinterpret_cast<const unsigned char *>(
+ reinterpret_cast<uintptr_t>(Buffer->getBufferStart()) + Offset);
+ deserialize(Ptr, /*Lazy=*/true);
+ FunctionMap->Buffer = std::move(Buffer);
}
void StableFunctionMapRecord::serializeYAML(yaml::Output &YOS) const {
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 73f11c1345daf..47640c4aac6df 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -350,9 +350,8 @@ checkConstLocationCompatible(const StableFunctionMap::StableFunctionEntry &SF,
return true;
}
-static ParamLocsVecTy computeParamInfo(
- const SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>>
- &SFS) {
+static ParamLocsVecTy
+computeParamInfo(const StableFunctionMap::StableFunctionEntries &SFS) {
std::map<std::vector<stable_hash>, ParamLocs> HashSeqToLocs;
auto &RSF = *SFS[0];
unsigned StableFunctionCount = SFS.size();
@@ -396,19 +395,18 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
// Collect stable functions related to the current module.
DenseMap<stable_hash, SmallVector<std::pair<Function *, FunctionHashInfo>>>
HashToFuncs;
- auto &Maps = FunctionMap->getFunctionMap();
for (auto &F : M) {
if (!isEligibleFunction(&F))
continue;
auto FI = llvm::StructuralHashWithDifferences(F, ignoreOp);
- if (Maps.contains(FI.FunctionHash))
+ if (FunctionMap->contains(FI.FunctionHash))
HashToFuncs[FI.FunctionHash].emplace_back(&F, std::move(FI));
}
for (auto &[Hash, Funcs] : HashToFuncs) {
std::optional<ParamLocsVecTy> ParamLocsVec;
SmallVector<FuncMergeInfo> FuncMergeInfos;
- auto &SFS = Maps.at(Hash);
+ auto &SFS = FunctionMap->at(Hash);
assert(!SFS.empty());
auto &RFS = SFS[0];
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
index a4022eb885b43..47042d23cc2ca 100644
--- a/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
@@ -36,9 +36,11 @@
; Merge the cgdata using llvm-cgdata.
; We now validate the content of the merged cgdata.
-; Two functions have the same hash with only one different constnat at a same location.
+; Two functions have the same hash with only one different constant at the same location.
; RUN: llvm-cgdata --merge -o %tout.cgdata %tout-nowrite.1 %tout-nowrite.2
; RUN: llvm-cgdata --convert %tout.cgdata -o - | FileCheck %s
+; RUN: llvm-cgdata --merge -o %tout-lazy.cgdata %tout-nowrite.1 %tout-nowrite.2 -indexed-codegen-data-lazy-loading
+; RUN: llvm-cgdata --convert %tout-lazy.cgdata -indexed-codegen-data-lazy-loading -o - | FileCheck %s
; CHECK: - Hash: [[#%d,HASH:]]
; CHECK-NEXT: FunctionName: f1
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
index 0d2b0e848a2c9..2082eca58f073 100644
--- a/llvm/test/tools/llvm-cgdata/empty.test
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -16,7 +16,7 @@ RUN: llvm-cgdata --show %t_emptyheader.cgdata | count 0
# The version number appears when asked, as it's in the header
RUN: llvm-cgdata --show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
-VERSION: Version: 3
+VERSION: Version: 4
# When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
@@ -30,7 +30,7 @@ RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
# uint64_t StableFunctionMapOffset;
# }
RUN: printf '\xffcgdata\x81' > %t_header.cgdata
-RUN: printf '\x03\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x04\x00\x00\x00' >> %t_header.cgdata
RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
index 92ff484e31caf..9484371848a72 100644
--- a/llvm/test/tools/llvm-cgdata/error.test
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -22,9 +22,9 @@ RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
RUN: not llvm-cgdata --show %t_corrupt.cgdata 2>&1 | FileCheck %s --check-prefix=CORRUPT
CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
-# The current version 3 while the header says 4.
+# The current version is 4 while the header says 5.
RUN: printf '\xffcgdata\x81' > %t_version.cgdata
-RUN: printf '\x04\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x05\x00\x00\x00' >> %t_version.cgdata
RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test b/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
index b060872113b1b..70b83af407e5a 100644
--- a/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
+++ b/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
@@ -23,6 +23,8 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-both-hashtree-funcma
# Merge an object file having cgdata (__llvm_outline and __llvm_merge)
RUN: llvm-cgdata -m --skip-trim %t/merge-both-hashtree-funcmap.o -o %t/merge-both-hashtree-funcmap.cgdata
RUN: llvm-cgdata -s %t/merge-both-hashtree-funcmap.cgdata | FileCheck %s
+RUN: llvm-cgdata -m --skip-trim %t/merge-both-hashtree-funcmap.o -o %t/merge-both-hashtree-funcmap-lazy.cgdata -indexed-codegen-data-lazy-loading
+RUN: llvm-cgdata -s %t/merge-both-hashtree-funcmap-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Outlined hash tree:
CHECK-NEXT: Total Node Count: 3
@@ -63,4 +65,4 @@ CHECK-NEXT: Mergeable function Count: 0
;--- merge-both-template.ll
@.data1 = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
- at .data2 = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data2 = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
index 2936086321028..c088ffbb4e83f 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
@@ -23,8 +23,8 @@ RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
# Merge the archive into the codegen data file.
RUN: llvm-cgdata --merge --skip-trim %t/merge-archive.a -o %t/merge-archive.cgdata
RUN: llvm-cgdata --show %t/merge-archive.cgdata | FileCheck %s
-
-RUN: llvm-cgdata --show %t/merge-archive.cgdata| FileCheck %s
+RUN: llvm-cgdata --merge --skip-trim %t/merge-archive.a -o %t/merge-archive-lazy.cgdata -indexed-codegen-data-lazy-loading
+RUN: llvm-cgdata --show %t/merge-archive-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
CHECK-NEXT: Total function Count: 2
@@ -65,7 +65,7 @@ MAP-NEXT: ...
...
;--- merge-1-template.ll
- at .data = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
;--- raw-2.cgtext
:stable_function_map
@@ -80,4 +80,4 @@ MAP-NEXT: ...
...
;--- merge-2-template.ll
- at .data = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
index d2965456a1999..90b5992973b49 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
@@ -17,6 +17,8 @@ RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat-template-
RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
RUN: llvm-cgdata --merge --skip-trim %t/merge-concat.o -o %t/merge-concat.cgdata
RUN: llvm-cgdata --show %t/merge-concat.cgdata | FileCheck %s
+RUN: llvm-cgdata --merge --skip-trim %t/merge-concat.o -o %t/merge-concat-lazy.cgdata -indexed-codegen-data-lazy-loading
+RUN: llvm-cgdata --show %t/merge-concat-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
@@ -74,5 +76,5 @@ MAP-NEXT: ...
; In an linked executable (as opposed to an object file), cgdata in __llvm_merge might be concatenated.
; Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated.
; In other words, the following two trees are encoded back-to-back in a binary format.
- at .data1 = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
- at .data2 = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data1 = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data2 = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
index 8277e3272d77e..b986aef26f1d7 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
@@ -19,8 +19,9 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
# Merge two object files into the codegen data file.
RUN: llvm-cgdata --merge --skip-trim %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
-
RUN: llvm-cgdata --show %t/merge.cgdata | FileCheck %s
+RUN: llvm-cgdata --merge --skip-trim %t/merge-1.o %t/merge-2.o -o %t/merge-lazy.cgdata -indexed-codegen-data-lazy-loading
+RUN: llvm-cgdata --show %t/merge-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
CHECK-NEXT: Total function Count: 2
@@ -61,7 +62,7 @@ MAP-NEXT: ...
...
;--- merge-1-template.ll
- at .data = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
;--- raw-2.cgtext
:stable_function_map
@@ -76,4 +77,4 @@ MAP-NEXT: ...
...
;--- merge-2-template.ll
- at .data = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
index 9469f1cbda331..eac852ff7e710 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
@@ -15,6 +15,8 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merg
# Merge an object file having cgdata (__llvm_merge)
RUN: llvm-cgdata -m --skip-trim %t/merge-single.o -o %t/merge-single.cgdata
RUN: llvm-cgdata -s %t/merge-single.cgdata | FileCheck %s
+RUN: llvm-cgdata -m --skip-trim %t/merge-single.o -o %t/merge-single-lazy.cgdata -indexed-codegen-data-lazy-loading
+RUN: llvm-cgdata -s %t/merge-single-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
CHECK-NEXT: Total function Count: 1
@@ -33,4 +35,4 @@ CHECK-NEXT: Mergeable function Count: 0
...
;--- merge-single-template.ll
- at .data = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/tools/llvm-cgdata/Opts.td b/llvm/tools/llvm-cgdata/Opts.td
index 8da933f744e87..2b515a0140e67 100644
--- a/llvm/tools/llvm-cgdata/Opts.td
+++ b/llvm/tools/llvm-cgdata/Opts.td
@@ -31,3 +31,4 @@ def : JoinedOrSeparate<["-"], "o">, Alias<output>, MetaVarName<"<file>">, HelpTe
def format : Option<["--"], "format", KIND_SEPARATE>,
HelpText<"Specify the output format (text or binary)">, MetaVarName<"<value>">;
def : JoinedOrSeparate<["-"], "f">, Alias<format>, HelpText<"Alias for --format">;
+def indexed_codegen_data_lazy_loading : F<"indexed-codegen-data-lazy-loading", "Lazily load indexed CodeGenData for testing purposes.">, Flags<[HelpHidden]>;
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 98fa5c5657353..047557e5a7fae 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -83,6 +83,8 @@ static CGDataAction Action;
static std::optional<CGDataFormat> OutputFormat;
static std::vector<std::string> InputFilenames;
+extern cl::opt<bool> IndexedCodeGenDataLazyLoading;
+
static void exitWithError(Twine Message, StringRef Whence = "",
StringRef Hint = "") {
WithColor::error();
@@ -361,6 +363,9 @@ static void parseArgs(int argc, char **argv) {
default:
llvm_unreachable("unrecognized action");
}
+
+ IndexedCodeGenDataLazyLoading =
+ Args.hasArg(OPT_indexed_codegen_data_lazy_loading);
}
int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
diff --git a/llvm/unittests/CGData/StableFunctionMapTest.cpp b/llvm/unittests/CGData/StableFunctionMapTest.cpp
index d551ac8a814f4..5cf62ae0b3943 100644
--- a/llvm/unittests/CGData/StableFunctionMapTest.cpp
+++ b/llvm/unittests/CGData/StableFunctionMapTest.cpp
@@ -117,7 +117,7 @@ TEST(StableFunctionMap, Finalize3) {
Map.finalize();
auto &M = Map.getFunctionMap();
EXPECT_THAT(M, SizeIs(1));
- auto &FuncEntries = M.begin()->second;
+ auto &FuncEntries = M.begin()->second.Entries;
for (auto &FuncEntry : FuncEntries) {
EXPECT_THAT(*FuncEntry->IndexOperandHashMap, SizeIs(1));
ASSERT_THAT(*FuncEntry->IndexOperandHashMap,
>From 08a8008ed7cf4d119de325b57e98e5a9d9e573a6 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo at google.com>
Date: Thu, 14 Aug 2025 13:46:05 -0700
Subject: [PATCH 20/53] [Clang][attr] Add '-std=c11' to allow for typedef
redefinition
---
clang/test/Sema/attr-cfi-salt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/test/Sema/attr-cfi-salt.c b/clang/test/Sema/attr-cfi-salt.c
index 687f54dc499d8..bccdfc49ec5bb 100644
--- a/clang/test/Sema/attr-cfi-salt.c
+++ b/clang/test/Sema/attr-cfi-salt.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fsanitize=kcfi -verify %s
+// RUN: %clang_cc1 -std=c11 -fsyntax-only -fsanitize=kcfi -verify %s
// RUN: %clang_cc1 -std=c89 -DKNR -fsyntax-only -fsanitize=kcfi -verify %s
#define __cfi_salt(S) __attribute__((cfi_salt(S)))
>From 8f21c7aad050ac3f21594f0d7787bd2052e8b3da Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha at users.noreply.github.com>
Date: Thu, 14 Aug 2025 22:53:09 +0200
Subject: [PATCH 21/53] [CIR][NFC] Add Symbol Table to CIRGenFunction (#153625)
This patch adds a symbol table to CIRGenFunction, plus scopes and
insertions into the table where they were previously missing.
---
clang/include/clang/CIR/MissingFeatures.h | 3 ---
clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 6 ++++++
clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 9 ++++++++-
clang/lib/CIR/CodeGen/CIRGenFunction.h | 18 +++++++++++++++++-
clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 2 ++
5 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 805c43e6d5054..baab62f572b98 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -27,9 +27,6 @@ struct MissingFeatures {
// Address space related
static bool addressSpace() { return false; }
- // CIRGenFunction implementation details
- static bool cgfSymbolTable() { return false; }
-
// Unhandled global/linkage information.
static bool opGlobalThreadLocal() { return false; }
static bool opGlobalConstant() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index cc1fd81433b53..8bcca6f5d1803 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -591,6 +591,12 @@ LValue CIRGenFunction::emitDeclRefLValue(const DeclRefExpr *e) {
? emitLoadOfReferenceLValue(addr, getLoc(e->getSourceRange()),
vd->getType(), AlignmentSource::Decl)
: makeAddrLValue(addr, ty, AlignmentSource::Decl);
+
+  // Statics are defined as globals, so they are not included in the
+  // function's symbol table.
+ assert((vd->isStaticLocal() || symbolTable.count(vd)) &&
+ "non-static locals should be already mapped");
+
return lv;
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 8a3f5ab78ab59..d6a0792292604 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -215,7 +215,7 @@ void CIRGenFunction::declare(mlir::Value addrVal, const Decl *var, QualType ty,
mlir::Location loc, CharUnits alignment,
bool isParam) {
assert(isa<NamedDecl>(var) && "Needs a named decl");
- assert(!cir::MissingFeatures::cgfSymbolTable());
+ assert(!symbolTable.count(var) && "not supposed to be available just yet");
auto allocaOp = addrVal.getDefiningOp<cir::AllocaOp>();
assert(allocaOp && "expected cir::AllocaOp");
@@ -224,6 +224,8 @@ void CIRGenFunction::declare(mlir::Value addrVal, const Decl *var, QualType ty,
allocaOp.setInitAttr(mlir::UnitAttr::get(&getMLIRContext()));
if (ty->isReferenceType() || ty.isConstQualified())
allocaOp.setConstantAttr(mlir::UnitAttr::get(&getMLIRContext()));
+
+ symbolTable.insert(var, allocaOp);
}
void CIRGenFunction::LexicalScope::cleanup() {
@@ -485,6 +487,9 @@ void CIRGenFunction::finishFunction(SourceLocation endLoc) {
}
mlir::LogicalResult CIRGenFunction::emitFunctionBody(const clang::Stmt *body) {
+  // We start with a function-level scope for variables.
+ SymTableScopeTy varScope(symbolTable);
+
auto result = mlir::LogicalResult::success();
if (const CompoundStmt *block = dyn_cast<CompoundStmt>(body))
emitCompoundStmtWithoutScope(*block);
@@ -531,6 +536,8 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
FunctionArgList args;
QualType retTy = buildFunctionArgList(gd, args);
+ // Create a scope in the symbol table to hold variable declarations.
+ SymTableScopeTy varScope(symbolTable);
{
LexicalScope lexScope(*this, fusedLoc, entryBB);
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index ddc1edd77010c..c3e77c99cca35 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -31,6 +31,7 @@
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/MissingFeatures.h"
#include "clang/CIR/TypeEvaluationKind.h"
+#include "llvm/ADT/ScopedHashTable.h"
namespace {
class ScalarExprEmitter;
@@ -103,6 +104,14 @@ class CIRGenFunction : public CIRGenTypeCache {
/// Sanitizers enabled for this function.
clang::SanitizerSet sanOpts;
+ /// The symbol table maps a variable name to a value in the current scope.
+ /// Entering a function creates a new scope, and the function arguments are
+ /// added to the mapping. When the processing of a function is terminated,
+ /// the scope is destroyed and the mappings created in this scope are
+ /// dropped.
+ using SymTableTy = llvm::ScopedHashTable<const clang::Decl *, mlir::Value>;
+ SymTableTy symbolTable;
+
/// Whether or not a Microsoft-style asm block has been processed within
/// this fuction. These can potentially set the return value.
bool sawAsmBlock = false;
@@ -325,6 +334,9 @@ class CIRGenFunction : public CIRGenTypeCache {
~SourceLocRAIIObject() { restore(); }
};
+ using SymTableScopeTy =
+ llvm::ScopedHashTableScope<const clang::Decl *, mlir::Value>;
+
/// Hold counters for incrementally naming temporaries
unsigned counterRefTmp = 0;
unsigned counterAggTmp = 0;
@@ -499,7 +511,11 @@ class CIRGenFunction : public CIRGenTypeCache {
void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) {
assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!");
localDeclMap.insert({vd, addr});
- // TODO: Add symbol table support
+
+ // Add to the symbol table if not there already.
+ if (symbolTable.count(vd))
+ return;
+ symbolTable.insert(vd, addr.getPointer());
}
bool shouldNullCheckClassCastValue(const CastExpr *ce);
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index dffe8b408b6da..d1e4a14824011 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -33,6 +33,8 @@ void CIRGenFunction::emitCompoundStmtWithoutScope(const CompoundStmt &s) {
}
void CIRGenFunction::emitCompoundStmt(const CompoundStmt &s) {
+  // Add a local scope to track newly declared variables.
+ SymTableScopeTy varScope(symbolTable);
mlir::Location scopeLoc = getLoc(s.getSourceRange());
mlir::OpBuilder::InsertPoint scopeInsPt;
builder.create<cir::ScopeOp>(
>From cbab043a77c95942385de8823f43ebabd562c9af Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 14 Aug 2025 21:53:45 +0100
Subject: [PATCH 22/53] [AArch64] Change the cost of fma and fmuladd to match
fmul. (#152963)
As fmul and fmadd are so similar, their performance characteristics tend
to be the same on most platforms, at least in terms of reciprocal
throughputs. Processors capable of performing a given number of fmul per
cycle can usually perform the same number of fma, with the extra add
being relatively simple on top. This patch makes the scores of the two
operations the same, which brings the throughput cost of a fma/fmuladd
to 2, and the latency to 3, which are the defaults for fmul.
Note that we might also want to change the throughput cost of a fmul to
1, as most processors have ample bandwidth for them, but they should
still stay in line with one another.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 10 ++
.../Analysis/CostModel/AArch64/arith-fp.ll | 44 +++---
.../CostModel/AArch64/sve-arith-fp.ll | 32 ++--
.../AArch64/f128-fmuladd-reduction.ll | 148 +++++++++++-------
.../AArch64/veclib-intrinsic-calls.ll | 6 +-
.../AArch64/reused-scalar-repeated-in-node.ll | 142 +++++------------
.../AArch64/vec3-reorder-reshuffle.ll | 39 +++--
.../SLPVectorizer/insertelement-postpone.ll | 22 ++-
8 files changed, 214 insertions(+), 229 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3fba7e853eafb..3042251cf754d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -666,6 +666,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return LT.first;
break;
}
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd: {
+    // Cost an fma or fmuladd the same as an fmul instruction, as their costs
+    // are usually the same. TODO: Add fp16 and bf16 expansion costs.
+ Type *EltTy = RetTy->getScalarType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
+ (EltTy->isHalfTy() && ST->hasFullFP16()))
+ return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
+ break;
+ }
case Intrinsic::stepvector: {
InstructionCost Cost = 1; // Cost of the `index' instruction
auto LT = getTypeLegalizationCost(RetTy);
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
index 0a154d09c36ba..c208d03ff94b7 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll
@@ -663,13 +663,13 @@ define void @fcopysign_fp16() {
define void @fma() {
; CHECK-LABEL: 'fma'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F128 = call fp128 @llvm.fma.f128(fp128 undef, fp128 undef, fp128 undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V2F128 = call <2 x fp128> @llvm.fma.v2f128(<2 x fp128> undef, <2 x fp128> undef, <2 x fp128> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -698,10 +698,10 @@ define void @fma_fp16() {
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-FP16-LABEL: 'fma_fp16'
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
@@ -713,13 +713,13 @@ define void @fma_fp16() {
define void @fmuladd() {
; CHECK-LABEL: 'fmuladd'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %F64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:6 SizeLat:2 for: %F128 = call fp128 @llvm.fmuladd.f128(fp128 undef, fp128 undef, fp128 undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:2 Lat:6 SizeLat:2 for: %V2F128 = call <2 x fp128> @llvm.fmuladd.v2f128(<2 x fp128> undef, <2 x fp128> undef, <2 x fp128> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -748,10 +748,10 @@ define void @fmuladd_fp16() {
; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-FP16-LABEL: 'fmuladd_fp16'
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V4F16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V8F16 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%F16 = call half @llvm.fmuladd.f32(half undef, half undef, half undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
index f7ebd406d230a..1c40354892191 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
@@ -167,14 +167,14 @@ define void @frem() {
define void @fma() {
; CHECK-LABEL: 'fma'
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -193,14 +193,14 @@ define void @fma() {
define void @fmuladd() {
; CHECK-LABEL: 'fmuladd'
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
index d9710328d6048..f3542f63a4273 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
@@ -4,73 +4,102 @@
define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr %end1, double %x, i64 %n) {
; CHECK-LABEL: define double @fp128_fmuladd_reduction(
; CHECK-SAME: ptr [[START0:%.*]], ptr [[START1:%.*]], ptr [[END0:%.*]], ptr [[END1:%.*]], double [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP2]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ [[X]], %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 16
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 48
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START0]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP4]]
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP5]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ [[X]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x fp128>, ptr [[TMP1]], align 16
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x fp128>, ptr [[TMP24]], align 16
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x fp128>, ptr [[TMP4]], align 16
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x fp128>, ptr [[TMP5]], align 16
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP3]], i32 4
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP3]], i32 6
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP28]], align 16
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP35]], align 16
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x double>, ptr [[TMP36]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD]] to <2 x double>
+; CHECK-NEXT: [[TMP11:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD3]] to <2 x double>
+; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD4]] to <2 x double>
+; CHECK-NEXT: [[TMP13:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD5]] to <2 x double>
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP10]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP11]], [[WIDE_LOAD7]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP12]], [[WIDE_LOAD8]]
+; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP13]], [[WIDE_LOAD9]]
+; CHECK-NEXT: [[TMP18:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> [[TMP14]])
+; CHECK-NEXT: [[TMP19:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP18]], <2 x double> [[TMP15]])
+; CHECK-NEXT: [[TMP20:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP19]], <2 x double> [[TMP16]])
+; CHECK-NEXT: [[TMP21]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP20]], <2 x double> [[TMP17]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 16
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]]
-; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = mul i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX4]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX4]], 16
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX4]], 24
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START1]], i64 [[OFFSET_IDX4]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[X]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 2
+; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]]
+; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC11]], 16
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC11]], 8
; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP8]]
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[VEC_EPILOG_PH]] ], [ [[TMP33:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX12]], 16
+; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[START0]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[INDEX12]], 8
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = load fp128, ptr [[NEXT_GEP]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = load fp128, ptr [[NEXT_GEP1]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = load fp128, ptr [[NEXT_GEP2]], align 16
-; CHECK-NEXT: [[TMP13:%.*]] = load fp128, ptr [[NEXT_GEP3]], align 16
-; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[NEXT_GEP5]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[NEXT_GEP6]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[NEXT_GEP7]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[NEXT_GEP8]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = fptrunc fp128 [[TMP10]] to double
-; CHECK-NEXT: [[TMP19:%.*]] = fptrunc fp128 [[TMP11]] to double
-; CHECK-NEXT: [[TMP20:%.*]] = fptrunc fp128 [[TMP12]] to double
-; CHECK-NEXT: [[TMP21:%.*]] = fptrunc fp128 [[TMP13]] to double
-; CHECK-NEXT: [[TMP22:%.*]] = fmul double [[TMP18]], [[TMP14]]
-; CHECK-NEXT: [[TMP23:%.*]] = fmul double [[TMP19]], [[TMP15]]
-; CHECK-NEXT: [[TMP24:%.*]] = fmul double [[TMP20]], [[TMP16]]
-; CHECK-NEXT: [[TMP25:%.*]] = fmul double [[TMP21]], [[TMP17]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd double [[VEC_PHI]], [[TMP22]]
-; CHECK-NEXT: [[TMP27:%.*]] = fadd double [[TMP26]], [[TMP23]]
-; CHECK-NEXT: [[TMP28:%.*]] = fadd double [[TMP27]], [[TMP24]]
-; CHECK-NEXT: [[TMP29]] = fadd double [[TMP28]], [[TMP25]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[START0]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[START1]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[ENTRY]] ]
+; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x fp128>, ptr [[NEXT_GEP14]], align 16
+; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x double>, ptr [[NEXT_GEP8]], align 16
+; CHECK-NEXT: [[TMP31:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD17]] to <2 x double>
+; CHECK-NEXT: [[TMP32:%.*]] = fmul <2 x double> [[TMP31]], [[WIDE_LOAD18]]
+; CHECK-NEXT: [[TMP33]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI13]], <2 x double> [[TMP32]])
+; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX12]], 2
+; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[TMP34]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[N]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[NEXT_GEP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START0]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL21:%.*]] = phi ptr [ [[NEXT_GEP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[NEXT_GEP6]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START1]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi double [ [[TMP33]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[X]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PTR0:%.*]] = phi ptr [ [[PTR0_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ [[PTR1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL10]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi double [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR0:%.*]] = phi ptr [ [[PTR0_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ [[PTR1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL22]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi double [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX23]], %[[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[PTR0_NEXT]] = getelementptr i8, ptr [[PTR0]], i64 16
; CHECK-NEXT: [[PTR1_NEXT]] = getelementptr i8, ptr [[PTR1]], i64 8
; CHECK-NEXT: [[LOAD0:%.*]] = load fp128, ptr [[PTR0]], align 16
@@ -79,9 +108,9 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr
; CHECK-NEXT: [[RED_NEXT]] = tail call double @llvm.fmuladd.f64(double [[TRUNC]], double [[LOAD1]], double [[RED]])
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[TMP33]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret double [[LCSSA]]
;
entry:
@@ -110,5 +139,6 @@ exit:
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
index f6f2e39594dd8..64fc573e660bd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -1310,7 +1310,8 @@ define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
;
; LIBMVEC-SVE-LABEL: define void @fma_f64
; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[TMP8:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[IN:%.*]], double [[IN]], double [[IN]])
;
; SLEEF-NEON-LABEL: define void @fma_f64
; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
@@ -1357,7 +1358,8 @@ define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) {
;
; LIBMVEC-SVE-LABEL: define void @fma_f32
; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[TMP8:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD]])
+; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[IN:%.*]], float [[IN]], float [[IN]])
;
; SLEEF-NEON-LABEL: define void @fma_f32
; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index cca58d8d66f04..26ce0fc6e6a3b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -12,8 +12,7 @@ define void @test() {
; CHECK: [[BB63]]:
; CHECK-NEXT: br label %[[BB64]]
; CHECK: [[BB64]]:
-; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
-; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16
; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4
; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8
@@ -25,122 +24,57 @@ define void @test() {
; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4
; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16
; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15
; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
; CHECK: [[BB77]]:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I70]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[I67]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[I69]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[I66]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[I68]], i32 2
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[I66]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[I67]], i32 6
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I69]], i32 7
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP20]], <16 x float> [[TMP0]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
; CHECK-NEXT: br label %[[BB78:.*]]
; CHECK: [[BB78]]:
-; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[TMP46:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP39:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP30:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[TMP53:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[TMP40:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP0]], %[[BB77]] ], [ [[TMP38:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP1]], %[[BB77]] ], [ [[TMP35:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP3]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP4]], %[[BB77]] ], [ [[TMP29:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP8]], poison
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP7]], poison
-; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[TMP6]], poison
-; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[TMP5]], poison
-; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP8]], poison
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP7]], poison
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x float> [[TMP6]], poison
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[TMP5]], poison
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP19]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[TMP21]], poison
-; CHECK-NEXT: [[TMP23:%.*]] = fadd fast <2 x float> [[TMP16]], [[TMP10]]
-; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <2 x float> [[TMP23]], poison
-; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x float> [[TMP18]], [[TMP11]]
-; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x float> [[TMP25]], poison
-; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x float> [[TMP20]], [[TMP12]]
-; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x float> [[TMP27]], poison
-; CHECK-NEXT: [[TMP29]] = fadd fast <2 x float> [[TMP22]], poison
-; CHECK-NEXT: [[TMP30]] = extractelement <2 x float> [[TMP29]], i32 1
-; CHECK-NEXT: [[TMP31]] = extractelement <2 x float> [[TMP29]], i32 0
-; CHECK-NEXT: [[TMP32]] = fadd fast <2 x float> [[TMP24]], poison
-; CHECK-NEXT: [[TMP53]] = extractelement <2 x float> [[TMP32]], i32 1
-; CHECK-NEXT: [[TMP46]] = extractelement <2 x float> [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP35]] = fadd fast <2 x float> [[TMP26]], poison
-; CHECK-NEXT: [[TMP36]] = extractelement <2 x float> [[TMP35]], i32 1
-; CHECK-NEXT: [[TMP37]] = extractelement <2 x float> [[TMP35]], i32 0
-; CHECK-NEXT: [[TMP38]] = fadd fast <2 x float> [[TMP28]], poison
-; CHECK-NEXT: [[TMP39]] = extractelement <2 x float> [[TMP38]], i32 1
-; CHECK-NEXT: [[TMP40]] = extractelement <2 x float> [[TMP38]], i32 0
-; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]]
-; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]]
-; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]]
-; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]]
-; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]]
-; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]]
-; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]]
-; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]]
-; CHECK-NEXT: [[OP_RDX14:%.*]] = fadd fast float poison, [[I133]]
-; CHECK-NEXT: [[OP_RDX15:%.*]] = fadd fast float [[OP_RDX14]], [[I135]]
-; CHECK-NEXT: [[OP_RDX12:%.*]] = fadd fast float poison, [[I136]]
-; CHECK-NEXT: [[OP_RDX13:%.*]] = fadd fast float [[OP_RDX12]], [[I128]]
-; CHECK-NEXT: [[OP_RDX10:%.*]] = fadd fast float poison, [[I138]]
-; CHECK-NEXT: [[OP_RDX11:%.*]] = fadd fast float [[OP_RDX10]], [[I129]]
-; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd fast float poison, [[I137]]
-; CHECK-NEXT: [[OP_RDX9:%.*]] = fadd fast float [[OP_RDX8]], [[I130]]
-; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <2 x float> [[TMP8]], poison
-; CHECK-NEXT: [[TMP42:%.*]] = fmul fast <2 x float> [[TMP7]], poison
-; CHECK-NEXT: [[TMP43:%.*]] = fmul fast <2 x float> [[TMP6]], poison
-; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <2 x float> [[TMP5]], poison
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[TMP41]], i32 1
-; CHECK-NEXT: [[I157:%.*]] = fadd fast float poison, [[TMP45]]
-; CHECK-NEXT: [[I150:%.*]] = extractelement <2 x float> [[TMP41]], i32 0
-; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[TMP42]], i32 1
-; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float poison, [[TMP47]]
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x float> [[TMP42]], i32 0
-; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd fast float [[OP_RDX4]], [[TMP48]]
-; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x float> [[TMP43]], i32 1
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float poison, [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x float> [[TMP43]], i32 0
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP50]]
-; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x float> [[TMP44]], i32 0
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float poison, [[TMP51]]
-; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x float> [[TMP44]], i32 1
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP52]]
+; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP14]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi <8 x float> [ [[TMP19]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
+; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <16 x float> [[TMP25]], [[TMP0]]
+; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], [[TMP26]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
+; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <16 x float> [[TMP29]], poison
+; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP32]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]]
; CHECK: [[BB167]]:
-; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[OP_RDX1]], %[[BB78]] ]
-; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX3]], %[[BB78]] ]
-; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX5]], %[[BB78]] ]
-; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[TMP60]], %[[BB78]] ]
-; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[OP_RDX9]], %[[BB78]] ]
-; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[OP_RDX11]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[OP_RDX13]], %[[BB78]] ]
-; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[OP_RDX15]], %[[BB78]] ]
-; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[TMP40]], %[[BB78]] ]
-; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP36]], %[[BB78]] ]
-; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[TMP53]], %[[BB78]] ]
-; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP30]], %[[BB78]] ]
-; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP39]], %[[BB78]] ]
-; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[TMP37]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[TMP46]], %[[BB78]] ]
-; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[TMP31]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP35:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP30]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP35]], i32 14
; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP35]], i32 13
; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x float> [[TMP35]], i32 15
; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]]
; CHECK: [[BB184]]:
; CHECK-NEXT: br label %[[BB185:.*]]
; CHECK: [[BB185]]:
; CHECK-NEXT: br i1 poison, label %[[BB185]], label %[[BB186]]
; CHECK: [[BB186]]:
-; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ]
+; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[TMP36]], %[[BB167]] ], [ poison, %[[BB185]] ]
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 2b591a2165534..683b92752c702 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -402,19 +402,32 @@ entry:
}
define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
-; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
-; CHECK-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
-; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
-; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
-; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
-; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
+; NON-POW2-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[FNEG]], i32 0
+; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> <float poison, float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> <i32 0, i32 0, i32 2>
+; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP2]], <3 x float> [[TMP4]], <3 x float> zeroinitializer)
+; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[BEZT]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @reuse_shuffle_indices_cost_crash_2(
+; POW2-ONLY-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[BEZT]], align 4
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
+; POW2-ONLY-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
+; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX8_I831]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%fneg = fmul float %0, 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index b5d74f0b91ab8..eefc99feebb95 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -24,21 +24,17 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
; AARCH86-NEXT: entry:
; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; AARCH86-NEXT: [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]]
-; AARCH86-NEXT: [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]]
-; AARCH86-NEXT: [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00
-; AARCH86-NEXT: [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]]
-; AARCH86-NEXT: [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00
-; AARCH86-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]]
-; AARCH86-NEXT: [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00
; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]]
-; AARCH86-NEXT: [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]]
-; AARCH86-NEXT: [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0
-; AARCH86-NEXT: [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1
-; AARCH86-NEXT: [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2
-; AARCH86-NEXT: [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3
+; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; AARCH86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; AARCH86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
+; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; AARCH86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; AARCH86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; AARCH86-NEXT: [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
; AARCH86-NEXT: ret <4 x double> [[I1994]]
;
entry:
From 4b074ec1f03ecf157d950763217bfe5944cc774d Mon Sep 17 00:00:00 2001
From: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
Date: Thu, 14 Aug 2025 14:10:20 -0700
Subject: [PATCH 23/53] [flang][cuda] Add bind names for __double2ull_rX
interfaces (#153678)
---
flang/module/cudadevice.f90 | 14 +++++++-------
flang/test/Lower/CUDA/cuda-libdevice.cuf | 15 +++++++++++++++
2 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 1e403b1ffdb91..259812d0eed1d 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -653,29 +653,29 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
- interface __double2ull_rn
- attributes(device) integer(8) function __double2ull_rn(r) bind(c)
+ interface __double2ull_rd
+ attributes(device) integer(8) function __double2ull_rd(r) bind(c, name='__nv_double2ull_rd')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
end interface
- interface __double2ull_rz
- attributes(device) integer(8) function __double2ull_rz(r) bind(c)
+ interface __double2ull_rn
+ attributes(device) integer(8) function __double2ull_rn(r) bind(c, name='__nv_double2ull_rn')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
end interface
interface __double2ull_ru
- attributes(device) integer(8) function __double2ull_ru(r) bind(c)
+ attributes(device) integer(8) function __double2ull_ru(r) bind(c, name='__nv_double2ull_ru')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
end interface
- interface __double2ull_rd
- attributes(device) integer(8) function __double2ull_rd(r) bind(c)
+ interface __double2ull_rz
+ attributes(device) integer(8) function __double2ull_rz(r) bind(c, name='__nv_double2ull_rz')
!dir$ ignore_tkr (d) r
double precision, value :: r
end function
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index 844bdb954924a..52cf9c391c8de 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -115,3 +115,18 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ll_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ll_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ll_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+
+attributes(global) subroutine test_double2ull_rX()
+ integer(8) :: res
+ double precision :: r
+ res = __double2ull_rd(r)
+ res = __double2ull_rn(r)
+ res = __double2ull_ru(r)
+ res = __double2ull_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_double2ull_rx
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
From 705c86cdb4329569a6fdf0d49056409228ed64fc Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Thu, 14 Aug 2025 22:18:24 +0100
Subject: [PATCH 24/53] Revert "[SLP]Support LShr as base for copyable
elements"
This reverts commit ca4ebf95172d24f8c47655709b2c9eb85bda5cb2.
Causes compile-time crashes for some inputs with RVV zvl512b/zvl1024b
configurations. See here for a minimal reproducer:
https://github.com/llvm/llvm-project/pull/153393#issuecomment-3189898813
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 86 ++++++-------------
.../alternate-vectorization-split-node.ll | 7 +-
.../X86/load-merge-inseltpoison.ll | 6 +-
.../SLPVectorizer/X86/load-merge.ll | 6 +-
4 files changed, 40 insertions(+), 65 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7362d5b0b5865..c35a7552b4058 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10564,12 +10564,6 @@ class InstructionsCompatibilityAnalysis {
unsigned MainOpcode = 0;
Instruction *MainOp = nullptr;
- /// Checks if the opcode is supported as the main opcode for copyable
- /// elements.
- static bool isSupportedOpcode(const unsigned Opcode) {
- return Opcode == Instruction::Add || Opcode == Instruction::LShr;
- }
-
/// Identifies the best candidate value, which represents main opcode
/// operation.
/// Currently the best candidate is the Add instruction with the parent
@@ -10577,28 +10571,28 @@ class InstructionsCompatibilityAnalysis {
void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
BasicBlock *Parent = nullptr;
// Checks if the instruction has supported opcode.
- auto IsSupportedInstruction = [&](Instruction *I) {
- return I && isSupportedOpcode(I->getOpcode()) &&
+ auto IsSupportedOpcode = [&](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add &&
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
};
// Exclude operands instructions immediately to improve compile time, it
// will be unable to schedule anyway.
SmallDenseSet<Value *, 8> Operands;
- SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
for (Value *V : VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
if (!DT.isReachableFromEntry(I->getParent()))
continue;
- if (Candidates.empty()) {
- Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+ if (!MainOp) {
+ MainOp = I;
Parent = I->getParent();
Operands.insert(I->op_begin(), I->op_end());
continue;
}
if (Parent == I->getParent()) {
- Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+ if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
+ MainOp = I;
Operands.insert(I->op_begin(), I->op_end());
continue;
}
@@ -10610,35 +10604,24 @@ class InstructionsCompatibilityAnalysis {
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have different DFS numbers");
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
- Candidates.clear();
- Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+ MainOp = I;
Parent = I->getParent();
Operands.clear();
Operands.insert(I->op_begin(), I->op_end());
}
}
- unsigned BestOpcodeNum = 0;
- MainOp = nullptr;
- for (const auto &P : Candidates) {
- if (P.second.size() < BestOpcodeNum)
- continue;
- for (Instruction *I : P.second) {
- if (IsSupportedInstruction(I) && !Operands.contains(I)) {
- MainOp = I;
- BestOpcodeNum = P.second.size();
- break;
- }
- }
+ if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+ MainOp = nullptr;
+ return;
}
- if (MainOp)
- MainOpcode = MainOp->getOpcode();
+ MainOpcode = MainOp->getOpcode();
}
/// Returns the idempotent value for the \p MainOp with the detected \p
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
/// the operand itself, since V or V == V.
Value *selectBestIdempotentValue() const {
- assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
+ assert(MainOpcode == Instruction::Add && "Unsupported opcode");
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
!MainOp->isCommutative());
}
@@ -10651,8 +10634,13 @@ class InstructionsCompatibilityAnalysis {
return {V, V};
if (!S.isCopyableElement(V))
return convertTo(cast<Instruction>(V), S).second;
- assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
- return {V, selectBestIdempotentValue()};
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
}
/// Builds operands for the original instructions.
@@ -10865,21 +10853,6 @@ class InstructionsCompatibilityAnalysis {
}
if (!Res)
return InstructionsState::invalid();
- constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
- InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
- InstructionCost VectorCost;
- FixedVectorType *VecTy =
- getWidenedType(S.getMainOp()->getType(), VL.size());
- switch (MainOpcode) {
- case Instruction::Add:
- case Instruction::LShr:
- VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
- break;
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- if (VectorCost > ScalarCost)
- return InstructionsState::invalid();
return S;
}
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21117,7 +21090,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
const auto *It = find(Op, CD->getInst());
assert(It != Op.end() && "Lane not set");
- SmallPtrSet<Instruction *, 4> Visited;
do {
int Lane = std::distance(Op.begin(), It);
assert(Lane >= 0 && "Lane not set");
@@ -21139,15 +21111,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
(InsertInReadyList && UseSD->isReady()))
WorkList.push_back(UseSD);
}
- } else if (Visited.insert(In).second) {
- if (ScheduleData *UseSD = getScheduleData(In)) {
- CD->incDependencies();
- if (!UseSD->isScheduled())
- CD->incrementUnscheduledDeps(1);
- if (!UseSD->hasValidDependencies() ||
- (InsertInReadyList && UseSD->isReady()))
- WorkList.push_back(UseSD);
- }
+ } else if (ScheduleData *UseSD = getScheduleData(In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
}
It = find(make_range(std::next(It), Op.end()), CD->getInst());
} while (It != Op.end());
@@ -21905,11 +21875,9 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (E.isCopyableElement(V))
- return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
SimplifyQuery(*DL));
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
index 6d961fc3378b4..8d44d03e0e5cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
@@ -8,8 +8,11 @@ define i32 @test(ptr %c) {
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
-; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index c02ef8388b066..4f94784a24dd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
+; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
+; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 0545e5403f594..700e3ed9effc4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
+; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
+; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[TMP5]]
From 486d58df880e6568158bad3b0619cc16650dada1 Mon Sep 17 00:00:00 2001
From: DeanSturtevant1 <dsturtevant at google.com>
Date: Thu, 14 Aug 2025 17:32:18 -0400
Subject: [PATCH 25/53] Fix mlir/BUILD.bazel for XeGPUUtils. (#153689)
---
utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 +++
1 file changed, 3 insertions(+)
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index cd96f50186143..dba7af1c683b9 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3723,11 +3723,14 @@ cc_library(
includes = ["include"],
deps = [
":DialectUtils",
+ ":GPUDialect",
":IR",
+ ":LLVMDialect",
":LoopLikeInterface",
":SCFTransforms",
":TransformUtils",
":XeGPUDialect",
+ ":XeVMDialect",
"//llvm:Support",
],
)
From 10b85b6fef239710ac4ef0c73a263d51a763fd35 Mon Sep 17 00:00:00 2001
From: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
Date: Thu, 14 Aug 2025 14:44:47 -0700
Subject: [PATCH 26/53] [flang][cuda] Add interfaces for __drcp_rX (#153681)
---
flang/module/cudadevice.f90 | 28 ++++++++++++++++++++++++
flang/test/Lower/CUDA/cuda-libdevice.cuf | 17 ++++++++++++++
2 files changed, 45 insertions(+)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 259812d0eed1d..3e86dac5f5641 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -779,6 +779,34 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __drcp_rd
+ attributes(device) double precision function __drcp_rd(x) bind(c, name='__nv_drcp_rd')
+ !dir$ ignore_tkr (d) x
+ double precision, value :: x
+ end function
+ end interface
+
+ interface __drcp_rn
+ attributes(device) double precision function __drcp_rn(x) bind(c, name='__nv_drcp_rn')
+ !dir$ ignore_tkr (d) x
+ double precision, value :: x
+ end function
+ end interface
+
+ interface __drcp_ru
+ attributes(device) double precision function __drcp_ru(x) bind(c, name='__nv_drcp_ru')
+ !dir$ ignore_tkr (d) x
+ double precision, value :: x
+ end function
+ end interface
+
+ interface __drcp_rz
+ attributes(device) double precision function __drcp_rz(x) bind(c, name='__nv_drcp_rz')
+ !dir$ ignore_tkr (d) x
+ double precision, value :: x
+ end function
+ end interface
+
interface __dsqrt_rd
attributes(device) double precision function __dsqrt_rd(x) bind(c, name='__nv_dsqrt_rd')
!dir$ ignore_tkr (d) x
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index 52cf9c391c8de..374d9f30ff008 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -116,6 +116,22 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ll_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ll_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+
+attributes(global) subroutine test_drcp_rX()
+ double precision :: res
+ double precision :: r
+ res = __drcp_rd(r)
+ res = __drcp_rn(r)
+ res = __drcp_ru(r)
+ res = __drcp_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_drcp_rx
+! CHECK: %{{.*}} = fir.call @__nv_drcp_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_drcp_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_drcp_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_drcp_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+
attributes(global) subroutine test_double2ull_rX()
integer(8) :: res
double precision :: r
@@ -130,3 +146,4 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+
From f964be1ef98fa76316af37303495847912d7d1d4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 14 Aug 2025 14:52:54 -0700
Subject: [PATCH 27/53] [RISCV][MoveMerge] Don't copy kill flag when moving
past an instruction that reads the register. (#153644)
If we're moving the second copy before another instruction that reads
the copied register, we need to clear the kill flag on the combined
move.
Fixes #153598.
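The rule the patch implements can be stated simply: when the second `mv` is hoisted up to the merged `cm.mva01s`, its source register's kill flag is only valid if nothing between the new position and the old one still reads that register. A minimal Python sketch of that scan (the instruction model and names here are illustrative, not the actual LLVM `MachineOperand` API):

```python
def kill_flag_after_hoist(insts, new_idx, old_idx, reg, is_kill):
    """Decide the kill flag for `reg` after moving an instruction from
    position old_idx up to new_idx. If any instruction strictly between
    the two positions still reads the register, the flag must be cleared,
    mirroring the loop added in RISCVMoveMerge::mergePairedInsns."""
    for inst in insts[new_idx + 1:old_idx]:
        if reg in inst.get("reads", ()):
            return False  # a later read exists; 'kill' would be invalid
    return is_kill        # no intervening read; keep the original flag
```

In the `pr153598.mir` test above, the `SW renamable $x8` between the merge point and the second `ADDI` is exactly such an intervening read, so the combined move drops `killed` on that operand.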
---
llvm/lib/Target/RISCV/RISCVMoveMerger.cpp | 16 ++++++++++++++--
llvm/test/CodeGen/RISCV/pr153598.mir | 23 +++++++++++++++++++++++
2 files changed, 37 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/pr153598.mir
diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
index 7a2541a652b58..0d37db0138e47 100644
--- a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
@@ -137,6 +137,11 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
NextI = next_nodbg(NextI, E);
DebugLoc DL = I->getDebugLoc();
+ // Make a copy so we can update the kill flag in the MoveFromAToS case. The
+ // copied operand needs to be scoped outside the if since we make a pointer
+ // to it.
+ MachineOperand PairedSource = *PairedRegs.Source;
+
// The order of S-reg depends on which instruction holds A0, instead of
// the order of register pair.
// e,g.
@@ -147,8 +152,15 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
// mv a1, s1 => cm.mva01s s2,s1
bool StartWithX10 = ARegInFirstPair == RISCV::X10;
if (isMoveFromAToS(Opcode)) {
- Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source;
- Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source;
+ // We are moving one of the copies earlier so its kill flag may become
+ // invalid. Clear the copied kill flag if there are any reads of the
+ // register between the new location and the old location.
+ for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It)
+ if (It->readsRegister(PairedSource.getReg(), TRI))
+ PairedSource.setIsKill(false);
+
+ Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource;
+ Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source;
} else {
Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination;
Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination;
diff --git a/llvm/test/CodeGen/RISCV/pr153598.mir b/llvm/test/CodeGen/RISCV/pr153598.mir
new file mode 100644
index 0000000000000..a084197fe83cc
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr153598.mir
@@ -0,0 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -mattr=+zcmp -run-pass=riscv-move-merge -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: mov-merge
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x8, $x9
+ ; CHECK-LABEL: name: mov-merge
+ ; CHECK: liveins: $x8, $x9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x12 = ADDI $x0, -3
+ ; CHECK-NEXT: SW renamable $x9, $x2, 56
+ ; CHECK-NEXT: CM_MVA01S killed renamable $x9, renamable $x8, implicit-def $x10, implicit-def $x11
+ ; CHECK-NEXT: SW renamable $x8, $x2, 60
+ ; CHECK-NEXT: PseudoRET
+ $x12 = ADDI $x0, -3
+ SW renamable $x9, $x2, 56
+ $x10 = ADDI killed renamable $x9, 0
+ SW renamable $x8, $x2, 60
+ $x11 = ADDI killed renamable $x8, 0
+ PseudoRET
+...
>From 77f85aaa85a2da6b62813195601c63533b72820a Mon Sep 17 00:00:00 2001
From: DeanSturtevant1 <dsturtevant at google.com>
Date: Thu, 14 Aug 2025 18:03:41 -0400
Subject: [PATCH 28/53] [bazel] Fix mlir/BUILD.bazel for VectorToXeGPU.
(#153696)
---
utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
1 file changed, 1 insertion(+)
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index dba7af1c683b9..19b6ad57b28eb 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7956,6 +7956,7 @@ cc_library(
":Transforms",
":VectorDialect",
":XeGPUDialect",
+ ":XeGPUUtils",
"//llvm:Support",
],
)
>From af689011328c614575437e1d53bdc1358fd54df8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
=?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Thu, 14 Aug 2025 15:08:33 -0700
Subject: [PATCH 29/53] [flang][cuda] Add interfaces for __dsqrt_rn and
__dsqrt_rz (#153624)
---
flang/module/cudadevice.f90 | 14 ++++++++++++++
flang/test/Lower/CUDA/cuda-device-proc.cuf | 4 ----
flang/test/Lower/CUDA/cuda-libdevice.cuf | 16 +++++++++++++++-
3 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 3e86dac5f5641..a8b1d19a4c1ff 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -814,6 +814,13 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __dsqrt_rn
+ attributes(device) double precision function __dsqrt_rn(x) bind(c, name='__nv_dsqrt_rn')
+ !dir$ ignore_tkr (d) x
+ double precision, value :: x
+ end function
+ end interface
+
interface __dsqrt_ru
attributes(device) double precision function __dsqrt_ru(x) bind(c, name='__nv_dsqrt_ru')
!dir$ ignore_tkr (d) x
@@ -821,6 +828,13 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __dsqrt_rz
+ attributes(device) double precision function __dsqrt_rz(x) bind(c, name='__nv_dsqrt_rz')
+ !dir$ ignore_tkr (d) x
+ double precision, value :: x
+ end function
+ end interface
+
interface __ddiv_rn
attributes(device) double precision function __ddiv_rn(x,y) bind(c, name='__nv_ddiv_rn')
!dir$ ignore_tkr (d) x, (d) y
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index ab90dec790046..a6e8c69b2e52e 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -82,8 +82,6 @@ attributes(global) subroutine devsub()
ai = __mul24(ai, ai)
ai = __umul24(ai, ai)
af = __powf(af, af)
- ad = __dsqrt_rd(ad)
- ad = __dsqrt_ru(ad)
ad = __ull2double_rd(al)
ad = __ull2double_rn(al)
ad = __ull2double_ru(al)
@@ -162,8 +160,6 @@ end
! CHECK: %{{.*}} = fir.call @__nv_mul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32
! CHECK: %{{.*}} = fir.call @__nv_umul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32
! CHECK: %{{.*}} = fir.call @__nv_powf(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32, f32) -> f32
-! CHECK: %{{.*}} = fir.call @__nv_dsqrt_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
-! CHECK: %{{.*}} = fir.call @__nv_dsqrt_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
! CHECK: %{{.*}} = fir.call @__nv_ull2double_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
! CHECK: %{{.*}} = fir.call @__nv_ull2double_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
! CHECK: %{{.*}} = fir.call @__nv_ull2double_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f64
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index 374d9f30ff008..447e09cad747b 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -20,6 +20,21 @@ end subroutine
! CHECK-LABEL: _QPtest_usad
! CHECK: %{{.*}} = fir.call @__nv_usad(%{{.*}}, %{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32, i32) -> i32
+attributes(global) subroutine test_dsqrt_rX()
+ double precision :: res
+ double precision :: p
+ res = __dsqrt_rd(p)
+ res = __dsqrt_rn(p)
+ res = __dsqrt_ru(p)
+ res = __dsqrt_rz(p)
+end subroutine
+
+! CHECK-LABEL: _QPtest_dsqrt_rx
+! CHECK: %{{.*}} = fir.call @__nv_dsqrt_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_dsqrt_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_dsqrt_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+! CHECK: %{{.*}} = fir.call @__nv_dsqrt_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
+
attributes(global) subroutine test_uint2float_rX()
real :: res
integer :: i
@@ -146,4 +161,3 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
-
>From 80b30931c2523213e0601daf58d5441f22087821 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor at nvidia.com>
Date: Thu, 14 Aug 2025 15:14:12 -0700
Subject: [PATCH 30/53] [CIR] Add index support for global_view (#153254)
The #cir.global_view attribute was initially added without support for
the optional index list. This change adds index list support. This is
used when the address of an array or structure member is used as an
initializer.
This patch does not include support for taking the address of a
structure or class member. That will be added later.
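The index computation this patch adds (computeGlobalViewIndicesFromFlatOffset) can be sketched for the array case as follows. This is an illustrative model under stated assumptions, not the patch's code: the helper name and the fixed 4-byte int size are assumptions for the example.

```python
# Illustrative model of turning a flat byte offset into global_view
# indices for a nested array type: peel one dimension at a time,
# dividing by the byte size of the remaining sub-array.
def indices_from_flat_offset(offset, dims, elt_size):
    indices = []
    for i in range(len(dims)):
        sub = elt_size
        for d in dims[i + 1:]:
            sub *= d
        indices.append(offset // sub)
        offset -= indices[-1] * sub
    return indices

# For `int arr[4][16]; int *p = &arr[2][1];` the flat offset is
# 2*64 + 1*4 = 132 bytes, which maps back to the indices [2, 1]
# seen in the CIR test output.
print(indices_from_flat_offset(132, [4, 16], 4))  # [2, 1]
```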
---
.../CIR/Dialect/Builder/CIRBaseBuilder.h | 5 +-
.../include/clang/CIR/Dialect/IR/CIRAttrs.td | 51 +++++++++++++--
.../clang/CIR/Dialect/IR/CIRDataLayout.h | 20 ++++++
clang/lib/CIR/CodeGen/CIRGenBuilder.cpp | 64 +++++++++++++++++++
clang/lib/CIR/CodeGen/CIRGenBuilder.h | 9 +++
clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 26 +++++---
clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp | 23 ++++++-
.../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 17 ++++-
clang/test/CIR/CodeGen/globals.cpp | 17 +++++
9 files changed, 215 insertions(+), 17 deletions(-)
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 986c8c3d133ac..0bf3cb26be850 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -214,9 +214,10 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
/// Get constant address of a global variable as an MLIR attribute.
cir::GlobalViewAttr getGlobalViewAttr(cir::PointerType type,
- cir::GlobalOp globalOp) {
+ cir::GlobalOp globalOp,
+ mlir::ArrayAttr indices = {}) {
auto symbol = mlir::FlatSymbolRefAttr::get(globalOp.getSymNameAttr());
- return cir::GlobalViewAttr::get(type, symbol);
+ return cir::GlobalViewAttr::get(type, symbol, indices);
}
mlir::Value createGetGlobal(mlir::Location loc, cir::GlobalOp global) {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 9db236eb5580e..e899db84bb615 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -379,13 +379,20 @@ def CIR_GlobalViewAttr : CIR_Attr<"GlobalView", "global_view", [
]> {
let summary = "Provides constant access to a global address";
let description = [{
- Get constant address of global `symbol`. It provides a way to access globals
- from other global and always produces a pointer.
+    Get the constant address of global `symbol` and optionally apply offsets to
+    access existing sub-elements. It provides a way to access globals from other
+    globals and always produces a pointer.
The type of the input symbol can be different from `#cir.global_view`
output type, since a given view of the global might require a static
cast for initializing other globals.
+    A list of indices can optionally be passed; each element successively
+    indexes into the underlying type. For `symbol` types like `!cir.array`
+    and `!cir.record`, this yields the constant address of a sub-element,
+    while for `!cir.ptr` an offset is applied. The first index is relative
+    to the original symbol type, not the produced one.
+
The result type of this attribute may be an integer type. In such a case,
the pointer to the referenced global is casted to an integer and this
attribute represents the casted result.
@@ -396,16 +403,49 @@ def CIR_GlobalViewAttr : CIR_Attr<"GlobalView", "global_view", [
cir.global external @s = @".str2": !cir.ptr<i8>
cir.global external @x = #cir.global_view<@s> : !cir.ptr<i8>
cir.global external @s_addr = #cir.global_view<@s> : !s64i
+
+ cir.global external @rgb = #cir.const_array<[0 : i8, -23 : i8, 33 : i8]
+ : !cir.array<i8 x 3>>
+ cir.global external @elt_ptr = #cir.global_view<@rgb, [1]> : !cir.ptr<i8>
+ ```
+
+    Note that, unlike LLVM IR's `gep` instruction, CIR doesn't add the leading
+    zero index when it is known to be constant zero (e.g. for pointers); the
+    indices are used exclusively to access sub-elements or to apply an offset.
+    The leading zero index is added later, during lowering.
+
+ Example:
+ ```
+ struct A {
+ int a;
+ };
+
+ struct B: virtual A {
+ int b;
+ };
+ ```
+ VTT for B in CIR:
+ ```
+ cir.global linkonce_odr @_ZTT1B = #cir.const_array<[
+ #cir.global_view<@_ZTV1B, [0 : i32, 3 : i32]> : !cir.ptr<!u8i>]>
+ : !cir.array<!cir.ptr<!u8i> x 1>
+ ```
+ VTT for B in LLVM IR:
+ ```
+ @_ZTT1B = linkonce_odr global [1 x ptr] [ptr getelementptr inbounds
+ ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 3)], align 8
```
}];
let parameters = (ins AttributeSelfTypeParameter<"">:$type,
- "mlir::FlatSymbolRefAttr":$symbol);
+ "mlir::FlatSymbolRefAttr":$symbol,
+ OptionalParameter<"mlir::ArrayAttr">:$indices);
let builders = [
AttrBuilderWithInferredContext<(ins "mlir::Type":$type,
- "mlir::FlatSymbolRefAttr":$symbol), [{
- return $_get(type.getContext(), type, symbol);
+ "mlir::FlatSymbolRefAttr":$symbol,
+ CArg<"mlir::ArrayAttr", "{}">:$indices), [{
+ return $_get(type.getContext(), type, symbol, indices);
}]>
];
@@ -413,6 +453,7 @@ def CIR_GlobalViewAttr : CIR_Attr<"GlobalView", "global_view", [
let assemblyFormat = [{
`<`
$symbol
+ (`,` $indices^)?
`>`
}];
}
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
index a46c2679d0481..ecc681ee310e3 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
@@ -35,6 +35,13 @@ class CIRDataLayout {
bool isBigEndian() const { return bigEndian; }
+ /// Internal helper method that returns requested alignment for type.
+ llvm::Align getAlignment(mlir::Type ty, bool abiOrPref) const;
+
+ llvm::Align getABITypeAlign(mlir::Type ty) const {
+ return getAlignment(ty, true);
+ }
+
/// Returns the maximum number of bytes that may be overwritten by
/// storing the specified type.
///
@@ -48,6 +55,19 @@ class CIRDataLayout {
baseSize.isScalable()};
}
+ /// Returns the offset in bytes between successive objects of the
+ /// specified type, including alignment padding.
+ ///
+ /// If Ty is a scalable vector type, the scalable property will be set and
+ /// the runtime size will be a positive integer multiple of the base size.
+ ///
+ /// This is the amount that alloca reserves for this type. For example,
+ /// returns 12 or 16 for x86_fp80, depending on alignment.
+ llvm::TypeSize getTypeAllocSize(mlir::Type ty) const {
+ // Round up to the next alignment boundary.
+ return llvm::alignTo(getTypeStoreSize(ty), getABITypeAlign(ty).value());
+ }
+
llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const;
};
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
index 4a5a1dd53a05a..755c76c89a645 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "CIRGenBuilder.h"
+#include "llvm/ADT/TypeSwitch.h"
using namespace clang::CIRGen;
@@ -66,6 +67,69 @@ clang::CIRGen::CIRGenBuilderTy::getConstFP(mlir::Location loc, mlir::Type t,
return create<cir::ConstantOp>(loc, cir::FPAttr::get(t, fpVal));
}
+void CIRGenBuilderTy::computeGlobalViewIndicesFromFlatOffset(
+ int64_t offset, mlir::Type ty, cir::CIRDataLayout layout,
+ llvm::SmallVectorImpl<int64_t> &indices) {
+ if (!offset)
+ return;
+
+ auto getIndexAndNewOffset =
+ [](int64_t offset, int64_t eltSize) -> std::pair<int64_t, int64_t> {
+ int64_t divRet = offset / eltSize;
+ if (divRet < 0)
+      divRet -= 1; // bias the quotient down so the remainder is non-negative
+ int64_t modRet = offset - (divRet * eltSize);
+ return {divRet, modRet};
+ };
+
+ mlir::Type subType =
+ llvm::TypeSwitch<mlir::Type, mlir::Type>(ty)
+ .Case<cir::ArrayType>([&](auto arrayTy) {
+ int64_t eltSize = layout.getTypeAllocSize(arrayTy.getElementType());
+ const auto [index, newOffset] =
+ getIndexAndNewOffset(offset, eltSize);
+ indices.push_back(index);
+ offset = newOffset;
+ return arrayTy.getElementType();
+ })
+ .Case<cir::RecordType>([&](auto recordTy) {
+ ArrayRef<mlir::Type> elts = recordTy.getMembers();
+ int64_t pos = 0;
+ for (size_t i = 0; i < elts.size(); ++i) {
+ int64_t eltSize =
+ (int64_t)layout.getTypeAllocSize(elts[i]).getFixedValue();
+ unsigned alignMask = layout.getABITypeAlign(elts[i]).value() - 1;
+ if (recordTy.getPacked())
+ alignMask = 0;
+              // A union's fields all share the same offset, so there is no
+              // need to update pos here; we just need the first eltSize that
+              // is greater than the required offset. The same applies to the
+              // similar union check below.
+ if (!recordTy.isUnion())
+ pos = (pos + alignMask) & ~alignMask;
+ assert(offset >= 0);
+ if (offset < pos + eltSize) {
+ indices.push_back(i);
+ offset -= pos;
+ return elts[i];
+ }
+ // No need to update pos here, see the comment above.
+ if (!recordTy.isUnion())
+ pos += eltSize;
+ }
+ llvm_unreachable("offset was not found within the record");
+ })
+ .Default([](mlir::Type otherTy) {
+ llvm_unreachable("unexpected type");
+ return otherTy; // Even though this is unreachable, we need to
+ // return a type to satisfy the return type of the
+ // lambda.
+ });
+
+ assert(subType);
+ computeGlobalViewIndicesFromFlatOffset(offset, subType, layout, indices);
+}
+
// This can't be defined in Address.h because that file is included by
// CIRGenBuilder.h
Address Address::withElementType(CIRGenBuilderTy &builder,
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 8b2538c941f47..59d2adc15a01a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -12,6 +12,7 @@
#include "Address.h"
#include "CIRGenRecordLayout.h"
#include "CIRGenTypeCache.h"
+#include "clang/CIR/Dialect/IR/CIRDataLayout.h"
#include "clang/CIR/Interfaces/CIRTypeInterfaces.h"
#include "clang/CIR/MissingFeatures.h"
@@ -401,6 +402,14 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
mlir::Value maybeBuildArrayDecay(mlir::Location loc, mlir::Value arrayPtr,
mlir::Type eltTy);
+  // Convert a byte offset to a sequence of high-level indices suitable for
+  // GlobalViewAttr. Ideally we wouldn't deal with low-level offsets at all,
+  // but currently some parts of the Clang AST, which we don't want to touch
+  // just yet, return them.
+ void computeGlobalViewIndicesFromFlatOffset(
+ int64_t offset, mlir::Type ty, cir::CIRDataLayout layout,
+ llvm::SmallVectorImpl<int64_t> &indices);
+
/// Creates a versioned global variable. If the symbol is already taken, an ID
/// will be appended to the symbol. The returned global must always be queried
/// for its name so it can be referenced correctly.
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 87ea34df6be59..c2b3734173923 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -388,11 +388,20 @@ class ConstantLValueEmitter
/// Return GEP-like value offset
mlir::ArrayAttr getOffset(mlir::Type ty) {
int64_t offset = value.getLValueOffset().getQuantity();
- if (offset == 0)
- return {};
+ cir::CIRDataLayout layout(cgm.getModule());
+ SmallVector<int64_t, 3> idxVec;
+ cgm.getBuilder().computeGlobalViewIndicesFromFlatOffset(offset, ty, layout,
+ idxVec);
+
+ llvm::SmallVector<mlir::Attribute, 3> indices;
+ for (int64_t i : idxVec) {
+ mlir::IntegerAttr intAttr = cgm.getBuilder().getI32IntegerAttr(i);
+ indices.push_back(intAttr);
+ }
- cgm.errorNYI("ConstantLValueEmitter: global view with offset");
- return {};
+ if (indices.empty())
+ return {};
+ return cgm.getBuilder().getArrayAttr(indices);
}
/// Apply the value offset to the given constant.
@@ -400,10 +409,11 @@ class ConstantLValueEmitter
// Handle attribute constant LValues.
if (auto attr = mlir::dyn_cast<mlir::Attribute>(c.value)) {
if (auto gv = mlir::dyn_cast<cir::GlobalViewAttr>(attr)) {
- if (value.getLValueOffset().getQuantity() == 0)
- return gv;
- cgm.errorNYI("ConstantLValue: global view with offset");
- return {};
+ auto baseTy = mlir::cast<cir::PointerType>(gv.getType()).getPointee();
+ mlir::Type destTy = cgm.getTypes().convertTypeForMem(destType);
+ assert(!gv.getIndices() && "Global view is already indexed");
+ return cir::GlobalViewAttr::get(destTy, gv.getSymbol(),
+ getOffset(baseTy));
}
llvm_unreachable("Unsupported attribute type to offset");
}
diff --git a/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp b/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp
index 8b806b406a536..42d45819de0f3 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp
@@ -23,10 +23,30 @@ void CIRDataLayout::reset(mlir::DataLayoutSpecInterface spec) {
}
}
+llvm::Align CIRDataLayout::getAlignment(mlir::Type ty, bool useABIAlign) const {
+ if (auto recTy = llvm::dyn_cast<cir::RecordType>(ty)) {
+ // Packed record types always have an ABI alignment of one.
+ if (recTy && recTy.getPacked() && useABIAlign)
+ return llvm::Align(1);
+
+ // Get the layout annotation... which is lazily created on demand.
+    llvm_unreachable("getAlignment() for record type is not implemented");
+ }
+
+  // FIXME(cir): This does not account for different address spaces, and relies
+ // on CIR's data layout to give the proper alignment.
+ assert(!cir::MissingFeatures::addressSpace());
+
+ // Fetch type alignment from MLIR's data layout.
+ unsigned align = useABIAlign ? layout.getTypeABIAlignment(ty)
+ : layout.getTypePreferredAlignment(ty);
+ return llvm::Align(align);
+}
+
// The implementation of this method is provided inline as it is particularly
// well suited to constant folding when called on a specific Type subclass.
llvm::TypeSize CIRDataLayout::getTypeSizeInBits(mlir::Type ty) const {
- assert(!cir::MissingFeatures::dataLayoutTypeIsSized());
+  assert(cir::isSized(ty) && "Cannot getTypeSizeInBits() on an unsized type!");
if (auto recordTy = llvm::dyn_cast<cir::RecordType>(ty)) {
// FIXME(cir): CIR record's data layout implementation doesn't do a good job
@@ -38,5 +58,6 @@ llvm::TypeSize CIRDataLayout::getTypeSizeInBits(mlir::Type ty) const {
// on CIR's data layout to give the proper ABI-specific type width.
assert(!cir::MissingFeatures::addressSpace());
+ // This is calling mlir::DataLayout::getTypeSizeInBits().
return llvm::TypeSize::getFixed(layout.getTypeSizeInBits(ty));
}
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 20b8787d4f55f..1ea296a6887ef 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -426,7 +426,22 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) {
mlir::Value addrOp = rewriter.create<mlir::LLVM::AddressOfOp>(
loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), symName);
- assert(!cir::MissingFeatures::globalViewIndices());
+ if (globalAttr.getIndices()) {
+ llvm::SmallVector<mlir::LLVM::GEPArg> indices;
+
+ if (mlir::isa<mlir::LLVM::LLVMArrayType, mlir::LLVM::LLVMStructType>(
+ sourceType))
+ indices.push_back(0);
+
+ for (mlir::Attribute idx : globalAttr.getIndices()) {
+ auto intAttr = mlir::cast<mlir::IntegerAttr>(idx);
+ indices.push_back(intAttr.getValue().getSExtValue());
+ }
+ mlir::Type resTy = addrOp.getType();
+ mlir::Type eltTy = converter->convertType(sourceType);
+ addrOp = rewriter.create<mlir::LLVM::GEPOp>(
+ loc, resTy, eltTy, addrOp, indices, mlir::LLVM::GEPNoWrapFlags::none);
+ }
// The incubator has handling here for the attribute having integer type, but
// the only test case I could find that reaches it is a direct CIR-to-LLVM IR
diff --git a/clang/test/CIR/CodeGen/globals.cpp b/clang/test/CIR/CodeGen/globals.cpp
index 7a08a4824276e..a3e16139a41a9 100644
--- a/clang/test/CIR/CodeGen/globals.cpp
+++ b/clang/test/CIR/CodeGen/globals.cpp
@@ -2,6 +2,8 @@
// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
// Should constant initialize global with constant address.
int var = 1;
@@ -11,6 +13,8 @@ int *constAddr = &var;
// LLVM: @constAddr = global ptr @var, align 8
+// OGCG: @constAddr = global ptr @var, align 8
+
// Should constant initialize global with constant address.
int f();
int (*constFnAddr)() = f;
@@ -18,3 +22,16 @@ int (*constFnAddr)() = f;
// CIR: cir.global external @constFnAddr = #cir.global_view<@_Z1fv> : !cir.ptr<!cir.func<() -> !s32i>>
// LLVM: @constFnAddr = global ptr @_Z1fv, align 8
+
+// OGCG: @constFnAddr = global ptr @_Z1fv, align 8
+
+int arr[4][16];
+int *constArrAddr = &arr[2][1];
+
+// CIR: cir.global external @constArrAddr = #cir.global_view<@arr, [2 : i32, 1 : i32]> : !cir.ptr<!s32i>
+
+// The 'inbounds' and 'nuw' flags are inferred by LLVM's constant folder. The
+// same flags show up at -O1 in OGCG.
+// LLVM: @constArrAddr = global ptr getelementptr inbounds nuw (i8, ptr @arr, i64 132), align 8
+
+// OGCG: @constArrAddr = global ptr getelementptr (i8, ptr @arr, i64 132), align 8
>From 50050d69e38bbe06665bf0bd014c6da3210ec4d0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 15:16:56 -0700
Subject: [PATCH 31/53] [AMDGPU] Don't allow wgp mode on gfx1250 (#153680)
- gfx1250 only supports CU mode
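The new predicate and the EU-count change it feeds can be summarized as a small model; a minimal sketch mirroring the patch's `supportsWGP`/`getEUsPerCU` logic (the Python function names are illustrative):

```python
# WGP mode exists on gfx10+ but is removed again on gfx1250, which is
# CU-mode only.
def supports_wgp(is_gfx10_plus, is_gfx1250):
    return is_gfx10_plus and not is_gfx1250

def eus_per_cu(is_gfx10_plus, is_gfx1250, cu_mode):
    # gfx12.5: CU mode only, and its CU holds four SIMDs.
    if is_gfx1250:
        return 4
    # gfx10+ in CU mode: the CU holds two SIMDs.
    if is_gfx10_plus and cu_mode:
        return 2
    # Pre-gfx10 CU, or gfx10 in WGP mode (two CUs): four SIMDs total.
    return 4

print(supports_wgp(True, True))       # False
print(eus_per_cu(True, True, True))   # 4
print(eus_per_cu(True, False, True))  # 2
```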
---
clang/test/CodeGenHIP/hip-cumode.hip | 10 +-
clang/test/Driver/hip-macros.hip | 14 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 5 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 5 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6 +-
.../MCTargetDesc/AMDGPUTargetStreamer.cpp | 3 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 21 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 +
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 101 +-
llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 364 ++---
.../memory-legalizer-local-workgroup.ll | 1211 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 288 ++--
llvm/test/MC/AMDGPU/hsa-diag-v4.s | 33 +-
13 files changed, 1623 insertions(+), 439 deletions(-)
diff --git a/clang/test/CodeGenHIP/hip-cumode.hip b/clang/test/CodeGenHIP/hip-cumode.hip
index 1aa1ca7a1a7ee..61fd53c644e8c 100644
--- a/clang/test/CodeGenHIP/hip-cumode.hip
+++ b/clang/test/CodeGenHIP/hip-cumode.hip
@@ -5,14 +5,20 @@
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
// RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=NOWGP,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=NOWGP,WARN-CUMODE %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
-// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s
+// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=NOWGP,WARN-CUMODE %s
+// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
// NOWGP-NOT: .amdhsa_workgroup_processor_mode
// CUMODE-ON: .amdhsa_workgroup_processor_mode 0
// CUMODE-OFF: .amdhsa_workgroup_processor_mode 1
diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip
index bd93f9985a774..516e01a6c4743 100644
--- a/clang/test/Driver/hip-macros.hip
+++ b/clang/test/Driver/hip-macros.hip
@@ -27,21 +27,27 @@
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
// RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \
+// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \
+// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s
+// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
// Check no duplicate warnings.
// RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \
// RUN: -mno-cumode -mno-cumode \
-// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s
+// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s
-// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
-// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored]
+// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
+// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored]
// CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1
// CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c7d2d268a2707..188c126cb9fbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1144,8 +1144,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
CreateExpr(STM.getWavefrontSize()), Ctx),
CreateExpr(1ULL << ScratchAlignShift));
- if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
+ if (STM.supportsWGP()) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
+ }
+
+ if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.MemOrdered = 1;
ProgInfo.FwdProgress = 1;
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0184075c2c909..951473264d089 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6270,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
ExprVal, ValRange);
} else if (ID == ".amdhsa_workgroup_processor_mode") {
- if (IVersion.Major < 10)
- return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ if (!supportsWGP(getSTI()))
+ return Error(IDRange.Start,
+ "directive unsupported on " + getSTI().getCPU(), IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal,
ValRange);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f47ddf5d93ec3..7ca7e8448c63d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -390,7 +390,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// the original value.
bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
- bool supportsWGP() const { return getGeneration() >= GFX10; }
+ bool supportsWGP() const {
+ if (GFX1250Insts)
+ return false;
+ return getGeneration() >= GFX10;
+ }
bool hasIntClamp() const {
return HasIntClamp;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 68302f0dd0d64..1f35e92151bfc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split");
- if (IVersion.Major >= 10) {
+ if (AMDGPU::supportsWGP(STI))
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
".amdhsa_workgroup_processor_mode");
+ if (IVersion.Major >= 10) {
PrintField(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ec9f1abdd8467..c41d62748c4be 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1167,12 +1167,21 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
- // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // workgroup must share".
+
+ // GFX12.5 only supports CU mode, which contains four SIMDs.
+ if (isGFX1250(*STI)) {
+ assert(STI->getFeatureBits().test(FeatureCuMode));
+ return 4;
+ }
+
+ // For gfx10 in CU mode the functional block is the CU, which contains
// two SIMDs.
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode))
return 2;
- // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
- // two CUs, so a total of four SIMDs.
+
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP
+ // contains two CUs, so a total of four SIMDs.
return 4;
}
@@ -2480,6 +2489,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
+bool supportsWGP(const MCSubtargetInfo &STI) {
+ if (isGFX1250(STI))
+ return false;
+ return isGFX10Plus(STI);
+}
+
bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); }
bool isNotGFX10Plus(const MCSubtargetInfo &STI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 704bf106ace76..befab68bb5698 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1549,6 +1549,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
+bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3daae98961bff..01854c8560ce2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
+; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
-; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
-; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, v15
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
-; GFX1250-NEXT: v_mov_b32_e32 v0, v16
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
+; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index f0db321d3931a..e532deaca98a8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
-; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Test S_WAIT_XCNT insertion for global_load/store clauses.
; Introduced additional operations in between the clauses to have the register dependency
; between the operands of VMEM operations and the def ops of VALU instructions that followed.
@@ -123,29 +123,10 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
-; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: v_dual_mov_b32 v39, v4 :: v_dual_mov_b32 v38, v3
+; GCN-SDAG-NEXT: s_clause 0xf
+; GCN-SDAG-NEXT: global_load_b128 v[2:5], v[0:1], off offset:224
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:160
@@ -155,138 +136,103 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:96
; GCN-SDAG-NEXT: global_load_b128 v[48:51], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[52:55], v[0:1], off offset:64
-; GCN-SDAG-NEXT: global_load_b128 v[38:41], v[0:1], off offset:80
-; GCN-SDAG-NEXT: global_load_b128 v[42:45], v[0:1], off offset:32
-; GCN-SDAG-NEXT: global_load_b128 v[56:59], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[60:63], v[0:1], off
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
-; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:56 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:224
-; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:72 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0xe
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:240
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:192
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:208
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:160
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:176
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:128
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:144
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:112
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[52:55], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[38:41], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[42:45], off offset:32
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[56:59], off offset:48
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[60:63], off
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
-; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_load_b32 v63, off, s32
-; GCN-SDAG-NEXT: scratch_load_b32 v62, off, s32 offset:4
-; GCN-SDAG-NEXT: scratch_load_b32 v61, off, s32 offset:8
-; GCN-SDAG-NEXT: scratch_load_b32 v60, off, s32 offset:12
-; GCN-SDAG-NEXT: scratch_load_b32 v59, off, s32 offset:16
-; GCN-SDAG-NEXT: scratch_load_b32 v58, off, s32 offset:20
-; GCN-SDAG-NEXT: scratch_load_b32 v57, off, s32 offset:24
-; GCN-SDAG-NEXT: scratch_load_b32 v56, off, s32 offset:28
-; GCN-SDAG-NEXT: scratch_load_b32 v45, off, s32 offset:32
-; GCN-SDAG-NEXT: scratch_load_b32 v44, off, s32 offset:36
-; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 offset:40
-; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:44
-; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:48
-; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:52
-; GCN-SDAG-NEXT: s_wait_xcnt 0xe
-; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GCN-SDAG-NEXT: global_load_b128 v[64:67], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[68:71], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[80:83], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[84:87], v[0:1], off
+; GCN-SDAG-NEXT: global_load_b128 v[96:99], v[0:1], off offset:16
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xf
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[2:5], off offset:224
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xe
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[6:9], off offset:240
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xd
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off offset:192
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xc
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off offset:208
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xb
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[18:21], off offset:160
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xa
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off offset:176
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x9
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[26:29], off offset:128
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x8
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[30:33], off offset:144
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[34:37], off offset:96
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[48:51], off offset:112
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[52:55], off offset:64
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[64:67], off offset:80
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[68:71], off offset:32
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[80:83], off offset:48
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[84:87], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[96:99], off offset:16
+; GCN-SDAG-NEXT: s_wait_xcnt 0x10
+; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v98
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v64i32_load_store:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v4
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-GISEL-NEXT: s_wait_xcnt 0x8
-; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
-; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
-; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80
-; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:112
-; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:128
-; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:144
-; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:160
-; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:176
-; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:192
-; GCN-GISEL-NEXT: global_load_b128 v[38:41], v[0:1], off offset:208
-; GCN-GISEL-NEXT: global_load_b128 v[42:45], v[0:1], off offset:224
-; GCN-GISEL-NEXT: global_load_b128 v[56:59], v[0:1], off
-; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0x1
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0xe
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:32
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[6:9], off offset:48
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[10:13], off offset:64
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[14:17], off offset:80
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[18:21], off offset:96
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[22:25], off offset:112
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[26:29], off offset:128
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[30:33], off offset:144
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[34:37], off offset:160
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[48:51], off offset:176
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[52:55], off offset:192
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[38:41], off offset:208
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[42:45], off offset:224
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[56:59], off
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[60:63], off offset:16
-; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:240
-; GCN-GISEL-NEXT: s_wait_xcnt 0x0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v62
-; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_load_b32 v63, off, s32
-; GCN-GISEL-NEXT: scratch_load_b32 v62, off, s32 offset:4
-; GCN-GISEL-NEXT: scratch_load_b32 v61, off, s32 offset:8
-; GCN-GISEL-NEXT: scratch_load_b32 v60, off, s32 offset:12
-; GCN-GISEL-NEXT: scratch_load_b32 v59, off, s32 offset:16
-; GCN-GISEL-NEXT: scratch_load_b32 v58, off, s32 offset:20
-; GCN-GISEL-NEXT: scratch_load_b32 v57, off, s32 offset:24
-; GCN-GISEL-NEXT: scratch_load_b32 v56, off, s32 offset:28
-; GCN-GISEL-NEXT: scratch_load_b32 v47, off, s32 offset:32
-; GCN-GISEL-NEXT: scratch_load_b32 v46, off, s32 offset:36
-; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32 offset:40
-; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:44
-; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:48
-; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:52
-; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:56
-; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:60
+; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16
+; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:64
+; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:80
+; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96
+; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112
+; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128
+; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:144
+; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:160
+; GCN-GISEL-NEXT: global_load_b128 v[64:67], v[0:1], off offset:176
+; GCN-GISEL-NEXT: global_load_b128 v[68:71], v[0:1], off offset:192
+; GCN-GISEL-NEXT: global_load_b128 v[80:83], v[0:1], off offset:208
+; GCN-GISEL-NEXT: global_load_b128 v[84:87], v[0:1], off offset:224
+; GCN-GISEL-NEXT: global_load_b128 v[96:99], v[0:1], off offset:240
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xf
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[2:5], off offset:32
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xe
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[6:9], off offset:48
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xd
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xc
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[14:17], off offset:16
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xb
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[18:21], off offset:64
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xa
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[22:25], off offset:80
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x9
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[26:29], off offset:96
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x8
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[30:33], off offset:112
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x7
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[34:37], off offset:128
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[48:51], off offset:144
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[52:55], off offset:160
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x4
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[64:67], off offset:176
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x3
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[68:71], off offset:192
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x2
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[80:83], off offset:208
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[84:87], off offset:224
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[96:99], off offset:240
+; GCN-GISEL-NEXT: s_wait_xcnt 0x10
+; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
store <64 x i32> %vec, ptr addrspace(1) %out, align 4
@@ -299,99 +245,78 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
-; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
-; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112
+; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96
+; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[24:25], 0x70
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 0x60
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[42:43], 0
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
-; GCN-SDAG-NEXT: v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
+; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0
; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[24:25], v[10:13], off
+; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[18:21], off
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
+; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
-; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[18:21], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[14:17], off
+; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[22:25], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[26:29], off
+; GCN-SDAG-NEXT: global_store_b128 v[64:65], v[26:29], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[30:33], off
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_wait_xcnt 0x3
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: global_store_b128 v[66:67], v[30:33], off
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x2
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x1
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7]
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off
-; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off
+; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:112
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:64
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:80
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:32
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16
-; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32
-; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4
-; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8
-; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12
-; GCN-SDAG-NEXT: s_wait_xcnt 0xc
+; GCN-SDAG-NEXT: s_wait_xcnt 0x8
; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v16i64_load_store:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
@@ -405,11 +330,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16
; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32
; GCN-GISEL-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[66:67], 0x60
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[68:69], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[40:41], 0x50
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[64:65], 0x50
; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
@@ -419,13 +344,13 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt 0x3
; GCN-GISEL-NEXT: global_store_b128 v[52:53], v[22:25], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x2
-; GCN-GISEL-NEXT: global_store_b128 v[42:43], v[26:29], off
+; GCN-GISEL-NEXT: global_store_b128 v[66:67], v[26:29], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
-; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: global_store_b128 v[68:69], v[30:33], off
; GCN-GISEL-NEXT: s_wait_xcnt 0x5
-; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: s_wait_xcnt 0x4
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
@@ -448,7 +373,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-GISEL-NEXT: s_clause 0x1
; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off
-; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off
+; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[34:37], off
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16
@@ -458,15 +383,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112
-; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32
-; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:4
-; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:8
-; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:12
-; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:16
-; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:20
+; GCN-GISEL-NEXT: s_wait_xcnt 0x9
; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
%in_a = insertelement <16 x i64> %a, i64 100, i32 5
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 62d7f4801baf8..94f5aab1eb67d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_unordered_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") monotonic, align 4
@@ -524,6 +549,18 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
@@ -718,6 +755,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -859,6 +909,16 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") unordered, align 4
@@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") monotonic, align 4
@@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") monotonic
@@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") unordered, align 4
@@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") monotonic, align 4
@@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") acquire, align 4
@@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") unordered, align 4
@@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") release, align 4
@@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") release
@@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 8304be958f1ad..f78168ba29ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -343,66 +343,66 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[42:43]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[36:37]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[38:39]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[54:55]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[56:57]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[48:49]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[50:51]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[44:45]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[46:47]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[32:33]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
@@ -1600,66 +1600,66 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v40, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v40, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v40, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v40, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v40, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v40, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v40, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v40, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v40, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s20 :: v_dual_mov_b32 v35, s21
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s22 :: v_dual_mov_b32 v39, s23
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v37, s29
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s30 :: v_dual_mov_b32 v43, s31
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s24 :: v_dual_mov_b32 v33, s19
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s28 :: v_dual_mov_b32 v57, s15
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v54, s12
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s13 :: v_dual_mov_b32 v56, s14
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s7 :: v_dual_mov_b32 v52, s2
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s27 :: v_dual_mov_b32 v48, s4
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s5 :: v_dual_mov_b32 v50, s6
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s25 :: v_dual_mov_b32 v46, s26
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s9
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v39, s11
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[42:43]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[36:37]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[34:35]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[38:39]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[54:55]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[56:57]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[48:49]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[50:51]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[44:45]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[46:47]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[32:33]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v40, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
@@ -2431,65 +2431,65 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v34, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v34, s[34:35] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v34, s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v34, s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v34, s[34:35]
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v34, s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v34, s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v34, s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v34, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[30:31]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[28:29]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[12:13]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[56:57], s[14:15]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[2:3]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[4:5]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[6:7]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[24:25]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[26:27]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[8:9]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[10:11]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[42:43], v[42:43]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[0:1]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[16:17]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[54:55], v[54:55]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[36:37], v[36:37]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[38:39], v[38:39]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[56:57], v[56:57]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49]
; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[48:49], v[48:49]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[50:51], v[50:51]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[44:45], v[44:45]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[46:47], v[46:47]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[32:33], v[32:33]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[16:19], s[34:35] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[12:15], s[34:35] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[8:11], s[34:35] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[4:7], s[34:35] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[20:23], s[34:35] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[24:27], s[34:35] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[0:3], s[34:35]
-; GFX1250-SDAG-NEXT: global_store_b128 v34, v[28:31], s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v32_vs:
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
index 9ab177cf2b125..44fe55ef6e9ba 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
@@ -1,9 +1,10 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX8,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx810 --check-prefixes=ALL,GCN,GFX8,PREGFX10,NOWGP,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX10,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX11,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx1200 --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GCN,NONAMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=ALL,GFX90A,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx90a --check-prefixes=ALL,GFX90A,PREGFX10,NOWGP,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1250 -show-encoding %s 2>&1 >/dev/null | FileCheck %s -DMCPU=gfx1250 --check-prefixes=ALL,GCN,GFX10PLUS,GFX12,NOWGP,AMDHSA
.text
@@ -11,7 +12,7 @@
// GFX8-NOT: error:
// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+
// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100
-// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1200
+// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--[[MCPU]]
// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810
.warning "test_target"
.amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+"
@@ -176,8 +177,7 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_workgroup_processor_mode
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: .amdhsa_next_free_vgpr directive is required
+// NOWGP: error: directive unsupported on [[MCPU]]
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_workgroup_processor_mode"
.amdhsa_kernel test_amdhsa_workgroup_processor_mode
@@ -185,8 +185,7 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_workgroup_processor_mode_invalid
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: value out of range
+// NOWGP: error: directive unsupported on [[MCPU]]
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_workgroup_processor_mode_invalid"
.amdhsa_kernel test_amdhsa_workgroup_processor_mode_invalid
@@ -290,6 +289,24 @@
.amdhsa_inst_pref_size 15
.end_amdhsa_kernel
+// GCN-LABEL: warning: test_amdhsa_dx10_clamp_bit
+// GFX12: error: directive unsupported on gfx12+
+.warning "test_amdhsa_dx10_clamp_bit"
+.amdhsa_kernel test_amdhsa_dx10_clamp_bit
+ .amdhsa_next_free_vgpr 32
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_dx10_clamp 1
+.end_amdhsa_kernel
+
+// GCN-LABEL: warning: test_amdhsa_ieee_mode_bit
+// GFX12: error: directive unsupported on gfx12+
+.warning "test_amdhsa_ieee_mode_bit"
+.amdhsa_kernel test_amdhsa_ieee_mode_bit
+ .amdhsa_next_free_vgpr 32
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_ieee_mode 1
+.end_amdhsa_kernel
+
// GCN-LABEL: warning: test_next_free_vgpr_invalid
// AMDHSA: error: .amdgcn.next_free_{v,s}gpr symbols must be absolute expressions
// NONAMDHSA-NOT: error:
>From b60e366721dc9a0fcc4edfdc1d7dc7fa5d1c113e Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao at microsoft.com>
Date: Thu, 14 Aug 2025 15:29:20 -0700
Subject: [PATCH 32/53] [win][arm64ec] XFAIL x64 intrinsic tests on Arm64EC
(#153474)
Clang defines the x64 preprocessor macro (`__x86_64__`) when building
Arm64EC; however, the tests for x64 built-ins and intrinsics currently
fail because the relevant functions don't exist, resulting in errors
like:
```
Line 165: invalid conversion between vector type '__v2di' (vector of 2 'long long' values) and integer type 'int' of different size
```
(Clang doesn't recognize the intrinsics being called, so it treats them
like undeclared functions and assumes their return type is `int`.)
For now, expect these tests to fail until someone decides to implement
these intrinsics.
---
clang/test/Headers/mm3dnow.c | 3 +++
clang/test/Headers/pmmintrin.c | 3 +++
clang/test/Headers/x86-intrinsics-headers.c | 3 +++
clang/test/Headers/x86intrin.c | 3 +++
clang/test/Headers/x86intrin.cpp | 3 +++
5 files changed, 15 insertions(+)
diff --git a/clang/test/Headers/mm3dnow.c b/clang/test/Headers/mm3dnow.c
index a9b6dd88f8034..e45acb12ddbde 100644
--- a/clang/test/Headers/mm3dnow.c
+++ b/clang/test/Headers/mm3dnow.c
@@ -2,6 +2,9 @@
// RUN: %clang_cc1 -fsyntax-only -D_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS -ffreestanding %s -verify
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify
+// XFAIL: target=arm64ec-pc-windows-msvc
+// These intrinsics are not yet implemented for Arm64EC.
+
#if defined(i386) || defined(__x86_64__)
#ifndef _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS
// expected-warning at mm3dnow.h:*{{The <mm3dnow.h> header is deprecated}}
diff --git a/clang/test/Headers/pmmintrin.c b/clang/test/Headers/pmmintrin.c
index 5b7a3a4ef6b9b..776ef75d70c54 100644
--- a/clang/test/Headers/pmmintrin.c
+++ b/clang/test/Headers/pmmintrin.c
@@ -2,6 +2,9 @@
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify
// expected-no-diagnostics
+// XFAIL: target=arm64ec-pc-windows-msvc
+// These intrinsics are not yet implemented for Arm64EC.
+
#if defined(i386) || defined(__x86_64__)
#include <pmmintrin.h>
diff --git a/clang/test/Headers/x86-intrinsics-headers.c b/clang/test/Headers/x86-intrinsics-headers.c
index 59ca354e1160b..dc06cbde0f587 100644
--- a/clang/test/Headers/x86-intrinsics-headers.c
+++ b/clang/test/Headers/x86-intrinsics-headers.c
@@ -2,6 +2,9 @@
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -flax-vector-conversions=none %s
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s
+// XFAIL: target=arm64ec-pc-windows-msvc
+// These intrinsics are not yet implemented for Arm64EC.
+
#if defined(i386) || defined(__x86_64__)
#ifdef __SSE4_2__
diff --git a/clang/test/Headers/x86intrin.c b/clang/test/Headers/x86intrin.c
index 53e369559f408..c01af1a43d1fa 100644
--- a/clang/test/Headers/x86intrin.c
+++ b/clang/test/Headers/x86intrin.c
@@ -3,6 +3,9 @@
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify
// expected-no-diagnostics
+// XFAIL: target=arm64ec-pc-windows-msvc
+// These intrinsics are not yet implemented for Arm64EC.
+
#if defined(i386) || defined(__x86_64__)
// Include the metaheader that includes all x86 intrinsic headers.
diff --git a/clang/test/Headers/x86intrin.cpp b/clang/test/Headers/x86intrin.cpp
index 11d442db3d2c4..6c9baa6fff635 100644
--- a/clang/test/Headers/x86intrin.cpp
+++ b/clang/test/Headers/x86intrin.cpp
@@ -1,6 +1,9 @@
// RUN: %clang_cc1 -fsyntax-only -ffreestanding %s -verify
// expected-no-diagnostics
+// XFAIL: target=arm64ec-pc-windows-msvc
+// These intrinsics are not yet implemented for Arm64EC.
+
#if defined(i386) || defined(__x86_64__)
// Include the metaheader that includes all x86 intrinsic headers.
>From 269c8b5d983ffe1e3d372feb71dc5dc510e9eb8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
=?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Thu, 14 Aug 2025 15:35:02 -0700
Subject: [PATCH 33/53] [flang][cuda] Add interfaces for __ll2float_rX
(#153694)
---
flang/module/cudadevice.f90 | 28 ++++++++++++++++++++++++
flang/test/Lower/CUDA/cuda-libdevice.cuf | 15 +++++++++++++
2 files changed, 43 insertions(+)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index a8b1d19a4c1ff..ac97a5d27dbd2 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -681,6 +681,34 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __ll2float_rd
+ attributes(device) real function __ll2float_rd(i) bind(c, name='__nv_ll2float_rd')
+ !dir$ ignore_tkr (d) i
+ integer(8), value :: i
+ end function
+ end interface
+
+ interface __ll2float_rn
+ attributes(device) real function __ll2float_rn(i) bind(c, name='__nv_ll2float_rn')
+ !dir$ ignore_tkr (d) i
+ integer(8), value :: i
+ end function
+ end interface
+
+ interface __ll2float_ru
+ attributes(device) real function __ll2float_ru(i) bind(c, name='__nv_ll2float_ru')
+ !dir$ ignore_tkr (d) i
+ integer(8), value :: i
+ end function
+ end interface
+
+ interface __ll2float_rz
+ attributes(device) real function __ll2float_rz(i) bind(c, name='__nv_ll2float_rz')
+ !dir$ ignore_tkr (d) i
+ integer(8), value :: i
+ end function
+ end interface
+
interface __ll2double_rd
attributes(device) double precision function __ll2double_rd(i) bind(c, name='__nv_ll2double_rd')
!dir$ ignore_tkr (d) i
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index 447e09cad747b..c381485ec1761 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -161,3 +161,18 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+
+attributes(global) subroutine test_ll2float_rX()
+ real :: res
+ integer(8) :: i
+ res = __ll2float_rd(i)
+ res = __ll2float_rn(i)
+ res = __ll2float_ru(i)
+ res = __ll2float_rz(i)
+end subroutine
+
+! CHECK-LABEL: _QPtest_ll2float_rx
+! CHECK: %{{.*}} = fir.call @__nv_ll2float_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_ll2float_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_ll2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_ll2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
>From 9486c6d1ce6fe178d08a51c9581b57dda7215ea4 Mon Sep 17 00:00:00 2001
From: joaosaffran <126493771+joaosaffran at users.noreply.github.com>
Date: Thu, 14 Aug 2025 18:40:11 -0400
Subject: [PATCH 34/53] [DirectX] Add Range Overlap validation (#152229)
As part of the Root Signature Spec, we need to validate that Root
Signatures do not define overlapping ranges.
Closes: https://github.com/llvm/llvm-project/issues/126645
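In outline, an overlap check like this can be done by sorting the bound
register ranges and comparing each range with its predecessor. The sketch
below is a hypothetical illustration only, not the code added by this
patch; the struct, its field names, and the `hasOverlap` helper are all
invented for the example:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in for a descriptor range bound by a root signature. */
struct Range {
  uint32_t Class; /* resource class (SRV/UAV/CBV/Sampler), as an ordinal */
  uint32_t Lower; /* first register bound by the range */
  uint32_t Upper; /* last register bound by the range (inclusive) */
};

/* Order by class, then by lower bound, so that after sorting any
 * overlap must occur between adjacent ranges of the same class. */
static int cmpRange(const void *A, const void *B) {
  const struct Range *RA = A, *RB = B;
  if (RA->Class != RB->Class)
    return RA->Class < RB->Class ? -1 : 1;
  if (RA->Lower != RB->Lower)
    return RA->Lower < RB->Lower ? -1 : 1;
  return 0;
}

/* Returns true if any two ranges of the same resource class overlap. */
bool hasOverlap(struct Range *R, size_t N) {
  qsort(R, N, sizeof *R, cmpRange);
  for (size_t I = 1; I < N; ++I)
    if (R[I].Class == R[I - 1].Class && R[I].Lower <= R[I - 1].Upper)
      return true;
  return false;
}
```

Sorting first makes the check linear after the sort, instead of comparing
every pair of ranges.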
---------
Co-authored-by: joaosaffran <joao.saffran at microsoft.com>
Co-authored-by: Joao Saffran <{ID}+{username}@users.noreply.github.com>
Co-authored-by: Joao Saffran <jderezende at microsoft.com>
---
llvm/include/llvm/BinaryFormat/DXContainer.h | 2 -
llvm/include/llvm/Support/DXILABI.h | 6 +
llvm/lib/Analysis/DXILResource.cpp | 15 +-
llvm/lib/BinaryFormat/DXContainer.cpp | 11 --
llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp | 2 +-
.../Frontend/HLSL/RootSignatureMetadata.cpp | 4 +-
llvm/lib/Support/CMakeLists.txt | 1 +
llvm/lib/Support/DXILABI.cpp | 34 ++++
.../lib/Target/DirectX/DXContainerGlobals.cpp | 3 +-
llvm/lib/Target/DirectX/DXILOpLowering.cpp | 3 +
.../DXILPostOptimizationValidation.cpp | 174 +++++++++++++++++-
llvm/lib/Target/DirectX/DXILRootSignature.h | 8 +-
.../DXILResource/buffer-frombinding.ll | 4 +-
llvm/test/CodeGen/DirectX/llc-pipeline.ll | 2 +-
...signature-validation-fail-cbuffer-range.ll | 15 ++
...-validation-fail-descriptor-table-range.ll | 16 ++
...e-validation-fail-root-descriptor-range.ll | 15 ++
.../rootsignature-validation-fail-sampler.ll | 15 ++
...re-validation-fail-static-sampler-range.ll | 14 ++
.../DirectX/rootsignature-validation.ll | 20 ++
20 files changed, 319 insertions(+), 45 deletions(-)
create mode 100644 llvm/lib/Support/DXILABI.cpp
create mode 100644 llvm/test/CodeGen/DirectX/rootsignature-validation-fail-cbuffer-range.ll
create mode 100644 llvm/test/CodeGen/DirectX/rootsignature-validation-fail-descriptor-table-range.ll
create mode 100644 llvm/test/CodeGen/DirectX/rootsignature-validation-fail-root-descriptor-range.ll
create mode 100644 llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll
create mode 100644 llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll
create mode 100644 llvm/test/CodeGen/DirectX/rootsignature-validation.ll
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index cc4af3d9be8d7..f74c9775cb3f3 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -158,8 +158,6 @@ enum class FeatureFlags : uint64_t {
static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63,
"Shader flag bits exceed enum size.");
-LLVM_ABI ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> getResourceClasses();
-
#define ROOT_SIGNATURE_FLAG(Num, Val) Val = Num,
enum class RootFlags : uint32_t {
#include "DXContainerConstants.def"
diff --git a/llvm/include/llvm/Support/DXILABI.h b/llvm/include/llvm/Support/DXILABI.h
index b479f7c73eba3..2dcdd73415be2 100644
--- a/llvm/include/llvm/Support/DXILABI.h
+++ b/llvm/include/llvm/Support/DXILABI.h
@@ -17,6 +17,8 @@
#ifndef LLVM_SUPPORT_DXILABI_H
#define LLVM_SUPPORT_DXILABI_H
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ScopedPrinter.h"
#include <cstdint>
namespace llvm {
@@ -99,6 +101,10 @@ enum class SamplerFeedbackType : uint32_t {
const unsigned MinWaveSize = 4;
const unsigned MaxWaveSize = 128;
+LLVM_ABI ArrayRef<EnumEntry<ResourceClass>> getResourceClasses();
+
+LLVM_ABI StringRef getResourceClassName(ResourceClass RC);
+
} // namespace dxil
} // namespace llvm
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 109c0568a6f9e..3a70666029248 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/DXILABI.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <optional>
@@ -29,20 +30,6 @@
using namespace llvm;
using namespace dxil;
-static StringRef getResourceClassName(ResourceClass RC) {
- switch (RC) {
- case ResourceClass::SRV:
- return "SRV";
- case ResourceClass::UAV:
- return "UAV";
- case ResourceClass::CBuffer:
- return "CBuffer";
- case ResourceClass::Sampler:
- return "Sampler";
- }
- llvm_unreachable("Unhandled ResourceClass");
-}
-
static StringRef getResourceKindName(ResourceKind RK) {
switch (RK) {
case ResourceKind::Texture1D:
diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp
index eb83945c9c42f..36d10d0b63078 100644
--- a/llvm/lib/BinaryFormat/DXContainer.cpp
+++ b/llvm/lib/BinaryFormat/DXContainer.cpp
@@ -60,17 +60,6 @@ ArrayRef<EnumEntry<SigComponentType>> dxbc::getSigComponentTypes() {
return ArrayRef(SigComponentTypes);
}
-static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
- {"SRV", llvm::dxil::ResourceClass::SRV},
- {"UAV", llvm::dxil::ResourceClass::UAV},
- {"CBV", llvm::dxil::ResourceClass::CBuffer},
- {"Sampler", llvm::dxil::ResourceClass::Sampler},
-};
-
-ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxbc::getResourceClasses() {
- return ArrayRef(ResourceClassNames);
-}
-
static const EnumEntry<RootFlags> RootFlagNames[] = {
#define ROOT_SIGNATURE_FLAG(Val, Enum) {#Enum, RootFlags::Enum},
#include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
index 574883e0d7fd7..050cc46e8c9b0 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp
@@ -94,7 +94,7 @@ static raw_ostream &operator<<(raw_ostream &OS,
static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)),
- dxbc::getResourceClasses());
+ dxil::getResourceClasses());
return OS;
}
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 1cda3080442b2..157bfc665b207 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -121,7 +121,7 @@ MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
IRBuilder<> Builder(Ctx);
StringRef ResName =
enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)),
- dxbc::getResourceClasses());
+ dxil::getResourceClasses());
assert(!ResName.empty() && "Provided an invalid Resource Class");
SmallString<7> Name({"Root", ResName});
Metadata *Operands[] = {
@@ -163,7 +163,7 @@ MDNode *MetadataBuilder::BuildDescriptorTableClause(
IRBuilder<> Builder(Ctx);
StringRef ResName =
enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)),
- dxbc::getResourceClasses());
+ dxil::getResourceClasses());
assert(!ResName.empty() && "Provided an invalid Resource Class");
Metadata *Operands[] = {
MDString::get(Ctx, ResName),
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 10b6101d73277..b7578dd580072 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -182,6 +182,7 @@ add_llvm_component_library(LLVMSupport
DivisionByConstantInfo.cpp
DAGDeltaAlgorithm.cpp
DJB.cpp
+ DXILABI.cpp
DynamicAPInt.cpp
ELFAttributes.cpp
ELFAttrParserCompact.cpp
diff --git a/llvm/lib/Support/DXILABI.cpp b/llvm/lib/Support/DXILABI.cpp
new file mode 100644
index 0000000000000..261fe1ef98278
--- /dev/null
+++ b/llvm/lib/Support/DXILABI.cpp
@@ -0,0 +1,34 @@
+//===-- DXILABI.cpp - ABI Sensitive Values for DXIL -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions of various constants and enums that are
+// required to remain stable as per the DXIL format's requirements.
+//
+// Documentation for DXIL can be found in
+// https://github.com/Microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DXILABI.h"
+#include "llvm/Support/ScopedPrinter.h"
+using namespace llvm;
+
+static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
+ {"SRV", llvm::dxil::ResourceClass::SRV},
+ {"UAV", llvm::dxil::ResourceClass::UAV},
+ {"CBV", llvm::dxil::ResourceClass::CBuffer},
+ {"Sampler", llvm::dxil::ResourceClass::Sampler},
+};
+
+ArrayRef<EnumEntry<llvm::dxil::ResourceClass>> dxil::getResourceClasses() {
+ return ArrayRef(ResourceClassNames);
+}
+
+StringRef dxil::getResourceClassName(dxil::ResourceClass RC) {
+ return enumToStringRef(RC, getResourceClasses());
+}
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 26a113d2d5260..a1ef2578f00aa 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -162,8 +162,7 @@ void DXContainerGlobals::addRootSignature(Module &M,
auto &RSA = getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
const Function *EntryFunction = MMI.EntryPropertyVec[0].Entry;
- const std::optional<mcdxbc::RootSignatureDesc> &RS =
- RSA.getDescForFunction(EntryFunction);
+ const mcdxbc::RootSignatureDesc *RS = RSA.getDescForFunction(EntryFunction);
if (!RS)
return;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index c10a1f5c7e0d7..bd421771e8edb 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -9,6 +9,7 @@
#include "DXILOpLowering.h"
#include "DXILConstants.h"
#include "DXILOpBuilder.h"
+#include "DXILRootSignature.h"
#include "DXILShaderFlags.h"
#include "DirectX.h"
#include "llvm/ADT/SmallVector.h"
@@ -918,6 +919,7 @@ PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &MAM) {
PA.preserve<DXILResourceAnalysis>();
PA.preserve<DXILMetadataAnalysis>();
PA.preserve<ShaderFlagsAnalysis>();
+ PA.preserve<RootSignatureAnalysis>();
return PA;
}
@@ -945,6 +947,7 @@ class DXILOpLoweringLegacy : public ModulePass {
AU.addPreserved<DXILResourceWrapperPass>();
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
}
};
char DXILOpLoweringLegacy::ID = 0;
diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
index 9d995bc70a997..be2c7d1ddff3f 100644
--- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
+++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "DXILPostOptimizationValidation.h"
+#include "DXILRootSignature.h"
#include "DXILShaderFlags.h"
#include "DirectX.h"
#include "llvm/ADT/SmallString.h"
@@ -17,13 +18,44 @@
#include "llvm/IR/IntrinsicsDirectX.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/DXILABI.h"
#define DEBUG_TYPE "dxil-post-optimization-validation"
using namespace llvm;
using namespace llvm::dxil;
-namespace {
+static ResourceClass toResourceClass(dxbc::DescriptorRangeType RangeType) {
+ using namespace dxbc;
+ switch (RangeType) {
+ case DescriptorRangeType::SRV:
+ return ResourceClass::SRV;
+ case DescriptorRangeType::UAV:
+ return ResourceClass::UAV;
+ case DescriptorRangeType::CBV:
+ return ResourceClass::CBuffer;
+ case DescriptorRangeType::Sampler:
+ return ResourceClass::Sampler;
+ }
+ llvm_unreachable("Unknown DescriptorRangeType");
+}
+
+static ResourceClass toResourceClass(dxbc::RootParameterType Type) {
+ using namespace dxbc;
+ switch (Type) {
+ case RootParameterType::Constants32Bit:
+ return ResourceClass::CBuffer;
+ case RootParameterType::SRV:
+ return ResourceClass::SRV;
+ case RootParameterType::UAV:
+ return ResourceClass::UAV;
+ case RootParameterType::CBV:
+ return ResourceClass::CBuffer;
+  case RootParameterType::DescriptorTable:
+ llvm_unreachable("DescriptorTable is not convertible to ResourceClass");
+ }
+ llvm_unreachable("Unknown RootParameterType");
+}
static void reportInvalidDirection(Module &M, DXILResourceMap &DRM) {
for (const auto &UAV : DRM.uavs()) {
@@ -86,8 +118,125 @@ static void reportOverlappingBinding(Module &M, DXILResourceMap &DRM) {
"true, yet no overlapping binding was found");
}
+static void
+reportOverlappingRegisters(Module &M,
+ const llvm::hlsl::BindingInfoBuilder::Binding &R1,
+ const llvm::hlsl::BindingInfoBuilder::Binding &R2) {
+ SmallString<128> Message;
+
+ raw_svector_ostream OS(Message);
+ OS << "resource " << getResourceClassName(R1.RC) << " (space=" << R1.Space
+ << ", registers=[" << R1.LowerBound << ", " << R1.UpperBound
+ << "]) overlaps with resource " << getResourceClassName(R2.RC)
+ << " (space=" << R2.Space << ", registers=[" << R2.LowerBound << ", "
+ << R2.UpperBound << "])";
+ M.getContext().diagnose(DiagnosticInfoGeneric(Message));
+}
+
+static dxbc::ShaderVisibility
+tripleToVisibility(llvm::Triple::EnvironmentType ET) {
+ switch (ET) {
+ case Triple::Pixel:
+ return dxbc::ShaderVisibility::Pixel;
+ case Triple::Vertex:
+ return dxbc::ShaderVisibility::Vertex;
+ case Triple::Geometry:
+ return dxbc::ShaderVisibility::Geometry;
+ case Triple::Hull:
+ return dxbc::ShaderVisibility::Hull;
+ case Triple::Domain:
+ return dxbc::ShaderVisibility::Domain;
+ case Triple::Mesh:
+ return dxbc::ShaderVisibility::Mesh;
+ case Triple::Compute:
+ return dxbc::ShaderVisibility::All;
+ default:
+ llvm_unreachable("Invalid triple to shader stage conversion");
+ }
+}
+
+static void validateRootSignature(Module &M,
+ const mcdxbc::RootSignatureDesc &RSD,
+ dxil::ModuleMetadataInfo &MMI) {
+
+ hlsl::BindingInfoBuilder Builder;
+ dxbc::ShaderVisibility Visibility = tripleToVisibility(MMI.ShaderProfile);
+
+ for (const mcdxbc::RootParameterInfo &ParamInfo : RSD.ParametersContainer) {
+ dxbc::ShaderVisibility ParamVisibility =
+ static_cast<dxbc::ShaderVisibility>(ParamInfo.Header.ShaderVisibility);
+ if (ParamVisibility != dxbc::ShaderVisibility::All &&
+ ParamVisibility != Visibility)
+ continue;
+ dxbc::RootParameterType ParamType =
+ static_cast<dxbc::RootParameterType>(ParamInfo.Header.ParameterType);
+ switch (ParamType) {
+ case dxbc::RootParameterType::Constants32Bit: {
+ dxbc::RTS0::v1::RootConstants Const =
+ RSD.ParametersContainer.getConstant(ParamInfo.Location);
+ Builder.trackBinding(dxil::ResourceClass::CBuffer, Const.RegisterSpace,
+ Const.ShaderRegister, Const.ShaderRegister,
+ &ParamInfo);
+ break;
+ }
+
+ case dxbc::RootParameterType::SRV:
+ case dxbc::RootParameterType::UAV:
+ case dxbc::RootParameterType::CBV: {
+ dxbc::RTS0::v2::RootDescriptor Desc =
+ RSD.ParametersContainer.getRootDescriptor(ParamInfo.Location);
+      Builder.trackBinding(toResourceClass(ParamType), Desc.RegisterSpace,
+                           Desc.ShaderRegister, Desc.ShaderRegister,
+                           &ParamInfo);
+
+ break;
+ }
+ case dxbc::RootParameterType::DescriptorTable: {
+ const mcdxbc::DescriptorTable &Table =
+ RSD.ParametersContainer.getDescriptorTable(ParamInfo.Location);
+
+ for (const dxbc::RTS0::v2::DescriptorRange &Range : Table.Ranges) {
+ uint32_t UpperBound =
+ Range.NumDescriptors == ~0U
+ ? Range.BaseShaderRegister
+ : Range.BaseShaderRegister + Range.NumDescriptors - 1;
+ Builder.trackBinding(
+ toResourceClass(
+ static_cast<dxbc::DescriptorRangeType>(Range.RangeType)),
+ Range.RegisterSpace, Range.BaseShaderRegister, UpperBound,
+ &ParamInfo);
+ }
+ break;
+ }
+ }
+ }
+
+ for (const dxbc::RTS0::v1::StaticSampler &S : RSD.StaticSamplers)
+ Builder.trackBinding(dxil::ResourceClass::Sampler, S.RegisterSpace,
+ S.ShaderRegister, S.ShaderRegister, &S);
+
+ Builder.calculateBindingInfo(
+ [&M](const llvm::hlsl::BindingInfoBuilder &Builder,
+ const llvm::hlsl::BindingInfoBuilder::Binding &ReportedBinding) {
+      const llvm::hlsl::BindingInfoBuilder::Binding &Overlapping =
+          Builder.findOverlapping(ReportedBinding);
+      reportOverlappingRegisters(M, ReportedBinding, Overlapping);
+ });
+}
+
+static mcdxbc::RootSignatureDesc *
+getRootSignature(RootSignatureBindingInfo &RSBI,
+ dxil::ModuleMetadataInfo &MMI) {
+  if (MMI.EntryPropertyVec.empty())
+ return nullptr;
+ return RSBI.getDescForFunction(MMI.EntryPropertyVec[0].Entry);
+}
+
static void reportErrors(Module &M, DXILResourceMap &DRM,
- DXILResourceBindingInfo &DRBI) {
+ DXILResourceBindingInfo &DRBI,
+ RootSignatureBindingInfo &RSBI,
+ dxil::ModuleMetadataInfo &MMI) {
if (DRM.hasInvalidCounterDirection())
reportInvalidDirection(M, DRM);
@@ -96,14 +245,19 @@ static void reportErrors(Module &M, DXILResourceMap &DRM,
assert(!DRBI.hasImplicitBinding() && "implicit bindings should be handled in "
"DXILResourceImplicitBinding pass");
+
+ if (mcdxbc::RootSignatureDesc *RSD = getRootSignature(RSBI, MMI))
+ validateRootSignature(M, *RSD, MMI);
}
-} // namespace
PreservedAnalyses
DXILPostOptimizationValidation::run(Module &M, ModuleAnalysisManager &MAM) {
DXILResourceMap &DRM = MAM.getResult<DXILResourceAnalysis>(M);
DXILResourceBindingInfo &DRBI = MAM.getResult<DXILResourceBindingAnalysis>(M);
- reportErrors(M, DRM, DRBI);
+ RootSignatureBindingInfo &RSBI = MAM.getResult<RootSignatureAnalysis>(M);
+ ModuleMetadataInfo &MMI = MAM.getResult<DXILMetadataAnalysis>(M);
+
+ reportErrors(M, DRM, DRBI, RSBI, MMI);
return PreservedAnalyses::all();
}
@@ -115,7 +269,12 @@ class DXILPostOptimizationValidationLegacy : public ModulePass {
getAnalysis<DXILResourceWrapperPass>().getResourceMap();
DXILResourceBindingInfo &DRBI =
getAnalysis<DXILResourceBindingWrapperPass>().getBindingInfo();
- reportErrors(M, DRM, DRBI);
+ RootSignatureBindingInfo &RSBI =
+ getAnalysis<RootSignatureAnalysisWrapper>().getRSInfo();
+ dxil::ModuleMetadataInfo &MMI =
+ getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
+
+ reportErrors(M, DRM, DRBI, RSBI, MMI);
return false;
}
StringRef getPassName() const override {
@@ -127,10 +286,13 @@ class DXILPostOptimizationValidationLegacy : public ModulePass {
void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
AU.addRequired<DXILResourceWrapperPass>();
AU.addRequired<DXILResourceBindingWrapperPass>();
+ AU.addRequired<DXILMetadataAnalysisWrapperPass>();
+ AU.addRequired<RootSignatureAnalysisWrapper>();
AU.addPreserved<DXILResourceWrapperPass>();
AU.addPreserved<DXILResourceBindingWrapperPass>();
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
}
};
char DXILPostOptimizationValidationLegacy::ID = 0;
@@ -141,6 +303,8 @@ INITIALIZE_PASS_BEGIN(DXILPostOptimizationValidationLegacy, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)
INITIALIZE_PASS_END(DXILPostOptimizationValidationLegacy, DEBUG_TYPE,
"DXIL Post Optimization Validation", false, false)
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index 254b7ff504633..b990b6c7410ac 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -43,13 +43,11 @@ class RootSignatureBindingInfo {
iterator end() { return FuncToRsMap.end(); }
- std::optional<mcdxbc::RootSignatureDesc>
- getDescForFunction(const Function *F) {
+ mcdxbc::RootSignatureDesc *getDescForFunction(const Function *F) {
const auto FuncRs = find(F);
if (FuncRs == end())
- return std::nullopt;
-
- return FuncRs->second;
+ return nullptr;
+ return &FuncRs->second;
}
};
diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
index 2623e6f4d44f1..d08b68d3768af 100644
--- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
+++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll
@@ -146,7 +146,7 @@ define void @test_typedbuffer() {
; CHECK: Space: 1
; CHECK: Lower Bound: 0
; CHECK: Size: 1
- ; CHECK: Class: CBuffer
+ ; CHECK: Class: CBV
; CHECK: Kind: CBuffer
; CHECK: CBuffer size: 4
@@ -159,7 +159,7 @@ define void @test_typedbuffer() {
; CHECK: Space: 1
; CHECK: Lower Bound: 8
; CHECK: Size: 1
- ; CHECK: Class: CBuffer
+ ; CHECK: Class: CBV
; CHECK: Kind: CBuffer
; CHECK: CBuffer size: 4
diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
index 360a6f6959e9f..5c81ba42ffcc8 100644
--- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll
+++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll
@@ -40,9 +40,9 @@
; CHECK-NEXT: DXIL Module Metadata analysis
; CHECK-NEXT: DXIL Shader Flag Analysis
; CHECK-NEXT: DXIL Translate Metadata
+; CHECK-NEXT: DXIL Root Signature Analysis
; CHECK-NEXT: DXIL Post Optimization Validation
; CHECK-NEXT: DXIL Op Lowering
-; CHECK-NEXT: DXIL Root Signature Analysis
; CHECK-NEXT: DXIL Prepare Module
; CHECK-ASM-NEXT: DXIL Metadata Pretty Printer
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-cbuffer-range.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-cbuffer-range.ll
new file mode 100644
index 0000000000000..e420225229919
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-cbuffer-range.ll
@@ -0,0 +1,15 @@
+; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s
+; CHECK: error: resource CBV (space=0, registers=[2, 2]) overlaps with resource CBV (space=0, registers=[0, 2])
+
+define void @CSMain() "hlsl.shader"="compute" {
+entry:
+ ret void
+}
+
+!dx.rootsignatures = !{!0}
+
+!0 = !{ptr @CSMain, !1, i32 2}
+!1 = !{!2, !3}
+!2 = !{!"RootConstants", i32 0, i32 2, i32 0, i32 4}
+!3 = !{!"DescriptorTable", i32 0, !4}
+!4 = !{!"CBV", i32 3, i32 0, i32 0, i32 -1, i32 4}
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-descriptor-table-range.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-descriptor-table-range.ll
new file mode 100644
index 0000000000000..037f8c71f8ef0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-descriptor-table-range.ll
@@ -0,0 +1,16 @@
+; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s
+; CHECK: error: resource UAV (space=10, registers=[4294967295, 4294967295]) overlaps with resource UAV (space=10, registers=[4294967295, 4294967295])
+
+define void @CSMain() "hlsl.shader"="compute" {
+entry:
+ ret void
+}
+
+!dx.rootsignatures = !{!0}
+
+!0 = !{ptr @CSMain, !1, i32 2}
+!1 = !{!2, !4}
+!2 = !{!"DescriptorTable", i32 0, !3}
+!3 = !{!"UAV", i32 -1, i32 -1, i32 10, i32 -1, i32 2}
+!4 = !{!"DescriptorTable", i32 0, !5}
+!5 = !{ !"UAV", i32 -1, i32 -1, i32 10, i32 5, i32 2 }
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-root-descriptor-range.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-root-descriptor-range.ll
new file mode 100644
index 0000000000000..7098efbb43f6a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-root-descriptor-range.ll
@@ -0,0 +1,15 @@
+; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s
+; CHECK: error: resource UAV (space=1, registers=[3, 3]) overlaps with resource UAV (space=1, registers=[0, 3])
+
+define void @CSMain() "hlsl.shader"="compute" {
+entry:
+ ret void
+}
+
+!dx.rootsignatures = !{!0}
+
+!0 = !{ptr @CSMain, !1, i32 2}
+!1 = !{!2, !4}
+!2 = !{!"RootUAV", i32 0, i32 3, i32 1, i32 4}
+!4 = !{!"DescriptorTable", i32 0, !5}
+!5 = !{!"UAV", i32 4, i32 0, i32 1, i32 -1, i32 2}
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll
new file mode 100644
index 0000000000000..c244095520468
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll
@@ -0,0 +1,15 @@
+; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s
+; CHECK: error: resource Sampler (space=0, registers=[42, 42]) overlaps with resource Sampler (space=0, registers=[42, 42])
+
+define void @CSMain() "hlsl.shader"="compute" {
+entry:
+ ret void
+}
+
+!dx.rootsignatures = !{!0}
+
+!0 = !{ptr @CSMain, !1, i32 2}
+!1 = !{!2, !3}
+!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0 }
+!3 = !{!"DescriptorTable", i32 0, !4}
+!4 = !{!"Sampler", i32 1, i32 42, i32 0, i32 -1, i32 0}
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll
new file mode 100644
index 0000000000000..9ac02ebbc0965
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll
@@ -0,0 +1,14 @@
+; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s
+; CHECK: error: resource Sampler (space=0, registers=[42, 42]) overlaps with resource Sampler (space=0, registers=[42, 42])
+
+define void @CSMain() "hlsl.shader"="compute" {
+entry:
+ ret void
+}
+
+!dx.rootsignatures = !{!0}
+
+!0 = !{ptr @CSMain, !1, i32 2}
+!1 = !{!2, !3}
+!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0 }
+!3 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 }
diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation.ll
new file mode 100644
index 0000000000000..0fdba27018cd4
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rootsignature-validation.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s
+; This root signature is valid, so the pass should succeed with no diagnostics.
+
+define void @CSMain() "hlsl.shader"="compute" {
+entry:
+ ret void
+}
+
+!dx.rootsignatures = !{!0}
+
+!0 = !{ptr @CSMain, !1, i32 2}
+!1 = !{!2, !3, !5, !7, !9}
+!2 = !{!"RootCBV", i32 0, i32 3, i32 1, i32 4}
+!9 = !{!"RootConstants", i32 0, i32 2, i32 0, i32 4}
+!3 = !{!"DescriptorTable", i32 0, !4}
+!4 = !{!"SRV", i32 1, i32 0, i32 0, i32 -1, i32 0}
+!5 = !{!"DescriptorTable", i32 0, !6}
+!6 = !{!"Sampler", i32 5, i32 3, i32 2, i32 -1, i32 0}
+!7 = !{!"DescriptorTable", i32 0, !8}
+!8 = !{!"UAV", i32 -1, i32 0, i32 0, i32 -1, i32 2}
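The overlap diagnostics exercised by the tests above come from sorting each (resource class, register space) bucket of bindings and comparing adjacent register intervals. A minimal standalone sketch of that check follows; `Range` and `findFirstOverlap` are hypothetical names for illustration, not the LLVM `BindingInfoBuilder` API, and the (class, space) bucketing is flattened to one bucket for brevity:

```cpp
#include <algorithm>
#include <cassert>
#include <optional>
#include <utility>
#include <vector>

// A register interval [LowerBound, UpperBound], standing in for one binding
// within a single (resource class, register space) bucket.
struct Range {
  unsigned LowerBound;
  unsigned UpperBound;
};

// Sort by lower bound, then report the first adjacent pair whose intervals
// intersect. After sorting, any overlap must involve neighbors, so one linear
// sweep suffices.
static std::optional<std::pair<Range, Range>>
findFirstOverlap(std::vector<Range> Ranges) {
  std::sort(Ranges.begin(), Ranges.end(),
            [](const Range &A, const Range &B) {
              return A.LowerBound < B.LowerBound;
            });
  for (size_t I = 1; I < Ranges.size(); ++I)
    if (Ranges[I].LowerBound <= Ranges[I - 1].UpperBound)
      return std::make_pair(Ranges[I - 1], Ranges[I]);
  return std::nullopt;
}
```

For example, the cbuffer-range test above binds registers [0, 2] and [2, 2] in space 0; the sweep flags them because the second range starts inside the first.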
>From fcd4e9ebba668b72c53f26834e73a4b90b5a1a1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
=?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Thu, 14 Aug 2025 15:44:52 -0700
Subject: [PATCH 35/53] [flang][cuda] Add interfaces for __float2ll_rX
(#153702)
---
flang/module/cudadevice.f90 | 28 ++++++++++++++++++++++++
flang/test/Lower/CUDA/cuda-libdevice.cuf | 15 +++++++++++++
2 files changed, 43 insertions(+)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index ac97a5d27dbd2..b5a92c63f19ed 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -471,6 +471,34 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __float2ll_rd
+ attributes(device) integer(8) function __float2ll_rd(r) bind(c, name='__nv_float2ll_rd')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2ll_rn
+ attributes(device) integer(8) function __float2ll_rn(r) bind(c, name='__nv_float2ll_rn')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2ll_ru
+ attributes(device) integer(8) function __float2ll_ru(r) bind(c, name='__nv_float2ll_ru')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2ll_rz
+ attributes(device) integer(8) function __float2ll_rz(r) bind(c, name='__nv_float2ll_rz')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
interface __half2float
attributes(device) real function __half2float(i) bind(c, name='__nv_half2float')
!dir$ ignore_tkr (d) i
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index c381485ec1761..68fd443d39f57 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -162,6 +162,21 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+attributes(global) subroutine test_float2ll_rX()
+ integer(8) :: res
+ real :: r
+ res = __float2ll_rd(r)
+ res = __float2ll_rn(r)
+ res = __float2ll_ru(r)
+ res = __float2ll_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_float2ll_rx
+! CHECK: %{{.*}} = fir.call @__nv_float2ll_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_float2ll_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_float2ll_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i64
+! CHECK: %{{.*}} = fir.call @__nv_float2ll_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i64
+
attributes(global) subroutine test_ll2float_rX()
real :: res
integer(8) :: i
>From 557f40608540c76c74f138d741e13c9012af54de Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 15:54:33 -0700
Subject: [PATCH 36/53] [AMDGPU] Remove wave64 functions (#153690)
gfx1250 only supports wave32.
---
.../AMDGPURemoveIncompatibleFunctions.cpp | 10 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +
.../AMDGPU/pal-metadata-3.0.gfx1250.ll | 208 ++++++++++++++++++
.../remove-incompatible-wave64-feature.ll | 34 +++
4 files changed, 253 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index e2e5c57397d02..d2ec7dd429d8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
// Delete FeatureWavefrontSize32 functions for
// gfx9 and below targets that don't support the mode.
- // gfx10+ is implied to support both wave32 and 64 features.
+ // gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features.
// They are not in the feature set. So, we need a separate check
- if (ST->getGeneration() < AMDGPUSubtarget::GFX10 &&
- ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
+ if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32);
return true;
}
+  // gfx125x only supports FeatureWavefrontSize32.
+ if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) {
+ reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64);
+ return true;
+ }
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7ca7e8448c63d..5a631cb1b6d37 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1726,6 +1726,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+ bool supportsWave32() const { return getGeneration() >= GFX10; }
+
+ bool supportsWave64() const { return !hasGFX1250Insts(); }
+
bool isWave32() const {
return getWavefrontSize() == 32;
}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll
new file mode 100644
index 0000000000000..f934c85f68e0f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll
@@ -0,0 +1,208 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1250 <%s | FileCheck %s --check-prefixes=CHECK
+
+; CHECK-LABEL: {{^}}_amdgpu_cs_main:
+; CHECK: ; TotalNumSgprs: 4
+; CHECK: ; NumVgprs: 2
+; CHECK: .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT: - .api: Vulkan
+; CHECK-NEXT: .compute_registers:
+; CHECK-NEXT: .tg_size_en: true
+; CHECK-NEXT: .tgid_x_en: false
+; CHECK-NEXT: .tgid_y_en: false
+; CHECK-NEXT: .tgid_z_en: false
+; CHECK-NEXT: .tidig_comp_cnt: 0x1
+; CHECK-NEXT: .graphics_registers:
+; CHECK-NEXT: .ps_extra_lds_size: 0
+; CHECK-NEXT: .spi_ps_input_addr:
+; CHECK-NEXT: .ancillary_ena: false
+; CHECK-NEXT: .front_face_ena: true
+; CHECK-NEXT: .line_stipple_tex_ena: false
+; CHECK-NEXT: .linear_center_ena: true
+; CHECK-NEXT: .linear_centroid_ena: true
+; CHECK-NEXT: .linear_sample_ena: true
+; CHECK-NEXT: .persp_center_ena: true
+; CHECK-NEXT: .persp_centroid_ena: true
+; CHECK-NEXT: .persp_pull_model_ena: false
+; CHECK-NEXT: .persp_sample_ena: true
+; CHECK-NEXT: .pos_fixed_pt_ena: true
+; CHECK-NEXT: .pos_w_float_ena: false
+; CHECK-NEXT: .pos_x_float_ena: false
+; CHECK-NEXT: .pos_y_float_ena: false
+; CHECK-NEXT: .pos_z_float_ena: false
+; CHECK-NEXT: .sample_coverage_ena: false
+; CHECK-NEXT: .spi_ps_input_ena:
+; CHECK-NEXT: .ancillary_ena: false
+; CHECK-NEXT: .front_face_ena: false
+; CHECK-NEXT: .line_stipple_tex_ena: false
+; CHECK-NEXT: .linear_center_ena: false
+; CHECK-NEXT: .linear_centroid_ena: false
+; CHECK-NEXT: .linear_sample_ena: false
+; CHECK-NEXT: .persp_center_ena: false
+; CHECK-NEXT: .persp_centroid_ena: false
+; CHECK-NEXT: .persp_pull_model_ena: false
+; CHECK-NEXT: .persp_sample_ena: true
+; CHECK-NEXT: .pos_fixed_pt_ena: false
+; CHECK-NEXT: .pos_w_float_ena: false
+; CHECK-NEXT: .pos_x_float_ena: false
+; CHECK-NEXT: .pos_y_float_ena: false
+; CHECK-NEXT: .pos_z_float_ena: false
+; CHECK-NEXT: .sample_coverage_ena: false
+; CHECK-NEXT: .hardware_stages:
+; CHECK-NEXT: .cs:
+; CHECK-NEXT: .checksum_value: 0x9444d7d0
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_cs
+; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
+; CHECK-NEXT: .excp_en: 0
+; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
+; GFX11-NEXT: .ieee_mode: false
+; CHECK-NEXT: .image_op: false
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x4
+; CHECK-NEXT: .sgpr_limit: 0x6a
+; CHECK-NEXT: .threadgroup_dimensions:
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: - 0x400
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: .trap_present: false
+; CHECK-NEXT: .user_data_reg_map:
+; CHECK-NEXT: - 0x10000000
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: .user_sgprs: 0x3
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: .vgpr_limit: 0x100
+; CHECK-NEXT: .wavefront_size: 0x20
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .gs:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_gs
+; CHECK-NEXT: .entry_point_symbol: gs_shader
+; CHECK-NEXT: .forward_progress: true
+; CHECK-NEXT: .lds_size: 0x400
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .hs:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_hs
+; CHECK-NEXT: .entry_point_symbol: hs_shader
+; CHECK-NEXT: .forward_progress: true
+; CHECK-NEXT: .lds_size: 0x1000
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .ps:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_ps
+; CHECK-NEXT: .entry_point_symbol: ps_shader
+; CHECK-NEXT: .forward_progress: true
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: false
+; CHECK: .registers: {}
+; CHECK:amdpal.version:
+; CHECK-NEXT: - 0x3
+; CHECK-NEXT: - 0
+; CHECK-NEXT:...
+; CHECK-NEXT: .end_amdgpu_pal_metadata
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
+.entry:
+ %i = call i64 @llvm.amdgcn.s.getpc()
+ %i1 = and i64 %i, -4294967296
+ %i2 = zext i32 %arg1 to i64
+ %i3 = or i64 %i1, %i2
+ %i4 = inttoptr i64 %i3 to ptr addrspace(4)
+ %i5 = and i32 %arg2, 1023
+ %i6 = lshr i32 %arg2, 10
+ %i7 = and i32 %i6, 1023
+ %i8 = add nuw nsw i32 %i7, %i5
+ %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+ %.idx = shl nuw nsw i32 %i8, 2
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
+ ret void
+}
+
+define dllexport amdgpu_ps void @ps_shader() #1 {
+ ret void
+}
+
+ at LDS.GS = external addrspace(3) global [1 x i32], align 4
+
+define dllexport amdgpu_gs void @gs_shader() #2 {
+ %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
+ store i32 0, ptr addrspace(3) %ptr, align 4
+ ret void
+}
+
+ at LDS.HS = external addrspace(3) global [1024 x i32], align 4
+
+define dllexport amdgpu_hs void @hs_shader() #2 {
+ %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
+ store i32 0, ptr addrspace(3) %ptr, align 4
+ ret void
+}
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc() #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" }
+
+attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size \B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll
new file mode 100644
index 0000000000000..03dbfdcaf6ff1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32 -stop-after=amdgpu-remove-incompatible-functions\
+; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX1250 %s
+; RUN: FileCheck --check-prefix=WARN-GFX1250 %s < %t
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32 < %s
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
+; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX1200 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 < %s
+
+; WARN-GFX1250: removing function 'needs_wavefrontsize64': +wavefrontsize64 is not supported on the current target
+; WARN-GFX1250-NOT: not supported
+
+define void @needs_wavefrontsize64(ptr %out) #0 {
+; GFX1250-NOT: @needs_wavefrontsize64
+; GFX1200: define void @needs_wavefrontsize64(
+ %1 = tail call i64 @llvm.read_register.i64(metadata !0)
+ %2 = tail call i64 @llvm.ctpop.i64(i64 %1)
+ store i64 %2, ptr %out, align 4
+ ret void
+}
+
+define void @caller(ptr %out) {
+ ; GFX1250: call void null(
+ ; GFX1200: call void @needs_wavefrontsize64(
+ call void @needs_wavefrontsize64(ptr %out)
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata)
+declare i64 @llvm.ctpop.i64(i64)
+
+!0 = !{!"exec"}
+
+attributes #0 = { "target-features"="+wavefrontsize64" }
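The test above checks that `amdgpu-remove-incompatible-functions` drops any function whose `target-features` the subtarget cannot satisfy (and that remaining call sites degrade to `call void null(...)` on GFX1250, where `+wavefrontsize64` is unsupported). A toy Python sketch of that filtering step, purely for illustration and not the pass's actual implementation:

```python
def remove_incompatible(functions, supported_features):
    # Mirror of the pass's core idea: keep only functions whose required
    # target-features are all supported by the current subtarget.
    # `functions` maps a function name to its list of required features.
    kept = {}
    for name, required in functions.items():
        if all(feature in supported_features for feature in required):
            kept[name] = required
    return kept
```

On a wave32-only target, `needs_wavefrontsize64` would be filtered out while feature-free functions survive, matching the GFX1250-NOT / GFX1200 check lines.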
From 9e4d07ed5aed9089b3169b23b65328e471ad7426 Mon Sep 17 00:00:00 2001
From: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
Date: Thu, 14 Aug 2025 15:55:17 -0700
Subject: [PATCH 37/53] [flang][cuda] Add interface for __saturatef (#153705)
---
flang/module/cudadevice.f90 | 7 +++++++
flang/test/Lower/CUDA/cuda-libdevice.cuf | 9 +++++++++
2 files changed, 16 insertions(+)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index b5a92c63f19ed..598d154eb3d8a 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -325,6 +325,13 @@ attributes(device) real(8) function rsqrt(x) bind(c,name='__nv_rsqrt')
end function
end interface
+ interface saturate
+ attributes(device) real function __saturatef(r) bind(c, name='__nv_saturatef')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
interface __sad
attributes(device) integer function __sad(i,j,k) bind(c, name='__nv_sad')
!dir$ ignore_tkr (d) i, (d) j, (d) k
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index 68fd443d39f57..ecce60599a358 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -162,6 +162,15 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ull_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ull_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
+attributes(global) subroutine test_saturatef()
+ real :: res
+ real :: r
+ res = __saturatef(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_saturatef
+! CHECK: %{{.*}} = fir.call @__nv_saturatef(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> f32
+
attributes(global) subroutine test_float2ll_rX()
integer(8) :: res
real :: r
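The interface above binds Fortran `__saturatef` to the libdevice intrinsic `__nv_saturatef`, which clamps a single-precision value to [0.0, 1.0]. A minimal Python sketch of that semantics (assuming NaN maps to 0.0, as CUDA documents for `__saturatef`):

```python
import math

def saturate(x: float) -> float:
    # Clamp to the unit interval; NaN is mapped to 0.0.
    if math.isnan(x):
        return 0.0
    return min(max(x, 0.0), 1.0)
```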
From c86b8b16e583f7a87d814378b2588d36cb97a804 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Thu, 14 Aug 2025 22:56:30 +0000
Subject: [PATCH 38/53] [gn build] Port d56fa965243b
---
llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 0ea1c6fcee5ee..58ab4a55d4a56 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -72,6 +72,7 @@ static_library("Support") {
"CrashRecoveryContext.cpp",
"DAGDeltaAlgorithm.cpp",
"DJB.cpp",
+ "DXILABI.cpp",
"DataExtractor.cpp",
"Debug.cpp",
"DebugCounter.cpp",
From 8257eaa7e9568a413011801a7eef6ddc4cbcf59d Mon Sep 17 00:00:00 2001
From: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
Date: Thu, 14 Aug 2025 16:11:45 -0700
Subject: [PATCH 39/53] [flang][cuda] Add interfaces for __float2int_rX and
 __float2uint_rX (#153691)
---
flang/module/cudadevice.f90 | 56 ++++++++++++++++++++++++
flang/test/Lower/CUDA/cuda-libdevice.cuf | 30 ++++++++++++-
2 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 598d154eb3d8a..2a5fced6c02c2 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -478,6 +478,62 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __float2int_rd
+ attributes(device) integer function __float2int_rd(r) bind(c, name='__nv_float2int_rd')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2int_rn
+ attributes(device) integer function __float2int_rn(r) bind(c, name='__nv_float2int_rn')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2int_ru
+ attributes(device) integer function __float2int_ru(r) bind(c, name='__nv_float2int_ru')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2int_rz
+ attributes(device) integer function __float2int_rz(r) bind(c, name='__nv_float2int_rz')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2uint_rd
+ attributes(device) integer function __float2uint_rd(r) bind(c, name='__nv_float2uint_rd')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2uint_rn
+ attributes(device) integer function __float2uint_rn(r) bind(c, name='__nv_float2uint_rn')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2uint_ru
+ attributes(device) integer function __float2uint_ru(r) bind(c, name='__nv_float2uint_ru')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
+ interface __float2uint_rz
+ attributes(device) integer function __float2uint_rz(r) bind(c, name='__nv_float2uint_rz')
+ !dir$ ignore_tkr (d) r
+ real, value :: r
+ end function
+ end interface
+
interface __float2ll_rd
attributes(device) integer(8) function __float2ll_rd(r) bind(c, name='__nv_float2ll_rd')
!dir$ ignore_tkr (d) r
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index ecce60599a358..d54456f3df943 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -131,7 +131,6 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_double2ll_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
! CHECK: %{{.*}} = fir.call @__nv_double2ll_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i64
-
attributes(global) subroutine test_drcp_rX()
double precision :: res
double precision :: r
@@ -200,3 +199,32 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_ll2float_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
! CHECK: %{{.*}} = fir.call @__nv_ll2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
! CHECK: %{{.*}} = fir.call @__nv_ll2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
+attributes(global) subroutine test_float2int_rX()
+ integer :: res
+ real :: r
+ res = __float2int_rd(r)
+ res = __float2int_rn(r)
+ res = __float2int_ru(r)
+ res = __float2int_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_float2int_rx
+! CHECK: %{{.*}} = fir.call @__nv_float2int_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_float2int_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_float2int_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_float2int_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+
+attributes(global) subroutine test_float2uint_rX()
+ integer :: res
+ real :: r
+ res = __float2uint_rd(r)
+ res = __float2uint_rn(r)
+ res = __float2uint_ru(r)
+ res = __float2uint_rz(r)
+end subroutine
+
+! CHECK-LABEL: _QPtest_float2uint_rx
+! CHECK: %{{.*}} = fir.call @__nv_float2uint_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_float2uint_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_float2uint_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
+! CHECK: %{{.*}} = fir.call @__nv_float2uint_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32) -> i32
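The `_rd`/`_rn`/`_ru`/`_rz` suffixes in the interfaces above select the float-to-integer rounding mode: toward negative infinity, to nearest even, toward positive infinity, and toward zero, respectively. A hedged Python sketch of those four modes (an illustration of the naming convention, not the libdevice implementation; `float2int` is a hypothetical helper name):

```python
import math

def float2int(x: float, mode: str) -> int:
    # rd: round toward -inf; rn: round to nearest, ties to even;
    # ru: round toward +inf; rz: round toward zero (truncate).
    if mode == "rd":
        return math.floor(x)
    if mode == "rn":
        return round(x)  # Python's round() is round-half-to-even
    if mode == "ru":
        return math.ceil(x)
    if mode == "rz":
        return math.trunc(x)
    raise ValueError(f"unknown rounding mode: {mode}")
```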
From 2cd09ccd92f9c458d71a11e8f811f61a68138829 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 16:29:53 -0700
Subject: [PATCH 40/53] [AMDGPU] Enable kernarg preload on gfx1250 (#153686)
---
.../AMDGPU/AMDGPUPreloadKernArgProlog.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6 +
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll | 80 ++---
.../AMDGPU/preload-implicit-kernargs.ll | 183 ++++++++++
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 314 +++++++++++++++++-
5 files changed, 531 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index 40094518dce0a..90c4f4e6680c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
TRI(*ST.getRegisterInfo()) {}
bool AMDGPUPreloadKernArgProlog::run() {
- if (!ST.hasKernargPreload())
+ if (!ST.needsKernArgPreloadProlog())
return false;
unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 5a631cb1b6d37..f9f512bad13ad 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1577,6 +1577,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// extended VA to 57 bits.
bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+ // \returns true if the target needs to create a prolog for backward
+ // compatibility when preloading kernel arguments.
+ bool needsKernArgPreloadProlog() const {
+ return hasKernargPreload() && !GFX1250Insts;
+ }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
index 3a5507063b834..57967bc1650fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -27,7 +27,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s2
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -38,7 +38,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -49,7 +49,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -58,33 +58,21 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-REAL16: ; %bb.1:
-; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT: s_branch .LBB1_0
-; SDAG-REAL16-NEXT: .p2align 8
-; SDAG-REAL16-NEXT: ; %bb.2:
-; SDAG-REAL16-NEXT: .LBB1_0:
+; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-FAKE16: ; %bb.1:
-; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT: s_branch .LBB1_0
-; SDAG-FAKE16-NEXT: .p2align 8
-; SDAG-FAKE16-NEXT: ; %bb.2:
-; SDAG-FAKE16-NEXT: .LBB1_0:
+; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s8
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -95,7 +83,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -106,7 +94,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -120,7 +108,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -129,7 +117,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, 0x64
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -138,7 +126,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -147,7 +135,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, 0x64
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 100) #0
store i16 %cvt, ptr %out, align 2
@@ -163,7 +151,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -174,7 +162,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s2
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -185,7 +173,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -196,7 +184,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -205,33 +193,21 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-REAL16: ; %bb.1:
-; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT: s_branch .LBB4_0
-; SDAG-REAL16-NEXT: .p2align 8
-; SDAG-REAL16-NEXT: ; %bb.2:
-; SDAG-REAL16-NEXT: .LBB4_0:
+; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-FAKE16: ; %bb.1:
-; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT: s_branch .LBB4_0
-; SDAG-FAKE16-NEXT: .p2align 8
-; SDAG-FAKE16-NEXT: ; %bb.2:
-; SDAG-FAKE16-NEXT: .LBB4_0:
+; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s8
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -242,7 +218,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -253,7 +229,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -267,7 +243,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -276,7 +252,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, 0x64
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -285,7 +261,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -294,7 +270,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, 0x64
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 100) #0
store i16 %cvt, ptr %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 79b531e3ce785..c87f723086a41 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
; GFX942-LABEL: preload_block_count_x:
@@ -30,6 +31,12 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -65,6 +72,12 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s12
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_unused_arg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -101,6 +114,14 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_free_sgprs_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x28
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[8:9]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -127,6 +148,14 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_inreg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -156,6 +185,16 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: mixed_inreg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -192,6 +231,15 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i64_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
store i64 %load, ptr addrspace(1) %out
@@ -228,6 +276,14 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i16_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] offset:8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
store i16 %load, ptr addrspace(1) %out
@@ -261,6 +317,12 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
%load = load i32, ptr addrspace(4) %gep
@@ -300,6 +362,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: random_incorrect_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0xa
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
%load = load i32, ptr addrspace(4) %gep
@@ -336,6 +406,12 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s12
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%load = load i32, ptr addrspace(4) %gep
@@ -376,6 +452,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_co_i32 s0, s6, s0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
%ext = zext i8 %val to i32
@@ -417,6 +502,13 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_xyz:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
%load_x = load i32, ptr addrspace(4) %gep_x
@@ -461,6 +553,14 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
%load = load i16, ptr addrspace(4) %gep
@@ -499,6 +599,14 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s7, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
%load = load i16, ptr addrspace(4) %gep
@@ -539,6 +647,14 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
%load = load i16, ptr addrspace(4) %gep
@@ -587,6 +703,16 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_xyz:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s7, 16
+; GFX1250-NEXT: s_and_b32 s1, s7, 0xffff
+; GFX1250-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
+; GFX1250-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
%load_x = load i16, ptr addrspace(4) %gep_x
@@ -636,6 +762,14 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_remainder_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s8, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
%load = load i16, ptr addrspace(4) %gep
@@ -674,6 +808,14 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preloadremainder_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
%load = load i16, ptr addrspace(4) %gep
@@ -712,6 +854,14 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preloadremainder_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s9, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load = load i16, ptr addrspace(4) %gep
@@ -758,6 +908,16 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preloadremainder_xyz:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s9, 16
+; GFX1250-NEXT: s_lshr_b32 s1, s8, 16
+; GFX1250-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
+; GFX1250-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
%load_x = load i16, ptr addrspace(4) %gep_x
@@ -805,6 +965,14 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s15, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[8:9]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load = load i16, ptr addrspace(4) %gep
@@ -845,6 +1013,12 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_max_user_sgprs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s12
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -887,6 +1061,15 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s9, 16
+; GFX1250-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index efe4cfa7e5d2b..d5edfb42fa6d1 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
-
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i8:
@@ -33,6 +33,14 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
@@ -68,6 +76,14 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i8_zext_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
@@ -103,6 +119,14 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i16_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
@@ -136,6 +160,12 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
@@ -172,6 +202,14 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i32_ptr1_i32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_co_i32 s0, s2, s6
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX1250-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
@@ -211,6 +249,16 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i16_i16_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s4, 16
+; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_co_i32 s0, s1, s0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
@@ -246,6 +294,12 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_v2i8_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -289,6 +343,18 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: byref_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -335,6 +401,18 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: byref_staggered_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -390,6 +468,20 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v8i32_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b256 s[4:11], s[0:1], 0x20
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s8
+; GFX1250-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
+; GFX1250-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6
+; GFX1250-NEXT: v_mov_b32_e32 v7, s7
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -425,6 +517,15 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3i16_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -461,6 +562,13 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3i32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -497,6 +605,13 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3f32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -546,6 +661,19 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v5i8_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s4
+; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: s_or_b32 s0, s1, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b8 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -604,6 +732,24 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v5f64_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[12:13], s[0:1], 0x60
+; GFX1250-NEXT: s_load_b256 s[4:11], s[0:1], 0x40
+; GFX1250-NEXT: v_mov_b32_e32 v10, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX1250-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
+; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX1250-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b64 v10, v[8:9], s[2:3] offset:32
+; GFX1250-NEXT: global_store_b128 v10, v[0:3], s[2:3] offset:16
+; GFX1250-NEXT: global_store_b128 v10, v[4:7], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
@@ -665,6 +811,20 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v8i8_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s5
+; GFX1250-NEXT: s_pack_lh_b32_b16 s1, 0, s4
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1250-NEXT: s_or_b32 s1, s4, s1
+; GFX1250-NEXT: s_or_b32 s0, s5, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -696,6 +856,13 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i64_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
@@ -727,6 +894,13 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: f64_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
@@ -759,6 +933,12 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: half_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
}
@@ -791,6 +971,12 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
}
@@ -823,6 +1009,12 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v2bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -858,6 +1050,15 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -894,6 +1095,13 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v6bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -939,6 +1147,17 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b16 v3, v4, s[2:3]
+; GFX1250-NEXT: global_store_b16 v3, v5, s[10:11] offset:12
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[10:11]
+; GFX1250-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
ret void
@@ -974,6 +1193,14 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i1_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
}
@@ -1012,6 +1239,14 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fp128_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: v_mov_b32_e32 v3, s9
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX1250-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
}
@@ -1063,6 +1298,20 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v7i8_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s4
+; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: s_or_b32 s0, s1, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:6
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -1103,6 +1352,16 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v7half_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s9
+; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_mov_b32_e32 v1, s7
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v3, v4, s[2:3] offset:12
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
}
@@ -1139,6 +1398,15 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_i32_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: v_mov_b32_e32 v2, s5
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: global_store_b32 v0, v2, s[6:7]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
ret void
@@ -1181,6 +1449,16 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: v_mov_b32_e32 v2, s8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v3, v4, s[2:3]
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[10:11]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
ret void
@@ -1216,6 +1494,14 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_i16_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: global_store_d16_hi_b16 v0, v1, s[6:7]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
ret void
@@ -1261,6 +1547,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: global_store_d16_hi_b16 v0, v1, s[6:7]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
ret void
@@ -1302,6 +1596,16 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b96 s[4:6], s[0:1], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s0, s2, s6
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX1250-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
@@ -1336,6 +1640,14 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i8_trailing_unused:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
>From 0baec0763eacad304d4f2653ca5e9566b17bedb8 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Thu, 14 Aug 2025 16:37:22 -0700
Subject: [PATCH 41/53] [DAGCombine] Fix an incorrect folding of
extract_subvector (#153709)
Reported from
https://github.com/llvm/llvm-project/pull/153393#issuecomment-3189898813
During DAGCombine, an intermediate extract_subvector sequence was
generated:
```
t8: v9i16 = extract_subvector t3, Constant:i64<9>
t24: v8i16 = extract_subvector t8, Constant:i64<0>
```
And one of the DAGCombine rules, which turns `(extract_subvector
(extract_subvector X, C), 0)` into `(extract_subvector X, C)`, kicked in
and turned that into `v8i16 = extract_subvector t3, Constant:i64<9>`. However,
it did not check whether the extracted index is a multiple of the result
type's known minimum vector length, hence the crash.
This patch fixes the combine by adding that check.
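The legality condition the patch adds can be sketched as a small model (this is an illustrative simplification, not the actual DAGCombiner code; the function and parameter names are hypothetical):

```python
def can_fold_extract_of_extract(outer_index: int, inner_index: int,
                                result_min_elems: int) -> bool:
    """Model of the fixed combine:
    (extract_subvector (extract_subvector X, C), 0) may only be rewritten
    to (extract_subvector X, C) when the outer index is 0 AND C is a
    multiple of the result type's known minimum vector length."""
    return outer_index == 0 and inner_index % result_min_elems == 0

# The crashing case from the report: extracting v8i16 at inner index 9
# (9 is not a multiple of 8), so the fold must be rejected.
assert not can_fold_extract_of_extract(0, 9, 8)

# A case where the fold remains legal: inner index 8 is a multiple of 8.
assert can_fold_extract_of_extract(0, 8, 8)
```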
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++-
.../incorrect-extract-subvector-combine.ll | 36 +++++++++++++++++++
2 files changed, 40 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 17703f58f2824..d343b644e41cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26018,7 +26018,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// Combine an extract of an extract into a single extract_subvector.
// ext (ext X, C), 0 --> ext X, C
if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
- if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
+ // The index has to be a multiple of the new result type's known minimum
+ // vector length.
+ if (V.getConstantOperandVal(1) % NVT.getVectorMinNumElements() == 0 &&
+ TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
V.getConstantOperandVal(1)) &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, V.getOperand(0),
diff --git a/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll b/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll
new file mode 100644
index 0000000000000..6a0c03f339717
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr='+zve64f,+zvl512b' < %s | FileCheck %s
+
+; Previously, an incorrect (extract_subvector (extract_subvector X, C), 0) DAG combine crashed
+; this snippet.
+
+define <8 x i16> @gsm_encode() {
+; CHECK-LABEL: gsm_encode:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 19, e16, m1, ta, ma
+; CHECK-NEXT: vle16.v v8, (zero)
+; CHECK-NEXT: vslidedown.vi v9, v8, 12
+; CHECK-NEXT: vmv.x.s a0, v9
+; CHECK-NEXT: vsetivli zero, 8, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v9, -1
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 9
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: vsetivli zero, 8, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vslide1down.vx v9, v9, zero
+; CHECK-NEXT: vslide1down.vx v8, v8, zero
+; CHECK-NEXT: vslide1down.vx v8, v8, zero
+; CHECK-NEXT: vslide1down.vx v8, v8, zero
+; CHECK-NEXT: vslide1down.vx v8, v8, zero
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-NEXT: vand.vv v8, v8, v9
+; CHECK-NEXT: ret
+entry:
+ %0 = load <19 x i16>, ptr null, align 2
+ %1 = shufflevector <19 x i16> zeroinitializer, <19 x i16> %0, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 28, i32 31, i32 poison, i32 poison>
+ %2 = shufflevector <9 x i16> %1, <9 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+ ret <8 x i16> %2
+}
>From 364514a078822bb059e92741a023958acb670439 Mon Sep 17 00:00:00 2001
From: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
Date: Thu, 14 Aug 2025 16:45:44 -0700
Subject: [PATCH 42/53] [flang][cuda] Add interfaces for __int2float_rX
(#153708)
---
flang/module/cudadevice.f90 | 28 ++++++++++++++++++++++++
flang/test/Lower/CUDA/cuda-libdevice.cuf | 16 ++++++++++++++
2 files changed, 44 insertions(+)
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 2a5fced6c02c2..ffc3a3b170ca6 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -674,6 +674,34 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
end function
end interface
+ interface __int2float_rd
+ attributes(device) real function __int2float_rd(i) bind(c, name='__nv_int2float_rd')
+ !dir$ ignore_tkr (d) i
+ integer, value :: i
+ end function
+ end interface
+
+ interface __int2float_rn
+ attributes(device) real function __int2float_rn(i) bind(c, name='__nv_int2float_rn')
+ !dir$ ignore_tkr (d) i
+ integer, value :: i
+ end function
+ end interface
+
+ interface __int2float_ru
+ attributes(device) real function __int2float_ru(i) bind(c, name='__nv_int2float_ru')
+ !dir$ ignore_tkr (d) i
+ integer, value :: i
+ end function
+ end interface
+
+ interface __int2float_rz
+ attributes(device) real function __int2float_rz(i) bind(c, name='__nv_int2float_rz')
+ !dir$ ignore_tkr (d) i
+ integer, value :: i
+ end function
+ end interface
+
interface __int2double_rn
attributes(device) double precision function __int2double_rn(i) bind(c)
!dir$ ignore_tkr (d) i
diff --git a/flang/test/Lower/CUDA/cuda-libdevice.cuf b/flang/test/Lower/CUDA/cuda-libdevice.cuf
index d54456f3df943..f9c5dcc5fc4c3 100644
--- a/flang/test/Lower/CUDA/cuda-libdevice.cuf
+++ b/flang/test/Lower/CUDA/cuda-libdevice.cuf
@@ -199,6 +199,22 @@ end subroutine
! CHECK: %{{.*}} = fir.call @__nv_ll2float_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
! CHECK: %{{.*}} = fir.call @__nv_ll2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
! CHECK: %{{.*}} = fir.call @__nv_ll2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i64) -> f32
+
+attributes(global) subroutine test_int2float_rX()
+ real :: res
+ integer :: i
+ res = __int2float_rd(i)
+ res = __int2float_rn(i)
+ res = __int2float_ru(i)
+ res = __int2float_rz(i)
+end subroutine
+
+! CHECK-LABEL: _QPtest_int2float_rx
+! CHECK: %{{.*}} = fir.call @__nv_int2float_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_int2float_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_int2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> f32
+! CHECK: %{{.*}} = fir.call @__nv_int2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32) -> f32
+
attributes(global) subroutine test_float2int_rX()
integer :: res
real :: r
>From 35f5a81899ca8725d693593ed8b507e23cd0e62f Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 14 Aug 2025 16:56:54 -0700
Subject: [PATCH 43/53] Revert "[CGData] Lazy loading support for stable
function map (#151660)"
This reverts commit 76dd742f7b32e4d3acf50fab1dbbd897f215837e.
---
llvm/include/llvm/CGData/CodeGenData.h | 3 -
llvm/include/llvm/CGData/CodeGenData.inc | 2 +-
llvm/include/llvm/CGData/StableFunctionMap.h | 65 +-------
.../llvm/CGData/StableFunctionMapRecord.h | 48 +-----
llvm/lib/CGData/CodeGenData.cpp | 2 +-
llvm/lib/CGData/CodeGenDataReader.cpp | 17 +-
llvm/lib/CGData/StableFunctionMap.cpp | 70 ++-------
llvm/lib/CGData/StableFunctionMapRecord.cpp | 147 +++++-------------
llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 10 +-
.../ThinLTO/AArch64/cgdata-merge-write.ll | 4 +-
llvm/test/tools/llvm-cgdata/empty.test | 4 +-
llvm/test/tools/llvm-cgdata/error.test | 4 +-
.../merge-combined-funcmap-hashtree.test | 4 +-
.../llvm-cgdata/merge-funcmap-archive.test | 8 +-
.../llvm-cgdata/merge-funcmap-concat.test | 6 +-
.../llvm-cgdata/merge-funcmap-double.test | 7 +-
.../llvm-cgdata/merge-funcmap-single.test | 4 +-
llvm/tools/llvm-cgdata/Opts.td | 1 -
llvm/tools/llvm-cgdata/llvm-cgdata.cpp | 5 -
.../CGData/StableFunctionMapTest.cpp | 2 +-
20 files changed, 86 insertions(+), 327 deletions(-)
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index e44497a408245..38b96b72ccac6 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -285,9 +285,6 @@ enum CGDataVersion {
// Version 3 adds the total size of the Names in the stable function map so
// we can skip reading them into the memory for non-assertion builds.
Version3 = 3,
- // Version 4 adjusts the structure of stable function merging map for
- // efficient lazy loading support.
- Version4 = 4,
CurrentVersion = CG_DATA_INDEX_VERSION
};
const uint64_t Version = CGDataVersion::CurrentVersion;
diff --git a/llvm/include/llvm/CGData/CodeGenData.inc b/llvm/include/llvm/CGData/CodeGenData.inc
index d5fbe2fb97718..94de4c0b017a2 100644
--- a/llvm/include/llvm/CGData/CodeGenData.inc
+++ b/llvm/include/llvm/CGData/CodeGenData.inc
@@ -49,4 +49,4 @@ CG_DATA_SECT_ENTRY(CG_merge, CG_DATA_QUOTE(CG_DATA_MERGE_COMMON),
#endif
/* Indexed codegen data format version (start from 1). */
-#define CG_DATA_INDEX_VERSION 4
+#define CG_DATA_INDEX_VERSION 3
diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h
index ea3523c3a3299..bcb72e8216973 100644
--- a/llvm/include/llvm/CGData/StableFunctionMap.h
+++ b/llvm/include/llvm/CGData/StableFunctionMap.h
@@ -20,8 +20,6 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/IR/StructuralHash.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <mutex>
namespace llvm {
@@ -74,37 +72,11 @@ struct StableFunctionMap {
IndexOperandHashMap(std::move(IndexOperandHashMap)) {}
};
- using StableFunctionEntries =
- SmallVector<std::unique_ptr<StableFunctionEntry>>;
-
- /// In addition to the deserialized StableFunctionEntry, the struct stores
- /// the offsets of corresponding serialized stable function entries, and a
- /// once flag for safe lazy loading in a multithreaded environment.
- struct EntryStorage {
- /// The actual storage of deserialized stable function entries. If the map
- /// is lazily loaded, this will be empty until the first access by the
- /// corresponding function hash.
- StableFunctionEntries Entries;
-
- private:
- /// This is used to deserialize the entry lazily. Each element is the
- /// corresponding serialized stable function entry's offset in the memory
- /// buffer (StableFunctionMap::Buffer).
- /// The offsets are only populated when loading the map lazily, otherwise
- /// it is empty.
- SmallVector<uint64_t> Offsets;
- std::once_flag LazyLoadFlag;
- friend struct StableFunctionMap;
- friend struct StableFunctionMapRecord;
- };
-
- // Note: DenseMap requires value type to be copyable even if only using
- // in-place insertion. Use STL instead. This also affects the
- // deletion-while-iteration in finalize().
- using HashFuncsMapType = std::unordered_map<stable_hash, EntryStorage>;
+ using HashFuncsMapType =
+ DenseMap<stable_hash, SmallVector<std::unique_ptr<StableFunctionEntry>>>;
/// Get the HashToFuncs map for serialization.
- const HashFuncsMapType &getFunctionMap() const;
+ const HashFuncsMapType &getFunctionMap() const { return HashToFuncs; }
/// Get the NameToId vector for serialization.
ArrayRef<std::string> getNames() const { return IdToName; }
@@ -127,19 +99,6 @@ struct StableFunctionMap {
/// \returns true if there is no stable function entry.
bool empty() const { return size() == 0; }
- /// \returns true if there is an entry for the given function hash.
- /// This does not trigger lazy loading.
- bool contains(HashFuncsMapType::key_type FunctionHash) const {
- return HashToFuncs.count(FunctionHash) > 0;
- }
-
- /// \returns the stable function entries for the given function hash. If the
- /// map is lazily loaded, it will deserialize the entries if it is not already
- /// done, other requests to the same hash at the same time will be blocked
- /// until the entries are deserialized.
- const StableFunctionEntries &
- at(HashFuncsMapType::key_type FunctionHash) const;
-
enum SizeType {
UniqueHashCount, // The number of unique hashes in HashToFuncs.
TotalFunctionCount, // The number of total functions in HashToFuncs.
@@ -160,31 +119,17 @@ struct StableFunctionMap {
/// `StableFunctionEntry` is ready for insertion.
void insert(std::unique_ptr<StableFunctionEntry> FuncEntry) {
assert(!Finalized && "Cannot insert after finalization");
- HashToFuncs[FuncEntry->Hash].Entries.emplace_back(std::move(FuncEntry));
+ HashToFuncs[FuncEntry->Hash].emplace_back(std::move(FuncEntry));
}
- void deserializeLazyLoadingEntry(HashFuncsMapType::iterator It) const;
-
- /// Eagerly deserialize all the unloaded entries in the lazy loading map.
- void deserializeLazyLoadingEntries() const;
-
- bool isLazilyLoaded() const { return (bool)Buffer; }
-
/// A map from a stable_hash to a vector of functions with that hash.
- mutable HashFuncsMapType HashToFuncs;
+ HashFuncsMapType HashToFuncs;
/// A vector of strings to hold names.
SmallVector<std::string> IdToName;
/// A map from StringRef (name) to an ID.
StringMap<unsigned> NameToId;
/// True if the function map is finalized with minimal content.
bool Finalized = false;
- /// The memory buffer that contains the serialized stable function map for
- /// lazy loading.
- /// Non-empty only if this StableFunctionMap is created from a MemoryBuffer
- /// (i.e. by IndexedCodeGenDataReader::read()) and lazily deserialized.
- std::shared_ptr<MemoryBuffer> Buffer;
- /// Whether to read stable function names from the buffer.
- bool ReadStableFunctionMapNames = true;
friend struct StableFunctionMapRecord;
};
diff --git a/llvm/include/llvm/CGData/StableFunctionMapRecord.h b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
index 2d8b573a3cb46..a75cb12a70ba6 100644
--- a/llvm/include/llvm/CGData/StableFunctionMapRecord.h
+++ b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
@@ -24,26 +24,6 @@
namespace llvm {
-/// The structure of the serialized stable function map is as follows:
-/// - Number of unique function/module names
-/// - Total size of unique function/module names for opt-in skipping
-/// - Unique function/module names
-/// - Padding to align to 4 bytes
-/// - Number of StableFunctionEntries
-/// - Hashes of each StableFunctionEntry
-/// - Fixed-size fields for each StableFunctionEntry (the order is consistent
-/// with the hashes above):
-/// - FunctionNameId
-/// - ModuleNameId
-/// - InstCount
-/// - Relative offset to the beginning of IndexOperandHashes for this entry
-/// - Total size of variable-sized IndexOperandHashes for lazy-loading support
-/// - Variable-sized IndexOperandHashes for each StableFunctionEntry:
-/// - Number of IndexOperandHashes
-/// - Contents of each IndexOperandHashes
-/// - InstIndex
-/// - OpndIndex
-/// - OpndHash
struct StableFunctionMapRecord {
std::unique_ptr<StableFunctionMap> FunctionMap;
@@ -60,25 +40,13 @@ struct StableFunctionMapRecord {
const StableFunctionMap *FunctionMap,
std::vector<CGDataPatchItem> &PatchItems);
- /// A static helper function to deserialize the stable function map entry.
- /// Ptr should be pointing to the start of the fixed-sized fields of the
- /// entry when passed in.
- LLVM_ABI static void deserializeEntry(const unsigned char *Ptr,
- stable_hash Hash,
- StableFunctionMap *FunctionMap);
-
/// Serialize the stable function map to a raw_ostream.
LLVM_ABI void serialize(raw_ostream &OS,
std::vector<CGDataPatchItem> &PatchItems) const;
/// Deserialize the stable function map from a raw_ostream.
- LLVM_ABI void deserialize(const unsigned char *&Ptr);
-
- /// Lazily deserialize the stable function map from `Buffer` starting at
- /// `Offset`. The individual stable function entry would be read lazily from
- /// `Buffer` when the function map is accessed.
- LLVM_ABI void lazyDeserialize(std::shared_ptr<MemoryBuffer> Buffer,
- uint64_t Offset);
+ LLVM_ABI void deserialize(const unsigned char *&Ptr,
+ bool ReadStableFunctionMapNames = true);
/// Serialize the stable function map to a YAML stream.
LLVM_ABI void serializeYAML(yaml::Output &YOS) const;
@@ -102,18 +70,6 @@ struct StableFunctionMapRecord {
yaml::Output YOS(OS);
serializeYAML(YOS);
}
-
- /// Set whether to read stable function names from the buffer.
- /// Has no effect if the function map is read from a YAML stream.
- void setReadStableFunctionMapNames(bool Read) {
- assert(
- FunctionMap->empty() &&
- "Cannot change ReadStableFunctionMapNames after the map is populated");
- FunctionMap->ReadStableFunctionMapNames = Read;
- }
-
-private:
- void deserialize(const unsigned char *&Ptr, bool Lazy);
};
} // namespace llvm
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index b4f08c3d13b0d..cd012342e1958 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -186,7 +186,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
return make_error<CGDataError>(cgdata_error::unsupported_version);
H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
- static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version4,
+ static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version3,
"Please update the offset computation below if a new field has "
"been added to the header.");
H.OutlinedHashTreeOffset =
diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index fc59be8df525a..0ab35499c8986 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -26,12 +26,6 @@ static cl::opt<bool> IndexedCodeGenDataReadFunctionMapNames(
"disabled to save memory and time for final consumption of the "
"indexed CodeGenData in production."));
-cl::opt<bool> IndexedCodeGenDataLazyLoading(
- "indexed-codegen-data-lazy-loading", cl::init(false), cl::Hidden,
- cl::desc(
- "Lazily load indexed CodeGenData. Enable to save memory and time "
- "for final consumption of the indexed CodeGenData in production."));
-
namespace llvm {
static Expected<std::unique_ptr<MemoryBuffer>>
@@ -115,20 +109,11 @@ Error IndexedCodeGenDataReader::read() {
return error(cgdata_error::eof);
HashTreeRecord.deserialize(Ptr);
}
-
- // TODO: lazy loading support for outlined hash tree.
- std::shared_ptr<MemoryBuffer> SharedDataBuffer = std::move(DataBuffer);
if (hasStableFunctionMap()) {
const unsigned char *Ptr = Start + Header.StableFunctionMapOffset;
if (Ptr >= End)
return error(cgdata_error::eof);
- FunctionMapRecord.setReadStableFunctionMapNames(
- IndexedCodeGenDataReadFunctionMapNames);
- if (IndexedCodeGenDataLazyLoading)
- FunctionMapRecord.lazyDeserialize(SharedDataBuffer,
- Header.StableFunctionMapOffset);
- else
- FunctionMapRecord.deserialize(Ptr);
+ FunctionMapRecord.deserialize(Ptr, IndexedCodeGenDataReadFunctionMapNames);
}
return success();
diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp
index 2f54fad0aa084..87f1e76afb60b 100644
--- a/llvm/lib/CGData/StableFunctionMap.cpp
+++ b/llvm/lib/CGData/StableFunctionMap.cpp
@@ -15,10 +15,8 @@
#include "llvm/CGData/StableFunctionMap.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/CGData/StableFunctionMapRecord.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include <mutex>
#define DEBUG_TYPE "stable-function-map"
@@ -95,10 +93,9 @@ void StableFunctionMap::insert(const StableFunction &Func) {
void StableFunctionMap::merge(const StableFunctionMap &OtherMap) {
assert(!Finalized && "Cannot merge after finalization");
- deserializeLazyLoadingEntries();
for (auto &[Hash, Funcs] : OtherMap.HashToFuncs) {
- auto &ThisFuncs = HashToFuncs[Hash].Entries;
- for (auto &Func : Funcs.Entries) {
+ auto &ThisFuncs = HashToFuncs[Hash];
+ for (auto &Func : Funcs) {
auto FuncNameId =
getIdOrCreateForName(*OtherMap.getNameForId(Func->FunctionNameId));
auto ModuleNameId =
@@ -117,63 +114,25 @@ size_t StableFunctionMap::size(SizeType Type) const {
case UniqueHashCount:
return HashToFuncs.size();
case TotalFunctionCount: {
- deserializeLazyLoadingEntries();
size_t Count = 0;
for (auto &Funcs : HashToFuncs)
- Count += Funcs.second.Entries.size();
+ Count += Funcs.second.size();
return Count;
}
case MergeableFunctionCount: {
- deserializeLazyLoadingEntries();
size_t Count = 0;
for (auto &[Hash, Funcs] : HashToFuncs)
- if (Funcs.Entries.size() >= 2)
- Count += Funcs.Entries.size();
+ if (Funcs.size() >= 2)
+ Count += Funcs.size();
return Count;
}
}
llvm_unreachable("Unhandled size type");
}
-const StableFunctionMap::StableFunctionEntries &
-StableFunctionMap::at(HashFuncsMapType::key_type FunctionHash) const {
- auto It = HashToFuncs.find(FunctionHash);
- if (isLazilyLoaded())
- deserializeLazyLoadingEntry(It);
- return It->second.Entries;
-}
-
-void StableFunctionMap::deserializeLazyLoadingEntry(
- HashFuncsMapType::iterator It) const {
- assert(isLazilyLoaded() && "Cannot deserialize non-lazily-loaded map");
- auto &[Hash, Storage] = *It;
- std::call_once(Storage.LazyLoadFlag,
- [this, HashArg = Hash, &StorageArg = Storage]() {
- for (auto Offset : StorageArg.Offsets)
- StableFunctionMapRecord::deserializeEntry(
- reinterpret_cast<const unsigned char *>(Offset),
- HashArg, const_cast<StableFunctionMap *>(this));
- });
-}
-
-void StableFunctionMap::deserializeLazyLoadingEntries() const {
- if (!isLazilyLoaded())
- return;
- for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It)
- deserializeLazyLoadingEntry(It);
-}
-
-const StableFunctionMap::HashFuncsMapType &
-StableFunctionMap::getFunctionMap() const {
- // Ensure all entries are deserialized before returning the raw map.
- if (isLazilyLoaded())
- deserializeLazyLoadingEntries();
- return HashToFuncs;
-}
-
using ParamLocs = SmallVector<IndexPair>;
-static void
-removeIdenticalIndexPair(StableFunctionMap::StableFunctionEntries &SFS) {
+static void removeIdenticalIndexPair(
+ SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>> &SFS) {
auto &RSF = SFS[0];
unsigned StableFunctionCount = SFS.size();
@@ -200,7 +159,9 @@ removeIdenticalIndexPair(StableFunctionMap::StableFunctionEntries &SFS) {
SF->IndexOperandHashMap->erase(Pair);
}
-static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS) {
+static bool isProfitable(
+ const SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>>
+ &SFS) {
unsigned StableFunctionCount = SFS.size();
if (StableFunctionCount < GlobalMergingMinMerges)
return false;
@@ -241,11 +202,8 @@ static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS) {
}
void StableFunctionMap::finalize(bool SkipTrim) {
- deserializeLazyLoadingEntries();
- SmallVector<HashFuncsMapType::iterator> ToDelete;
for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It) {
- auto &[StableHash, Storage] = *It;
- auto &SFS = Storage.Entries;
+ auto &[StableHash, SFS] = *It;
// Group stable functions by ModuleIdentifier.
llvm::stable_sort(SFS, [&](const std::unique_ptr<StableFunctionEntry> &L,
@@ -278,7 +236,7 @@ void StableFunctionMap::finalize(bool SkipTrim) {
}
}
if (Invalid) {
- ToDelete.push_back(It);
+ HashToFuncs.erase(It);
continue;
}
@@ -290,10 +248,8 @@ void StableFunctionMap::finalize(bool SkipTrim) {
removeIdenticalIndexPair(SFS);
if (!isProfitable(SFS))
- ToDelete.push_back(It);
+ HashToFuncs.erase(It);
}
- for (auto It : ToDelete)
- HashToFuncs.erase(It);
Finalized = true;
}
diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp
index e585995ba6a31..423e068023088 100644
--- a/llvm/lib/CGData/StableFunctionMapRecord.cpp
+++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp
@@ -53,7 +53,7 @@ static SmallVector<const StableFunctionMap::StableFunctionEntry *>
getStableFunctionEntries(const StableFunctionMap &SFM) {
SmallVector<const StableFunctionMap::StableFunctionEntry *> FuncEntries;
for (const auto &P : SFM.getFunctionMap())
- for (auto &Func : P.second.Entries)
+ for (auto &Func : P.second)
FuncEntries.emplace_back(Func.get());
llvm::stable_sort(
@@ -107,25 +107,14 @@ void StableFunctionMapRecord::serialize(
// Write StableFunctionEntries whose pointers are sorted.
auto FuncEntries = getStableFunctionEntries(*FunctionMap);
Writer.write<uint32_t>(FuncEntries.size());
- for (const auto *FuncRef : FuncEntries)
- Writer.write<stable_hash>(FuncRef->Hash);
- std::vector<uint64_t> IndexOperandHashesOffsets;
- IndexOperandHashesOffsets.reserve(FuncEntries.size());
+
for (const auto *FuncRef : FuncEntries) {
+ Writer.write<stable_hash>(FuncRef->Hash);
Writer.write<uint32_t>(FuncRef->FunctionNameId);
Writer.write<uint32_t>(FuncRef->ModuleNameId);
Writer.write<uint32_t>(FuncRef->InstCount);
- const uint64_t Offset = Writer.OS.tell();
- IndexOperandHashesOffsets.push_back(Offset);
- Writer.write<uint64_t>(0);
- }
- const uint64_t IndexOperandHashesByteSizeOffset = Writer.OS.tell();
- Writer.write<uint64_t>(0);
- for (size_t I = 0; I < FuncEntries.size(); ++I) {
- const uint64_t Offset = Writer.OS.tell() - IndexOperandHashesOffsets[I];
- PatchItems.emplace_back(IndexOperandHashesOffsets[I], &Offset, 1);
+
// Emit IndexOperandHashes sorted from IndexOperandHashMap.
- const auto *FuncRef = FuncEntries[I];
IndexOperandHashVecType IndexOperandHashes =
getStableIndexOperandHashes(FuncRef);
Writer.write<uint32_t>(IndexOperandHashes.size());
@@ -135,62 +124,10 @@ void StableFunctionMapRecord::serialize(
Writer.write<stable_hash>(IndexOperandHash.second);
}
}
- // Write the total size of IndexOperandHashes.
- const uint64_t IndexOperandHashesByteSize =
- Writer.OS.tell() - IndexOperandHashesByteSizeOffset - sizeof(uint64_t);
- PatchItems.emplace_back(IndexOperandHashesByteSizeOffset,
- &IndexOperandHashesByteSize, 1);
-}
-
-void StableFunctionMapRecord::deserializeEntry(const unsigned char *Ptr,
- stable_hash Hash,
- StableFunctionMap *FunctionMap) {
- auto FunctionNameId =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- if (FunctionMap->ReadStableFunctionMapNames)
- assert(FunctionMap->getNameForId(FunctionNameId) &&
- "FunctionNameId out of range");
- auto ModuleNameId =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- if (FunctionMap->ReadStableFunctionMapNames)
- assert(FunctionMap->getNameForId(ModuleNameId) &&
- "ModuleNameId out of range");
- auto InstCount =
- endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
-
- // Read IndexOperandHashes to build IndexOperandHashMap
- auto CurrentPosition = reinterpret_cast<uintptr_t>(Ptr);
- auto IndexOperandHashesOffset =
- endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
- auto *IndexOperandHashesPtr = reinterpret_cast<const unsigned char *>(
- CurrentPosition + IndexOperandHashesOffset);
- auto NumIndexOperandHashes =
- endian::readNext<uint32_t, endianness::little, unaligned>(
- IndexOperandHashesPtr);
- auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
- for (unsigned J = 0; J < NumIndexOperandHashes; ++J) {
- auto InstIndex = endian::readNext<uint32_t, endianness::little, unaligned>(
- IndexOperandHashesPtr);
- auto OpndIndex = endian::readNext<uint32_t, endianness::little, unaligned>(
- IndexOperandHashesPtr);
- auto OpndHash =
- endian::readNext<stable_hash, endianness::little, unaligned>(
- IndexOperandHashesPtr);
- assert(InstIndex < InstCount && "InstIndex out of range");
-
- IndexOperandHashMap->try_emplace({InstIndex, OpndIndex}, OpndHash);
- }
-
- // Insert a new StableFunctionEntry into the map.
- auto FuncEntry = std::make_unique<StableFunctionMap::StableFunctionEntry>(
- Hash, FunctionNameId, ModuleNameId, InstCount,
- std::move(IndexOperandHashMap));
-
- FunctionMap->insert(std::move(FuncEntry));
}
void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr,
- bool Lazy) {
+ bool ReadStableFunctionMapNames) {
// Assert that Ptr is 4-byte aligned
assert(((uintptr_t)Ptr % 4) == 0);
// Read Names.
@@ -202,7 +139,7 @@ void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr,
const auto NamesByteSize =
endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
const auto NamesOffset = reinterpret_cast<uintptr_t>(Ptr);
- if (FunctionMap->ReadStableFunctionMapNames) {
+ if (ReadStableFunctionMapNames) {
for (unsigned I = 0; I < NumNames; ++I) {
StringRef Name(reinterpret_cast<const char *>(Ptr));
Ptr += Name.size() + 1;
@@ -220,51 +157,47 @@ void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr,
// Read StableFunctionEntries.
auto NumFuncs =
endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
- auto FixedSizeFieldsOffset =
- reinterpret_cast<uintptr_t>(Ptr) + NumFuncs * sizeof(stable_hash);
- constexpr uint32_t FixedSizeFieldsSizePerEntry =
- // FunctionNameId
- sizeof(uint32_t) +
- // ModuleNameId
- sizeof(uint32_t) +
- // InstCount
- sizeof(uint32_t) +
- // Relative offset to IndexOperandHashes
- sizeof(uint64_t);
for (unsigned I = 0; I < NumFuncs; ++I) {
auto Hash =
endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
- if (Lazy) {
- auto It = FunctionMap->HashToFuncs.try_emplace(Hash).first;
- StableFunctionMap::EntryStorage &Storage = It->second;
- Storage.Offsets.push_back(FixedSizeFieldsOffset);
- } else {
- deserializeEntry(
- reinterpret_cast<const unsigned char *>(FixedSizeFieldsOffset), Hash,
- FunctionMap.get());
+ [[maybe_unused]] auto FunctionNameId =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ [[maybe_unused]] auto ModuleNameId =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ // Only validate IDs if we've read the names
+ if (ReadStableFunctionMapNames) {
+ assert(FunctionMap->getNameForId(FunctionNameId) &&
+ "FunctionNameId out of range");
+ assert(FunctionMap->getNameForId(ModuleNameId) &&
+ "ModuleNameId out of range");
}
- FixedSizeFieldsOffset += FixedSizeFieldsSizePerEntry;
- }
- // Update Ptr to the end of the serialized map to meet the expectation of
- // CodeGenDataReader.
- Ptr = reinterpret_cast<const unsigned char *>(FixedSizeFieldsOffset);
- auto IndexOperandHashesByteSize =
- endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
- Ptr = reinterpret_cast<const unsigned char *>(
- reinterpret_cast<uintptr_t>(Ptr) + IndexOperandHashesByteSize);
-}
+ auto InstCount =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+
+ // Read IndexOperandHashes to build IndexOperandHashMap
+ auto NumIndexOperandHashes =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+ for (unsigned J = 0; J < NumIndexOperandHashes; ++J) {
+ auto InstIndex =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ auto OpndIndex =
+ endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+ auto OpndHash =
+ endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
+ assert(InstIndex < InstCount && "InstIndex out of range");
+
+ IndexOperandHashMap->try_emplace({InstIndex, OpndIndex}, OpndHash);
+ }
-void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr) {
- deserialize(Ptr, /*Lazy=*/false);
-}
+ // Insert a new StableFunctionEntry into the map.
+ auto FuncEntry = std::make_unique<StableFunctionMap::StableFunctionEntry>(
+ Hash, FunctionNameId, ModuleNameId, InstCount,
+ std::move(IndexOperandHashMap));
-void StableFunctionMapRecord::lazyDeserialize(
- std::shared_ptr<MemoryBuffer> Buffer, uint64_t Offset) {
- const auto *Ptr = reinterpret_cast<const unsigned char *>(
- reinterpret_cast<uintptr_t>(Buffer->getBufferStart()) + Offset);
- deserialize(Ptr, /*Lazy=*/true);
- FunctionMap->Buffer = std::move(Buffer);
+ FunctionMap->insert(std::move(FuncEntry));
+ }
}
void StableFunctionMapRecord::serializeYAML(yaml::Output &YOS) const {
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 47640c4aac6df..73f11c1345daf 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -350,8 +350,9 @@ checkConstLocationCompatible(const StableFunctionMap::StableFunctionEntry &SF,
return true;
}
-static ParamLocsVecTy
-computeParamInfo(const StableFunctionMap::StableFunctionEntries &SFS) {
+static ParamLocsVecTy computeParamInfo(
+ const SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>>
+ &SFS) {
std::map<std::vector<stable_hash>, ParamLocs> HashSeqToLocs;
auto &RSF = *SFS[0];
unsigned StableFunctionCount = SFS.size();
@@ -395,18 +396,19 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
// Collect stable functions related to the current module.
DenseMap<stable_hash, SmallVector<std::pair<Function *, FunctionHashInfo>>>
HashToFuncs;
+ auto &Maps = FunctionMap->getFunctionMap();
for (auto &F : M) {
if (!isEligibleFunction(&F))
continue;
auto FI = llvm::StructuralHashWithDifferences(F, ignoreOp);
- if (FunctionMap->contains(FI.FunctionHash))
+ if (Maps.contains(FI.FunctionHash))
HashToFuncs[FI.FunctionHash].emplace_back(&F, std::move(FI));
}
for (auto &[Hash, Funcs] : HashToFuncs) {
std::optional<ParamLocsVecTy> ParamLocsVec;
SmallVector<FuncMergeInfo> FuncMergeInfos;
- auto &SFS = FunctionMap->at(Hash);
+ auto &SFS = Maps.at(Hash);
assert(!SFS.empty());
auto &RFS = SFS[0];
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
index 47042d23cc2ca..a4022eb885b43 100644
--- a/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
@@ -36,11 +36,9 @@
; Merge the cgdata using llvm-cgdata.
; We now validate the content of the merged cgdata.
-; Two functions have the same hash with only one different constant at the same location.
+; Two functions have the same hash with only one different constnat at a same location.
; RUN: llvm-cgdata --merge -o %tout.cgdata %tout-nowrite.1 %tout-nowrite.2
; RUN: llvm-cgdata --convert %tout.cgdata -o - | FileCheck %s
-; RUN: llvm-cgdata --merge -o %tout-lazy.cgdata %tout-nowrite.1 %tout-nowrite.2 -indexed-codegen-data-lazy-loading
-; RUN: llvm-cgdata --convert %tout-lazy.cgdata -indexed-codegen-data-lazy-loading -o - | FileCheck %s
; CHECK: - Hash: [[#%d,HASH:]]
; CHECK-NEXT: FunctionName: f1
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
index 2082eca58f073..0d2b0e848a2c9 100644
--- a/llvm/test/tools/llvm-cgdata/empty.test
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -16,7 +16,7 @@ RUN: llvm-cgdata --show %t_emptyheader.cgdata | count 0
# The version number appears when asked, as it's in the header
RUN: llvm-cgdata --show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
-VERSION: Version: 4
+VERSION: Version: 3
# When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
@@ -30,7 +30,7 @@ RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
# uint64_t StableFunctionMapOffset;
# }
RUN: printf '\xffcgdata\x81' > %t_header.cgdata
-RUN: printf '\x04\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x03\x00\x00\x00' >> %t_header.cgdata
RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
index 9484371848a72..92ff484e31caf 100644
--- a/llvm/test/tools/llvm-cgdata/error.test
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -22,9 +22,9 @@ RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
RUN: not llvm-cgdata --show %t_corrupt.cgdata 2>&1 | FileCheck %s --check-prefix=CORRUPT
CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
-# The current version 4 while the header says 5.
+# The current version is 3, while the header says 4.
RUN: printf '\xffcgdata\x81' > %t_version.cgdata
-RUN: printf '\x05\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x04\x00\x00\x00' >> %t_version.cgdata
RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test b/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
index 70b83af407e5a..b060872113b1b 100644
--- a/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
+++ b/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
@@ -23,8 +23,6 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-both-hashtree-funcma
# Merge an object file having cgdata (__llvm_outline and __llvm_merge)
RUN: llvm-cgdata -m --skip-trim %t/merge-both-hashtree-funcmap.o -o %t/merge-both-hashtree-funcmap.cgdata
RUN: llvm-cgdata -s %t/merge-both-hashtree-funcmap.cgdata | FileCheck %s
-RUN: llvm-cgdata -m --skip-trim %t/merge-both-hashtree-funcmap.o -o %t/merge-both-hashtree-funcmap-lazy.cgdata -indexed-codegen-data-lazy-loading
-RUN: llvm-cgdata -s %t/merge-both-hashtree-funcmap-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Outlined hash tree:
CHECK-NEXT: Total Node Count: 3
@@ -65,4 +63,4 @@ CHECK-NEXT: Mergeable function Count: 0
;--- merge-both-template.ll
@.data1 = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
- at .data2 = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data2 = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
index c088ffbb4e83f..2936086321028 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
@@ -23,8 +23,8 @@ RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
# Merge the archive into the codegen data file.
RUN: llvm-cgdata --merge --skip-trim %t/merge-archive.a -o %t/merge-archive.cgdata
RUN: llvm-cgdata --show %t/merge-archive.cgdata | FileCheck %s
-RUN: llvm-cgdata --merge --skip-trim %t/merge-archive.a -o %t/merge-archive-lazy.cgdata -indexed-codegen-data-lazy-loading
-RUN: llvm-cgdata --show %t/merge-archive-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
+
+RUN: llvm-cgdata --show %t/merge-archive.cgdata | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
CHECK-NEXT: Total function Count: 2
@@ -65,7 +65,7 @@ MAP-NEXT: ...
...
;--- merge-1-template.ll
- at .data = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
;--- raw-2.cgtext
:stable_function_map
@@ -80,4 +80,4 @@ MAP-NEXT: ...
...
;--- merge-2-template.ll
- at .data = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
index 90b5992973b49..d2965456a1999 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
@@ -17,8 +17,6 @@ RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat-template-
RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
RUN: llvm-cgdata --merge --skip-trim %t/merge-concat.o -o %t/merge-concat.cgdata
RUN: llvm-cgdata --show %t/merge-concat.cgdata | FileCheck %s
-RUN: llvm-cgdata --merge --skip-trim %t/merge-concat.o -o %t/merge-concat-lazy.cgdata -indexed-codegen-data-lazy-loading
-RUN: llvm-cgdata --show %t/merge-concat-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
@@ -76,5 +74,5 @@ MAP-NEXT: ...
; In a linked executable (as opposed to an object file), cgdata in __llvm_merge might be concatenated.
; Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated.
; In other words, the following two trees are encoded back-to-back in a binary format.
- at .data1 = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
- at .data2 = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data1 = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data2 = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
index b986aef26f1d7..8277e3272d77e 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
@@ -19,9 +19,8 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
# Merge two object files into the codegen data file.
RUN: llvm-cgdata --merge --skip-trim %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+
RUN: llvm-cgdata --show %t/merge.cgdata | FileCheck %s
-RUN: llvm-cgdata --merge --skip-trim %t/merge-1.o %t/merge-2.o -o %t/merge-lazy.cgdata -indexed-codegen-data-lazy-loading
-RUN: llvm-cgdata --show %t/merge-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
CHECK-NEXT: Total function Count: 2
@@ -62,7 +61,7 @@ MAP-NEXT: ...
...
;--- merge-1-template.ll
- at .data = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
;--- raw-2.cgtext
:stable_function_map
@@ -77,4 +76,4 @@ MAP-NEXT: ...
...
;--- merge-2-template.ll
- at .data = private unnamed_addr constant [84 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [68 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
index eac852ff7e710..9469f1cbda331 100644
--- a/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
@@ -15,8 +15,6 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merg
# Merge an object file having cgdata (__llvm_merge)
RUN: llvm-cgdata -m --skip-trim %t/merge-single.o -o %t/merge-single.cgdata
RUN: llvm-cgdata -s %t/merge-single.cgdata | FileCheck %s
-RUN: llvm-cgdata -m --skip-trim %t/merge-single.o -o %t/merge-single-lazy.cgdata -indexed-codegen-data-lazy-loading
-RUN: llvm-cgdata -s %t/merge-single-lazy.cgdata -indexed-codegen-data-lazy-loading | FileCheck %s
CHECK: Stable function map:
CHECK-NEXT: Unique hash Count: 1
CHECK-NEXT: Total function Count: 1
@@ -35,4 +33,4 @@ CHECK-NEXT: Mergeable function Count: 0
...
;--- merge-single-template.ll
- at .data = private unnamed_addr constant [84 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data = private unnamed_addr constant [68 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/tools/llvm-cgdata/Opts.td b/llvm/tools/llvm-cgdata/Opts.td
index 2b515a0140e67..8da933f744e87 100644
--- a/llvm/tools/llvm-cgdata/Opts.td
+++ b/llvm/tools/llvm-cgdata/Opts.td
@@ -31,4 +31,3 @@ def : JoinedOrSeparate<["-"], "o">, Alias<output>, MetaVarName<"<file>">, HelpTe
def format : Option<["--"], "format", KIND_SEPARATE>,
HelpText<"Specify the output format (text or binary)">, MetaVarName<"<value>">;
def : JoinedOrSeparate<["-"], "f">, Alias<format>, HelpText<"Alias for --format">;
-def indexed_codegen_data_lazy_loading : F<"indexed-codegen-data-lazy-loading", "Lazily load indexed CodeGenData for testing purpose.">, Flags<[HelpHidden]>;
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 047557e5a7fae..98fa5c5657353 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -83,8 +83,6 @@ static CGDataAction Action;
static std::optional<CGDataFormat> OutputFormat;
static std::vector<std::string> InputFilenames;
-extern cl::opt<bool> IndexedCodeGenDataLazyLoading;
-
static void exitWithError(Twine Message, StringRef Whence = "",
StringRef Hint = "") {
WithColor::error();
@@ -363,9 +361,6 @@ static void parseArgs(int argc, char **argv) {
default:
llvm_unreachable("unrecognized action");
}
-
- IndexedCodeGenDataLazyLoading =
- Args.hasArg(OPT_indexed_codegen_data_lazy_loading);
}
int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
diff --git a/llvm/unittests/CGData/StableFunctionMapTest.cpp b/llvm/unittests/CGData/StableFunctionMapTest.cpp
index 5cf62ae0b3943..d551ac8a814f4 100644
--- a/llvm/unittests/CGData/StableFunctionMapTest.cpp
+++ b/llvm/unittests/CGData/StableFunctionMapTest.cpp
@@ -117,7 +117,7 @@ TEST(StableFunctionMap, Finalize3) {
Map.finalize();
auto &M = Map.getFunctionMap();
EXPECT_THAT(M, SizeIs(1));
- auto &FuncEntries = M.begin()->second.Entries;
+ auto &FuncEntries = M.begin()->second;
for (auto &FuncEntry : FuncEntries) {
EXPECT_THAT(*FuncEntry->IndexOperandHashMap, SizeIs(1));
ASSERT_THAT(*FuncEntry->IndexOperandHashMap,
>From 1beaf8b4a6b7c016f95d9e7beeb1cafb52bec0e8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 15 Aug 2025 09:02:56 +0900
Subject: [PATCH 44/53] TableGen: Emit statically generated hash table for
runtime libcalls (#150192)
a96121089b9c94e08c6632f91f2dffc73c0ffa28 reverted a change
to use a binary search on the string name table because it
was too slow. This replaces it with a static string hash
table based on the known set of libcall names. Microbenchmarking
shows this is similarly fast to using DenseMap. It is possibly
slightly slower than using StringSet, though these are not
exact comparisons. This also avoids the one-time construction
of the map, so it could be better in practice.
This search isn't a simple set check, since it finds the
range of possible matches with the same name. There is also
an additional check for whether the current target supports
the name. The runtime-constructed set doesn't require this,
since it only adds the symbols live for the target.
The algorithm follows this post:
http://0x80.pl/notesen/2023-04-30-lookup-in-strings.html
I'm also thinking the two special-case global symbols should
just be added to RuntimeLibcalls. There are also other global
references emitted in the backend that aren't tracked; we
should probably just use this as a centralized database for all
compiler-selected symbols.
---
llvm/benchmarks/CMakeLists.txt | 17 ++
llvm/benchmarks/RuntimeLibcalls.cpp | 116 ++++++++++
llvm/include/llvm/IR/RuntimeLibcalls.h | 44 +++-
llvm/lib/IR/RuntimeLibcalls.cpp | 59 ++---
llvm/lib/Object/IRSymtab.cpp | 47 ++--
llvm/test/TableGen/RuntimeLibcallEmitter.td | 26 +++
llvm/unittests/IR/CMakeLists.txt | 1 +
llvm/unittests/IR/RuntimeLibcallsTest.cpp | 63 ++++++
.../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 201 +++++++++++++++++-
9 files changed, 492 insertions(+), 82 deletions(-)
create mode 100644 llvm/benchmarks/RuntimeLibcalls.cpp
create mode 100644 llvm/unittests/IR/RuntimeLibcallsTest.cpp
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index 1078efa55f497..9613678d2e0ac 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -11,3 +11,20 @@ add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED)
+# Extract the list of symbols in a random utility as sample data.
+set(SYMBOL_TEST_DATA_FILE "sample_symbol_list.txt")
+set(SYMBOL_TEST_DATA_SOURCE_BINARY $<TARGET_FILE:llc>)
+
+add_custom_command(OUTPUT ${SYMBOL_TEST_DATA_FILE}
+ COMMAND $<TARGET_FILE:llvm-nm> --no-demangle --no-sort
+ --format=just-symbols
+ ${SYMBOL_TEST_DATA_SOURCE_BINARY} > ${SYMBOL_TEST_DATA_FILE}
+ DEPENDS "$<TARGET_FILE:llvm-nm>" "$<TARGET_FILE:llc>")
+
+add_custom_target(generate-runtime-libcalls-sample-symbol-list
+ DEPENDS ${SYMBOL_TEST_DATA_FILE})
+add_benchmark(RuntimeLibcallsBench RuntimeLibcalls.cpp PARTIAL_SOURCES_INTENDED)
+
+add_dependencies(RuntimeLibcallsBench generate-runtime-libcalls-sample-symbol-list)
+target_compile_definitions(RuntimeLibcallsBench PRIVATE
+ -DSYMBOL_TEST_DATA_FILE="${CMAKE_CURRENT_BINARY_DIR}/${SYMBOL_TEST_DATA_FILE}")
diff --git a/llvm/benchmarks/RuntimeLibcalls.cpp b/llvm/benchmarks/RuntimeLibcalls.cpp
new file mode 100644
index 0000000000000..47f68abff1e0d
--- /dev/null
+++ b/llvm/benchmarks/RuntimeLibcalls.cpp
@@ -0,0 +1,116 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/RuntimeLibcalls.h"
+#include "benchmark/benchmark.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/TargetParser/Triple.h"
+#include <random>
+#include <string>
+using namespace llvm;
+
+static constexpr unsigned MaxFuncNameSize = 53;
+
+static std::vector<StringRef> getLibcallNameStringRefs() {
+ std::vector<StringRef> Names(RTLIB::NumLibcallImpls);
+ // Keep the strlens on the StringRef construction out of the benchmark loop.
+ for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls()) {
+ const char *Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LC);
+ Names[LC] = StringRef(Name);
+ }
+
+ return Names;
+}
+
+static std::vector<std::string> getRandomFuncNames() {
+ std::mt19937_64 Rng;
+ std::uniform_int_distribution<> StringLengthDistribution(1, MaxFuncNameSize);
+ std::uniform_int_distribution<> CharDistribution(1, 255);
+ int NumTestFuncs = 1 << 10;
+ std::vector<std::string> TestFuncNames(NumTestFuncs);
+
+ for (std::string &TestFuncName : TestFuncNames) {
+ for (int I = 0, E = StringLengthDistribution(Rng); I != E; ++I)
+ TestFuncName += static_cast<char>(CharDistribution(Rng));
+ }
+
+ return TestFuncNames;
+}
+
+static std::vector<std::string> readSymbolsFromFile(StringRef InputFile) {
+ auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile, /*IsText=*/true);
+ if (!BufOrError) {
+ reportFatalUsageError("failed to open \'" + Twine(InputFile) +
+ "\': " + BufOrError.getError().message());
+ }
+
+ // Hackily figure out if there's a prefix on the symbol names - llvm-nm
+ // appears to not have a flag to skip this.
+ llvm::Triple HostTriple(LLVM_HOST_TRIPLE);
+ std::string DummyDatalayout = "e";
+ DummyDatalayout += DataLayout::getManglingComponent(HostTriple);
+
+ DataLayout DL(DummyDatalayout);
+ char GlobalPrefix = DL.getGlobalPrefix();
+
+ std::vector<std::string> Lines;
+ for (line_iterator LineIt(**BufOrError, /*SkipBlanks=*/true);
+ !LineIt.is_at_eof(); ++LineIt) {
+ StringRef SymbolName = *LineIt;
+ SymbolName.consume_front(StringRef(&GlobalPrefix, 1));
+
+ Lines.push_back(SymbolName.str());
+ }
+ return Lines;
+}
+
+static void BM_LookupRuntimeLibcallByNameKnownCalls(benchmark::State &State) {
+ std::vector<StringRef> Names = getLibcallNameStringRefs();
+
+ for (auto _ : State) {
+ for (StringRef Name : Names) {
+ benchmark::DoNotOptimize(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName(Name).empty());
+ }
+ }
+}
+
+static void BM_LookupRuntimeLibcallByNameRandomCalls(benchmark::State &State) {
+ std::vector<std::string> TestFuncNames = getRandomFuncNames();
+
+ for (auto _ : State) {
+ for (const std::string &Name : TestFuncNames) {
+ benchmark::DoNotOptimize(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName(StringRef(Name))
+ .empty());
+ }
+ }
+}
+
+// This isn't fully representative; it doesn't include any anonymous functions.
+// nm -n --no-demangle --format=just-symbols sample-binary > sample.txt
+static void BM_LookupRuntimeLibcallByNameSampleData(benchmark::State &State) {
+ std::vector<std::string> TestFuncNames =
+ readSymbolsFromFile(SYMBOL_TEST_DATA_FILE);
+ for (auto _ : State) {
+ for (const std::string &Name : TestFuncNames) {
+ benchmark::DoNotOptimize(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName(StringRef(Name))
+ .empty());
+ }
+ }
+}
+
+BENCHMARK(BM_LookupRuntimeLibcallByNameKnownCalls);
+BENCHMARK(BM_LookupRuntimeLibcallByNameRandomCalls);
+BENCHMARK(BM_LookupRuntimeLibcallByNameSampleData);
+
+BENCHMARK_MAIN();
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 2d1d07c5fd81b..078098eeb7148 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -132,11 +132,41 @@ struct RuntimeLibcallsInfo {
return ImplToLibcall[Impl];
}
+ /// Check if a function name is a recognized runtime call of any kind. This
+ /// does not consider if this call is available for any current compilation,
+ /// just that it is a known call somewhere. This returns the set of all
+ /// LibcallImpls which match the name; multiple implementations with the same
+ /// name may exist but differ in interpretation based on the target context.
+ ///
+ /// Generated by tablegen.
+ LLVM_ABI static inline iota_range<RTLIB::LibcallImpl>
+ lookupLibcallImplName(StringRef Name) {
+ // Inlining the early exit on the string name appears to be worthwhile when
+ // querying a real set of symbols
+#define GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+#include "llvm/IR/RuntimeLibcalls.inc"
+#undef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+ }
+
/// Check if this is valid libcall for the current module, otherwise
/// RTLIB::Unsupported.
- LLVM_ABI RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const;
+ LLVM_ABI RTLIB::LibcallImpl
+ getSupportedLibcallImpl(StringRef FuncName) const {
+ for (RTLIB::LibcallImpl Impl : lookupLibcallImplName(FuncName)) {
+ // FIXME: This should not depend on looking up ImplToLibcall, only the
+ // list of libcalls for the module.
+ RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]];
+ if (Recognized != RTLIB::Unsupported)
+ return Recognized;
+ }
+
+ return RTLIB::Unsupported;
+ }
private:
+ LLVM_ABI static iota_range<RTLIB::LibcallImpl>
+ lookupLibcallImplNameImpl(StringRef Name);
+
/// Stores the implementation choice for each libcall.
RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
RTLIB::Unsupported};
@@ -157,13 +187,11 @@ struct RuntimeLibcallsInfo {
/// Map from a concrete LibcallImpl implementation to its RTLIB::Libcall kind.
LLVM_ABI static const RTLIB::Libcall ImplToLibcall[RTLIB::NumLibcallImpls];
- /// Check if a function name is a recognized runtime call of any kind. This
- /// does not consider if this call is available for any current compilation,
- /// just that it is a known call somewhere. This returns the set of all
- /// LibcallImpls which match the name; multiple implementations with the same
- /// name may exist but differ in interpretation based on the target context.
- LLVM_ABI static iterator_range<ArrayRef<uint16_t>::const_iterator>
- getRecognizedLibcallImpls(StringRef FuncName);
+ /// Utility function for tablegenerated lookup function. Return a range of
+ /// enum values that apply for the function name at \p NameOffsetEntry with
+ /// the value \p StrOffset.
+ static inline iota_range<RTLIB::LibcallImpl>
+ libcallImplNameHit(uint16_t NameOffsetEntry, uint16_t StrOffset);
static bool darwinHasSinCosStret(const Triple &TT) {
if (!TT.isOSDarwin())
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index ac845c4998783..88cb192c08781 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -9,6 +9,7 @@
#include "llvm/IR/RuntimeLibcalls.h"
#include "llvm/ADT/StringTable.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/xxhash.h"
#include "llvm/TargetParser/ARMTargetParser.h"
#define DEBUG_TYPE "runtime-libcalls-info"
@@ -18,9 +19,11 @@ using namespace RTLIB;
#define GET_INIT_RUNTIME_LIBCALL_NAMES
#define GET_SET_TARGET_RUNTIME_LIBCALL_SETS
+#define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
#include "llvm/IR/RuntimeLibcalls.inc"
#undef GET_INIT_RUNTIME_LIBCALL_NAMES
#undef GET_SET_TARGET_RUNTIME_LIBCALL_SETS
+#undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
/// Set default libcall names. If a target wants to opt-out of a libcall it
/// should be placed here.
@@ -58,49 +61,23 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT,
}
}
-RTLIB::LibcallImpl
-RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const {
- const ArrayRef<uint16_t> RuntimeLibcallNameOffsets(
- RuntimeLibcallNameOffsetTable);
-
- iterator_range<ArrayRef<uint16_t>::const_iterator> Range =
- getRecognizedLibcallImpls(FuncName);
-
- for (auto I = Range.begin(); I != Range.end(); ++I) {
- RTLIB::LibcallImpl Impl =
- static_cast<RTLIB::LibcallImpl>(I - RuntimeLibcallNameOffsets.begin());
-
- // FIXME: This should not depend on looking up ImplToLibcall, only the list
- // of libcalls for the module.
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]];
- if (Recognized != RTLIB::Unsupported)
- return Recognized;
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+iota_range<RTLIB::LibcallImpl>
+RuntimeLibcallsInfo::libcallImplNameHit(uint16_t NameOffsetEntry,
+ uint16_t StrOffset) {
+ int NumAliases = 1;
+ for (uint16_t Entry : ArrayRef(RuntimeLibcallNameOffsetTable)
+ .drop_front(NameOffsetEntry + 1)) {
+ if (Entry != StrOffset)
+ break;
+ ++NumAliases;
}
- return RTLIB::Unsupported;
-}
-
-iterator_range<ArrayRef<uint16_t>::const_iterator>
-RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) {
- StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName);
- if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName)
- return iterator_range(ArrayRef<uint16_t>());
-
- uint16_t IndexVal = It.offset().value();
- const ArrayRef<uint16_t> TableRef(RuntimeLibcallNameOffsetTable);
-
- ArrayRef<uint16_t>::const_iterator E = TableRef.end();
- ArrayRef<uint16_t>::const_iterator EntriesBegin =
- std::lower_bound(TableRef.begin(), E, IndexVal);
- ArrayRef<uint16_t>::const_iterator EntriesEnd = EntriesBegin;
-
- while (EntriesEnd != E && *EntriesEnd == IndexVal)
- ++EntriesEnd;
-
- assert(EntriesBegin != E &&
- "libcall found in name table but not offset table");
-
- return make_range(EntriesBegin, EntriesEnd);
+ RTLIB::LibcallImpl ImplStart = static_cast<RTLIB::LibcallImpl>(
+ &RuntimeLibcallNameOffsetTable[NameOffsetEntry] -
+ &RuntimeLibcallNameOffsetTable[0]);
+ return enum_seq(ImplStart,
+ static_cast<RTLIB::LibcallImpl>(ImplStart + NumAliases));
}
bool RuntimeLibcallsInfo::isAAPCS_ABI(const Triple &TT, StringRef ABIName) {
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 0f194953787e6..0043f02107fb8 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -46,7 +46,7 @@ static cl::opt<bool> DisableBitcodeVersionUpgrade(
"disable-bitcode-version-upgrade", cl::Hidden,
cl::desc("Disable automatic bitcode upgrade for version mismatch"));
-static const char *PreservedSymbols[] = {
+static constexpr StringLiteral PreservedSymbols[] = {
// There are global variables, so put it here instead of in
// RuntimeLibcalls.td.
// TODO: Are there similar such variables?
@@ -54,6 +54,10 @@ static const char *PreservedSymbols[] = {
"__stack_chk_guard",
};
+static bool isPreservedGlobalVarName(StringRef Name) {
+ return PreservedSymbols[0] == Name || PreservedSymbols[1] == Name;
+}
+
namespace {
const char *getExpectedProducerName() {
@@ -81,12 +85,16 @@ struct Builder {
// The StringTableBuilder does not create a copy of any strings added to it,
// so this provides somewhere to store any strings that we create.
Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder,
- BumpPtrAllocator &Alloc)
- : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {}
+ BumpPtrAllocator &Alloc, const Triple &TT)
+ : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT),
+ Libcalls(TT) {}
DenseMap<const Comdat *, int> ComdatMap;
Mangler Mang;
- Triple TT;
+ const Triple &TT;
+
+ // FIXME: This shouldn't be here.
+ RTLIB::RuntimeLibcallsInfo Libcalls;
std::vector<storage::Comdat> Comdats;
std::vector<storage::Module> Mods;
@@ -98,6 +106,10 @@ struct Builder {
std::vector<storage::Str> DependentLibraries;
+ bool isPreservedLibFuncName(StringRef Name) {
+ return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported;
+ }
+
void setStr(storage::Str &S, StringRef Value) {
S.Offset = StrtabBuilder.add(Value);
S.Size = Value.size();
@@ -213,19 +225,6 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static StringSet<> buildPreservedSymbolsSet(const Triple &TT) {
- StringSet<> PreservedSymbolSet;
- PreservedSymbolSet.insert(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
- // FIXME: Do we need to pass in ABI fields from TargetOptions?
- RTLIB::RuntimeLibcallsInfo Libcalls(TT);
- for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) {
- if (Impl != RTLIB::Unsupported)
- PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl));
- }
- return PreservedSymbolSet;
-}
-
Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
const SmallPtrSet<GlobalValue *, 4> &Used,
ModuleSymbolTable::Symbol Msym) {
@@ -279,13 +278,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
return Error::success();
}
- setStr(Sym.IRName, GV->getName());
-
- static const StringSet<> PreservedSymbolsSet =
- buildPreservedSymbolsSet(GV->getParent()->getTargetTriple());
- bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
+ StringRef GVName = GV->getName();
+ setStr(Sym.IRName, GVName);
- if (Used.count(GV) || IsPreservedSymbol)
+ if (Used.count(GV) || isPreservedLibFuncName(GVName) ||
+ isPreservedGlobalVarName(GVName))
Sym.Flags |= 1 << storage::Symbol::FB_used;
if (GV->isThreadLocal())
Sym.Flags |= 1 << storage::Symbol::FB_tls;
@@ -352,7 +349,6 @@ Error Builder::build(ArrayRef<Module *> IRMods) {
setStr(Hdr.Producer, kExpectedProducerName);
setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str());
setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName());
- TT = IRMods[0]->getTargetTriple();
for (auto *M : IRMods)
if (Error Err = addModule(M))
@@ -378,7 +374,8 @@ Error Builder::build(ArrayRef<Module *> IRMods) {
Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
StringTableBuilder &StrtabBuilder,
BumpPtrAllocator &Alloc) {
- return Builder(Symtab, StrtabBuilder, Alloc).build(Mods);
+ const Triple &TT = Mods[0]->getTargetTriple();
+ return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods);
}
// Upgrade a vector of bitcode modules created by an old version of LLVM by
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index a2d946f3aa84f..54ca3f97e2d4b 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -149,6 +149,32 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: RTLIB::SQRT_F80, // RTLIB::sqrtl_f80
// CHECK-NEXT: };
+// CHECK: #ifdef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY
+// CHECK-NEXT: size_t Size = Name.size();
+// CHECK-NEXT: if (Size == 0 || Size > 9)
+// CHECK-NEXT: return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported);
+// CHECK-NEXT: return lookupLibcallImplNameImpl(Name);
+// CHECK-NEXT: #endif
+
+// CHECK: #ifdef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
+// CHECK-NEXT: static inline uint64_t hash(StringRef Str) {
+// CHECK-NEXT: return static_cast<uint32_t>(xxh3_64bits(Str));
+// CHECK-NEXT: }
+
+// CHECK: iota_range<RTLIB::LibcallImpl> RTLIB::RuntimeLibcallsInfo::lookupLibcallImplNameImpl(StringRef Name) {
+// CHECK: static constexpr std::pair<uint16_t, uint16_t> HashTableNameToEnum[16] = {
+// CHECK: {2, 9}, // 0x000000705301b8, ___memset
+// CHECK: {0, 0},
+// CHECK: {6, 6}, // 0x0000001417a2af, calloc
+// CHECK: {0, 0},
+// CHECK: };
+
+// CHECK: unsigned Idx = (hash(Name) % 8) * 2;
+// CHECK: for (int I = 0; I != 2; ++I) {
+// CHECK: return libcallImplNameHit(Entry, StrOffset);
+
+// CHECK: return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported);
+// CHECK-NEXT: }
// CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
// CHECK-NEXT: struct LibcallImplPair {
diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt
index b66eae93f9339..8b7bd3997ea27 100644
--- a/llvm/unittests/IR/CMakeLists.txt
+++ b/llvm/unittests/IR/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_unittest(IRTests
PatternMatch.cpp
ShuffleVectorInstTest.cpp
StructuralHashTest.cpp
+ RuntimeLibcallsTest.cpp
TimePassesTest.cpp
TypesTest.cpp
UseTest.cpp
diff --git a/llvm/unittests/IR/RuntimeLibcallsTest.cpp b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
new file mode 100644
index 0000000000000..94ed56e92bd55
--- /dev/null
+++ b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/RuntimeLibcalls.h"
+#include "llvm/ADT/STLExtras.h"
+#include "gtest/gtest.h"
+using namespace llvm;
+
+namespace {
+
+TEST(RuntimeLibcallsTest, LibcallImplByName) {
+ EXPECT_TRUE(RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("").empty());
+ EXPECT_TRUE(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("unknown").empty());
+ EXPECT_TRUE(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("Unsupported").empty());
+ EXPECT_TRUE(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("unsupported").empty());
+
+ for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls()) {
+ const char *Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LC);
+ EXPECT_TRUE(is_contained(
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName(Name), LC));
+ }
+
+ // Test first libcall name
+ EXPECT_EQ(
+ RTLIB::arm64ec__Unwind_Resume,
+ *RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("#_Unwind_Resume")
+ .begin());
+ // Test longest libcall names
+ EXPECT_EQ(RTLIB::__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes,
+ *RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName(
+ "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes")
+ .begin());
+
+ {
+ auto SquirtleSquad =
+ RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("sqrtl");
+ ASSERT_EQ(size(SquirtleSquad), 3);
+ auto I = SquirtleSquad.begin();
+ EXPECT_EQ(*I++, RTLIB::sqrt_f128);
+ EXPECT_EQ(*I++, RTLIB::sqrt_f80);
+ EXPECT_EQ(*I++, RTLIB::sqrt_ppcf128);
+ }
+
+ // Last libcall
+ {
+ auto Truncs = RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("truncl");
+ ASSERT_EQ(size(Truncs), 3);
+ auto I = Truncs.begin();
+ EXPECT_EQ(*I++, RTLIB::trunc_f128);
+ EXPECT_EQ(*I++, RTLIB::trunc_f80);
+ EXPECT_EQ(*I++, RTLIB::trunc_ppcf128);
+ }
+}
+
+} // namespace
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 0fc230c4714f0..775cef22db0b6 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -6,10 +6,15 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "runtime-libcall-emitter"
+
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/SetTheory.h"
@@ -215,6 +220,9 @@ class RuntimeLibcallEmitter {
private:
void emitGetRuntimeLibcallEnum(raw_ostream &OS) const;
+ void emitNameMatchHashTable(raw_ostream &OS,
+ StringToOffsetTable &OffsetTable) const;
+
void emitGetInitRuntimeLibcallNames(raw_ostream &OS) const;
void emitSystemRuntimeLibrarySetCalls(raw_ostream &OS) const;
@@ -255,12 +263,9 @@ class RuntimeLibcallEmitter {
RuntimeLibcallImplDefList.emplace_back(LibCallImplDef, Def2RuntimeLibcall,
LibCallImplEnumVal++);
- RuntimeLibcallImpl &LibCallImpl = RuntimeLibcallImplDefList.back();
-
+ const RuntimeLibcallImpl &LibCallImpl = RuntimeLibcallImplDefList.back();
Def2RuntimeLibcallImpl[LibCallImplDef] = &LibCallImpl;
- // const RuntimeLibcallImpl &LibCallImpl =
- // RuntimeLibcallImplDefList.back();
if (LibCallImpl.isDefault()) {
const RuntimeLibcall *Provides = LibCallImpl.getProvides();
if (!Provides)
@@ -282,6 +287,13 @@ class RuntimeLibcallEmitter {
void run(raw_ostream &OS);
};
+/// Helper struct for the name hash table.
+struct LookupEntry {
+ StringRef FuncName;
+ uint64_t Hash = 0;
+ unsigned TableValue = 0;
+};
+
} // End anonymous namespace.
void RuntimeLibcallEmitter::emitGetRuntimeLibcallEnum(raw_ostream &OS) const {
@@ -295,8 +307,6 @@ void RuntimeLibcallEmitter::emitGetRuntimeLibcallEnum(raw_ostream &OS) const {
OS << " " << Name << " = " << LibCall.getEnumVal() << ",\n";
}
- // TODO: Emit libcall names as string offset table.
-
OS << " UNKNOWN_LIBCALL = " << RuntimeLibcallDefList.size()
<< "\n};\n\n"
"enum LibcallImpl : unsigned short {\n"
@@ -315,8 +325,181 @@ void RuntimeLibcallEmitter::emitGetRuntimeLibcallEnum(raw_ostream &OS) const {
"#endif\n\n";
}
+// StringMap uses xxh3_64bits, truncated to uint32_t.
+static uint64_t hash(StringRef Str) {
+ return static_cast<uint32_t>(xxh3_64bits(Str));
+}
+
+static void emitHashFunction(raw_ostream &OS) {
+ OS << "static inline uint64_t hash(StringRef Str) {\n"
+ " return static_cast<uint32_t>(xxh3_64bits(Str));\n"
+ "}\n\n";
+}
+
+/// Return the table size and maximum number of collisions for the set of hashes.
+static std::pair<int, int>
+computePerfectHashParameters(ArrayRef<uint64_t> Hashes) {
+ const int SizeOverhead = 10;
+ const int NumHashes = Hashes.size();
+
+ // Index derived from hash -> number of collisions.
+ DenseMap<uint64_t, int> Table;
+
+ for (int MaxCollisions = 1;; ++MaxCollisions) {
+ for (int N = NumHashes; N < SizeOverhead * NumHashes; ++N) {
+ Table.clear();
+
+ bool NeedResize = false;
+ for (uint64_t H : Hashes) {
+ uint64_t Idx = H % static_cast<uint64_t>(N);
+ if (++Table[Idx] > MaxCollisions) {
+ // Need to resize the final table if we increased the collision count.
+ NeedResize = true;
+ break;
+ }
+ }
+
+ if (!NeedResize)
+ return {N, MaxCollisions};
+ }
+ }
+}
+
+static std::vector<LookupEntry>
+constructPerfectHashTable(ArrayRef<RuntimeLibcallImpl> Keywords,
+ ArrayRef<uint64_t> Hashes, int Size, int Collisions,
+ StringToOffsetTable &OffsetTable) {
+ DenseSet<StringRef> Seen;
+ std::vector<LookupEntry> Lookup(Size * Collisions);
+
+ for (const RuntimeLibcallImpl &LibCallImpl : Keywords) {
+ StringRef ImplName = LibCallImpl.getLibcallFuncName();
+
+ // We do not want to add repeated entries for cases with the same name, only
+ // an entry for the first, with the name collision enum values immediately
+ // following.
+ if (!Seen.insert(ImplName).second)
+ continue;
+
+ uint64_t HashValue = Hashes[LibCallImpl.getEnumVal() - 1];
+
+ uint64_t Idx = (HashValue % static_cast<uint64_t>(Size)) *
+ static_cast<uint64_t>(Collisions);
+
+ bool Found = false;
+ for (int J = 0; J < Collisions; ++J) {
+ LookupEntry &Entry = Lookup[Idx + J];
+ if (Entry.TableValue == 0) {
+ Entry.FuncName = ImplName;
+ Entry.TableValue = LibCallImpl.getEnumVal();
+ Entry.Hash = HashValue;
+ Found = true;
+ break;
+ }
+ }
+
+ if (!Found)
+ reportFatalInternalError("failure to hash " + ImplName);
+ }
+
+ return Lookup;
+}
+
+/// Generate a hash-table-based lookup by name.
+void RuntimeLibcallEmitter::emitNameMatchHashTable(
+ raw_ostream &OS, StringToOffsetTable &OffsetTable) const {
+ std::vector<uint64_t> Hashes(RuntimeLibcallImplDefList.size());
+
+ size_t MaxFuncNameSize = 0;
+ size_t Index = 0;
+ for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) {
+ StringRef ImplName = LibCallImpl.getLibcallFuncName();
+ MaxFuncNameSize = std::max(MaxFuncNameSize, ImplName.size());
+ Hashes[Index++] = hash(ImplName);
+ }
+
+ LLVM_DEBUG({
+ for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) {
+ StringRef ImplName = LibCallImpl.getLibcallFuncName();
+ if (ImplName.size() == MaxFuncNameSize) {
+ dbgs() << "Maximum runtime libcall name size: " << ImplName << '('
+ << MaxFuncNameSize << ")\n";
+ }
+ }
+ });
+
+ // Early exiting on the symbol name length provides a significant speedup in
+ // the miss case on the set of symbols in a clang binary. Emit this as an
+ // inlinable precondition in the header.
+ //
+ // The empty check is also used to get sensible behavior on anonymous
+ // functions.
+ //
+ // TODO: It may make more sense to split the search further by string size;
+ // there are a few outliers, and most call names are small.
+ OS << "#ifdef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY\n"
+ " size_t Size = Name.size();\n"
+ " if (Size == 0 || Size > "
+ << MaxFuncNameSize
+ << ")\n"
+ " return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported);\n"
+ " return lookupLibcallImplNameImpl(Name);\n"
+ "#endif\n";
+
+ auto [Size, Collisions] = computePerfectHashParameters(Hashes);
+ std::vector<LookupEntry> Lookup = constructPerfectHashTable(
+ RuntimeLibcallImplDefList, Hashes, Size, Collisions, OffsetTable);
+
+ LLVM_DEBUG(dbgs() << "Runtime libcall perfect hashing parameters: Size = "
+ << Size << ", maximum collisions = " << Collisions << '\n');
+
+ OS << "#ifdef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME\n";
+ emitHashFunction(OS);
+
+ OS << "iota_range<RTLIB::LibcallImpl> RTLIB::RuntimeLibcallsInfo::"
+ "lookupLibcallImplNameImpl(StringRef Name) {\n";
+
+ // Emit pairs of (RTLIB::LibcallImpl, string name size); it's important to
+ // avoid strlen on the string table entries.
+ OS << " static constexpr std::pair<uint16_t, uint16_t> HashTableNameToEnum["
+ << Lookup.size() << "] = {\n";
+
+ for (auto [FuncName, Hash, TableVal] : Lookup) {
+ OS << " {" << TableVal << ", " << FuncName.size() << "},";
+
+ if (TableVal != 0) {
+ OS << " // " << format_hex(Hash, 16) << ", " << FuncName;
+ }
+
+ OS << '\n';
+ }
+
+ OS << " };\n\n";
+
+ OS << " unsigned Idx = (hash(Name) % " << Size << ") * " << Collisions
+ << ";\n\n"
+ " for (int I = 0; I != "
+ << Collisions << R"(; ++I) {
+ auto [Entry, StringSize] = HashTableNameToEnum[Idx + I];
+ const uint16_t StrOffset = RuntimeLibcallNameOffsetTable[Entry];
+ StringRef Str(
+ &RTLIB::RuntimeLibcallsInfo::RuntimeLibcallImplNameTableStorage[StrOffset],
+ StringSize);
+ if (Str == Name)
+ return libcallImplNameHit(Entry, StrOffset);
+ }
+
+ return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported);
+}
+)";
+
+ OS << "#endif\n\n";
+}
+
void RuntimeLibcallEmitter::emitGetInitRuntimeLibcallNames(
raw_ostream &OS) const {
+ OS << "#ifdef GET_INIT_RUNTIME_LIBCALL_NAMES\n";
+
// Emit the implementation names
StringToOffsetTable Table(/*AppendZero=*/true,
"RTLIB::RuntimeLibcallsInfo::");
@@ -351,6 +534,10 @@ const uint16_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameOffsetTable[] = {
OS << '\n';
}
OS << "};\n\n";
+
+ OS << "#endif\n\n";
+
+ emitNameMatchHashTable(OS, Table);
}
void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
@@ -531,9 +718,7 @@ void RuntimeLibcallEmitter::run(raw_ostream &OS) {
emitSourceFileHeader("Runtime LibCalls Source Fragment", OS, Records);
emitGetRuntimeLibcallEnum(OS);
- OS << "#ifdef GET_INIT_RUNTIME_LIBCALL_NAMES\n";
emitGetInitRuntimeLibcallNames(OS);
- OS << "#endif\n\n";
OS << "#ifdef GET_SET_TARGET_RUNTIME_LIBCALL_SETS\n";
emitSystemRuntimeLibrarySetCalls(OS);
>From 2072dc6e4b98c379b49b35246b148a2b48850eaf Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu at apple.com>
Date: Thu, 14 Aug 2025 17:05:33 -0700
Subject: [PATCH 45/53] [Support] Add mapped_file_region::sync(), equivalent to
msync (#153632)
---
llvm/include/llvm/Support/FileSystem.h | 5 ++++
llvm/lib/Support/Unix/Path.inc | 6 +++++
llvm/lib/Support/Windows/Path.inc | 8 ++++++
llvm/unittests/Support/Path.cpp | 37 ++++++++++++++++++++++++++
4 files changed, 56 insertions(+)
diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h
index 31fedc37bf776..ee7a77c578747 100644
--- a/llvm/include/llvm/Support/FileSystem.h
+++ b/llvm/include/llvm/Support/FileSystem.h
@@ -1342,6 +1342,11 @@ class mapped_file_region {
LLVM_ABI size_t size() const;
LLVM_ABI char *data() const;
+ /// Write changes to disk and synchronize. Equivalent to POSIX msync. This
+ /// waits for the memory-mapped region to be flushed back to disk and can be
+ /// very slow.
+ LLVM_ABI std::error_code sync() const;
+
/// Get a const view of the data. Modifying this memory has undefined
/// behavior.
LLVM_ABI const char *const_data() const;
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index cc02cae40ec76..31fb1e8fe9b75 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -876,6 +876,12 @@ void mapped_file_region::unmapImpl() {
::munmap(Mapping, Size);
}
+std::error_code mapped_file_region::sync() const {
+ if (::msync(Mapping, Size, MS_SYNC) != 0)
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+}
+
void mapped_file_region::dontNeedImpl() {
assert(Mode == mapped_file_region::readonly);
if (!Mapping)
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index fdf9d540a6488..9001c19c057cf 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -1006,6 +1006,14 @@ void mapped_file_region::unmapImpl() {
void mapped_file_region::dontNeedImpl() {}
+std::error_code mapped_file_region::sync() const {
+ if (!::FlushViewOfFile(Mapping, Size))
+ return mapWindowsError(GetLastError());
+ if (!::FlushFileBuffers(FileHandle))
+ return mapWindowsError(GetLastError());
+ return std::error_code();
+}
+
int mapped_file_region::alignment() {
SYSTEM_INFO SysInfo;
::GetSystemInfo(&SysInfo);
diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp
index 355aa6b9ade06..888729b9dd249 100644
--- a/llvm/unittests/Support/Path.cpp
+++ b/llvm/unittests/Support/Path.cpp
@@ -1471,6 +1471,43 @@ TEST_F(FileSystemTest, FileMapping) {
ASSERT_NO_ERROR(fs::remove(TempPath));
}
+TEST_F(FileSystemTest, FileMappingSync) {
+ // Create a temp file.
+ SmallString<0> TempPath(TestDirectory);
+ sys::path::append(TempPath, "test-%%%%");
+ auto TempFileOrError = fs::TempFile::create(TempPath);
+ ASSERT_TRUE((bool)TempFileOrError);
+ fs::TempFile File = std::move(*TempFileOrError);
+ StringRef Content("hello there");
+ std::string FileName = File.TmpName;
+ ASSERT_NO_ERROR(
+ fs::resize_file_before_mapping_readwrite(File.FD, Content.size()));
+ {
+ // Map in the file and write some content.
+ std::error_code EC;
+ fs::mapped_file_region MFR(fs::convertFDToNativeFile(File.FD),
+ fs::mapped_file_region::readwrite,
+ Content.size(), 0, EC);
+
+ // Keep the file so it can be read.
+ ASSERT_FALSE((bool)File.keep());
+
+ // Write content through mapped memory.
+ ASSERT_NO_ERROR(EC);
+ std::copy(Content.begin(), Content.end(), MFR.data());
+
+ // Synchronize to file system.
+ ASSERT_FALSE((bool)MFR.sync());
+
+ // Check the file content using file IO APIs.
+ auto Buffer = MemoryBuffer::getFile(FileName);
+ ASSERT_TRUE((bool)Buffer);
+ ASSERT_EQ(Content, Buffer->get()->getBuffer());
+ }
+ // Manually remove the test file.
+ ASSERT_FALSE((bool)fs::remove(FileName));
+}
+
TEST(Support, NormalizePath) {
// Input, Expected Win, Expected Posix
using TestTuple = std::tuple<const char *, const char *, const char *>;
>From 6677c3273b0491367acf75b0c5c6967811453e31 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 14 Aug 2025 17:17:16 -0700
Subject: [PATCH 46/53] [AMDGPU] Fix the comment wrt SSrc_* RCs. NFC. (#153711)
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 81655f5a829fb..0293d4018770f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1166,7 +1166,8 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName>
}
//===----------------------------------------------------------------------===//
-// SSrc_* Operands with an SGPR or a 32-bit immediate
+// SSrc_* Operands with an SGPR, a 32-bit immediate, or a 64-bit immediate
+// if supported by the target.
//===----------------------------------------------------------------------===//
class SrcRegOrImm9<RegisterClass regClass, string operandType>
>From ca222194c2d6afd38e916d35c81e91c4a26dd0c2 Mon Sep 17 00:00:00 2001
From: Oliver Hunt <oliver at apple.com>
Date: Thu, 14 Aug 2025 17:20:27 -0700
Subject: [PATCH 47/53] [clang][Obj-C][PAC] Make block descriptor pointer
signing configurable (#153700)
Pointer auth protection of the block descriptor pointer is only supported in
some constrained environments, so it does need to be configurable.
The first PR to protect block metadata made it non-configurable because we
believed that was an option, but we subsequently realised it needs to remain
configurable.
This PR revives the flags that permit this.
---
clang/include/clang/Basic/Features.def | 2 +-
clang/include/clang/Basic/LangOptions.def | 2 ++
clang/include/clang/Driver/Options.td | 1 +
clang/lib/Frontend/CompilerInvocation.cpp | 11 ++++++++---
.../ptrauth-block-descriptor-pointer.m | 15 +++++++++++++--
clang/test/CodeGenObjC/ptrauth-block-isa.m | 2 +-
6 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 0f6cd005bfd03..b9efc6a6a2e9d 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -155,7 +155,7 @@ FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtr
FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
FEATURE(ptrauth_type_info_vtable_pointer_discrimination, LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
-FEATURE(ptrauth_signed_block_descriptors, LangOpts.PointerAuthCalls)
+FEATURE(ptrauth_signed_block_descriptors, LangOpts.PointerAuthBlockDescriptorPointers)
FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFunctionTypeDiscrimination)
FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 08d98a77e0252..f094ba112988f 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -136,6 +136,8 @@ LANGOPT(PointerAuthObjcInterfaceSel, 1, 0, NotCompatible, "authentication of SEL
LANGOPT(PointerAuthObjcInterfaceSelKey, 16, 0, NotCompatible, "authentication key for SEL fields of ObjC interfaces")
LANGOPT(PointerAuthObjcClassROPointers, 1, 0, Benign, "class_ro_t pointer authentication")
+LANGOPT(PointerAuthBlockDescriptorPointers, 1, 0, NotCompatible, "enable signed block descriptors")
+
LANGOPT(DoubleSquareBracketAttributes, 1, 0, NotCompatible, "'[[]]' attributes extension for all language standard modes")
LANGOPT(ExperimentalLateParseAttributes, 1, 0, NotCompatible, "experimental late parsing of attributes")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7712a49cef154..858f37c392107 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4546,6 +4546,7 @@ defm aarch64_jump_table_hardening: OptInCC1FFlag<"aarch64-jump-table-hardening",
defm ptrauth_objc_isa : OptInCC1FFlag<"ptrauth-objc-isa", "Enable signing and authentication of Objective-C object's 'isa' field">;
defm ptrauth_objc_interface_sel : OptInCC1FFlag<"ptrauth-objc-interface-sel", "Enable signing and authentication of Objective-C object's 'SEL' fields">;
defm ptrauth_objc_class_ro : OptInCC1FFlag<"ptrauth-objc-class-ro", "Enable signing and authentication for ObjC class_ro pointers">;
+defm ptrauth_block_descriptor_pointers : OptInCC1FFlag<"ptrauth-block-descriptor-pointers", "Enable signing and authentication of block descriptors">;
}
def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index a4d18966be35f..da96352e1d82c 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1548,9 +1548,10 @@ void CompilerInvocation::setDefaultPointerAuthOptions(
PointerAuthSchema(Key::ASIA, true, Discrimination::None);
Opts.BlockByrefHelperFunctionPointers =
PointerAuthSchema(Key::ASIA, true, Discrimination::None);
- Opts.BlockDescriptorPointers =
- PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
- BlockDescriptorConstantDiscriminator);
+ if (LangOpts.PointerAuthBlockDescriptorPointers)
+ Opts.BlockDescriptorPointers =
+ PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
+ BlockDescriptorConstantDiscriminator);
Opts.ObjCMethodListFunctionPointers =
PointerAuthSchema(Key::ASIA, true, Discrimination::None);
@@ -3608,6 +3609,8 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
GenerateArg(Consumer, OPT_fptrauth_objc_interface_sel);
if (Opts.PointerAuthObjcClassROPointers)
GenerateArg(Consumer, OPT_fptrauth_objc_class_ro);
+ if (Opts.PointerAuthBlockDescriptorPointers)
+ GenerateArg(Consumer, OPT_fptrauth_block_descriptor_pointers);
}
static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
@@ -3631,6 +3634,8 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
Opts.PointerAuthELFGOT = Args.hasArg(OPT_fptrauth_elf_got);
Opts.AArch64JumpTableHardening =
Args.hasArg(OPT_faarch64_jump_table_hardening);
+ Opts.PointerAuthBlockDescriptorPointers =
+ Args.hasArg(OPT_fptrauth_block_descriptor_pointers);
Opts.PointerAuthObjcIsa = Args.hasArg(OPT_fptrauth_objc_isa);
Opts.PointerAuthObjcClassROPointers = Args.hasArg(OPT_fptrauth_objc_class_ro);
Opts.PointerAuthObjcInterfaceSel =
diff --git a/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
index 559cddfd4e866..b51670fd6459a 100644
--- a/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
+++ b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -triple arm64e-apple-ios -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -fptrauth-block-descriptor-pointers -triple arm64e-apple-ios -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -triple arm64e-apple-ios -DNO_BLOCK_DESC_AUTH -emit-llvm -o - %s | FileCheck %s --check-prefix=NODESCRIPTORAUTH
+#ifndef NO_BLOCK_DESC_AUTH
_Static_assert(__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should set ptrauth_signed_block_descriptors");
+#else
+_Static_assert(!__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should not be enabled by default");
+#endif
void a() {
// Test out a global block.
@@ -8,9 +13,11 @@ void a() {
}
// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr [[BLOCK_DESCRIPTOR_NAME]], i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
+// NODESCRIPTORAUTH: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+// NODESCRIPTORAUTH: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr [[BLOCK_DESCRIPTOR_NAME]] }
-// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr @"__block_descriptor_32_e5_v8\01?0l", i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
void b(int p) {
// CHECK-LABEL: define void @b
@@ -25,4 +32,8 @@ void b(int p) {
// CHECK: [[SIGNED_REF:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @"__block_descriptor_36_e5_v8\01?0l" to i64), i32 2, i64 [[BLENDED]])
// CHECK: [[SIGNED_REF_PTR:%.*]] = inttoptr i64 [[SIGNED_REF]] to ptr
// CHECK: store ptr [[SIGNED_REF_PTR]], ptr [[BLOCK_DESCRIPTOR_REF]]
+
+ // NODESCRIPTORAUTH: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32 }>
+ // NODESCRIPTORAUTH: [[BLOCK_DESCRIPTOR_REF:%.*]] = getelementptr inbounds nuw <{ {{.*}} }>, ptr [[BLOCK]], i32 0, i32 4
+ // NODESCRIPTORAUTH: store ptr @"__block_descriptor_36_e5_v8\01?0l", ptr [[BLOCK_DESCRIPTOR_REF]]
}
diff --git a/clang/test/CodeGenObjC/ptrauth-block-isa.m b/clang/test/CodeGenObjC/ptrauth-block-isa.m
index c37fe8b0d7fec..248e57769ba1e 100644
--- a/clang/test/CodeGenObjC/ptrauth-block-isa.m
+++ b/clang/test/CodeGenObjC/ptrauth-block-isa.m
@@ -2,7 +2,7 @@
void (^globalblock)(void) = ^{};
// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }, comdat, align 8
-// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr @__block_literal_global), i32 1342177280, i32 0, ptr ptrauth (ptr @globalblock_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr [[BLOCK_DESCRIPTOR_NAME]], i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr @__block_literal_global), i32 1342177280, i32 0, ptr ptrauth (ptr @globalblock_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr [[BLOCK_DESCRIPTOR_NAME]] }
@interface A
- (int) count;
>From 79b8605ed6170dd22e0fe836f2031cc47ac1a6fd Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Thu, 14 Aug 2025 17:20:54 -0700
Subject: [PATCH 48/53] [RISCV][NFC] Make the pointer in the test case for
#153709 non-null
The snippet originally came from llvm-reduce, but we probably shouldn't use a
null pointer in the actual test case.
NFC.
---
.../RISCV/rvv/incorrect-extract-subvector-combine.ll | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll b/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll
index 6a0c03f339717..3b0d14a5591fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/incorrect-extract-subvector-combine.ll
@@ -4,11 +4,11 @@
; Previously, an incorrect (extract_subvector (extract_subvector X, C), 0) DAG combine crashed
; this snippet.
-define <8 x i16> @gsm_encode() {
+define <8 x i16> @gsm_encode(ptr %p) {
; CHECK-LABEL: gsm_encode:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 19, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (zero)
+; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vslidedown.vi v9, v8, 12
; CHECK-NEXT: vmv.x.s a0, v9
; CHECK-NEXT: vsetivli zero, 8, e16, mf4, ta, ma
@@ -29,7 +29,7 @@ define <8 x i16> @gsm_encode() {
; CHECK-NEXT: vand.vv v8, v8, v9
; CHECK-NEXT: ret
entry:
- %0 = load <19 x i16>, ptr null, align 2
+ %0 = load <19 x i16>, ptr %p, align 2
%1 = shufflevector <19 x i16> zeroinitializer, <19 x i16> %0, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 28, i32 31, i32 poison, i32 poison>
%2 = shufflevector <9 x i16> %1, <9 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
ret <8 x i16> %2
>From 3222124e5bcfc78d9c93d12913fd9df7a2098dcd Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Fri, 15 Aug 2025 00:24:52 +0000
Subject: [PATCH 49/53] [gn build] Port 769a9058c8d0
---
llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
index 0d162ff0f9d57..229586a9b0e3c 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
@@ -43,6 +43,7 @@ unittest("IRTests") {
"PassBuilderCallbacksTest.cpp",
"PassManagerTest.cpp",
"PatternMatch.cpp",
+ "RuntimeLibcallsTest.cpp",
"ShuffleVectorInstTest.cpp",
"StructuralHashTest.cpp",
"TimePassesTest.cpp",
>From 5341147155a0c11f0a5bcf5079eb508499b1b531 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 15 Aug 2025 09:55:39 +0900
Subject: [PATCH 50/53] RuntimeLibcalls: Return StringRef for libcall names
(#153209)
This does not yet fully propagate down into the TargetLowering uses, many of
which rely on null checks on the returned value.
---
llvm/benchmarks/RuntimeLibcalls.cpp | 6 ++----
llvm/include/llvm/CodeGen/TargetLowering.h | 12 ++++++++----
llvm/include/llvm/IR/RuntimeLibcalls.h | 17 ++++++++---------
llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 2 +-
llvm/lib/LTO/LTO.cpp | 2 +-
.../WebAssemblyRuntimeLibcallSignatures.cpp | 4 ++--
.../Transforms/Utils/DeclareRuntimeLibcalls.cpp | 2 +-
llvm/unittests/IR/RuntimeLibcallsTest.cpp | 2 +-
8 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/llvm/benchmarks/RuntimeLibcalls.cpp b/llvm/benchmarks/RuntimeLibcalls.cpp
index 47f68abff1e0d..81a5a24ec8f93 100644
--- a/llvm/benchmarks/RuntimeLibcalls.cpp
+++ b/llvm/benchmarks/RuntimeLibcalls.cpp
@@ -22,10 +22,8 @@ static constexpr unsigned MaxFuncNameSize = 53;
static std::vector<StringRef> getLibcallNameStringRefs() {
std::vector<StringRef> Names(RTLIB::NumLibcallImpls);
// Keep the strlens on the StringRef construction out of the benchmark loop.
- for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls()) {
- const char *Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LC);
- Names[LC] = StringRef(Name);
- }
+ for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls())
+ Names[LC] = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LC);
return Names;
}
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 20e4dfaaff6e1..ed7495694cc70 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3552,15 +3552,19 @@ class LLVM_ABI TargetLoweringBase {
/// Get the libcall routine name for the specified libcall.
const char *getLibcallName(RTLIB::Libcall Call) const {
- return Libcalls.getLibcallName(Call);
+ // FIXME: Return StringRef
+ return Libcalls.getLibcallName(Call).data();
}
/// Get the libcall routine name for the specified libcall implementation
- const char *getLibcallImplName(RTLIB::LibcallImpl Call) const {
- return Libcalls.getLibcallImplName(Call);
+ static StringRef getLibcallImplName(RTLIB::LibcallImpl Call) {
+ return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Call);
}
- const char *getMemcpyName() const { return Libcalls.getMemcpyName(); }
+ const char *getMemcpyName() const {
+ // FIXME: Return StringRef
+ return Libcalls.getMemcpyName().data();
+ }
/// Get the comparison predicate that's to be used to test the result of the
/// comparison libcall against zero. This should only be used with
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 078098eeb7148..620774fd296e3 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -77,17 +77,15 @@ struct RuntimeLibcallsInfo {
/// Get the libcall routine name for the specified libcall.
// FIXME: This should be removed. Only LibcallImpl should have a name.
- const char *getLibcallName(RTLIB::Libcall Call) const {
+ StringRef getLibcallName(RTLIB::Libcall Call) const {
return getLibcallImplName(LibcallImpls[Call]);
}
/// Get the libcall routine name for the specified libcall implementation.
- // FIXME: Change to return StringRef
- static const char *getLibcallImplName(RTLIB::LibcallImpl CallImpl) {
+ static StringRef getLibcallImplName(RTLIB::LibcallImpl CallImpl) {
if (CallImpl == RTLIB::Unsupported)
- return nullptr;
- return RuntimeLibcallImplNameTable[RuntimeLibcallNameOffsetTable[CallImpl]]
- .data();
+ return StringRef();
+ return RuntimeLibcallImplNameTable[RuntimeLibcallNameOffsetTable[CallImpl]];
}
/// Return the lowering's selection of implementation call for \p Call
@@ -119,9 +117,10 @@ struct RuntimeLibcallsInfo {
/// Return a function name compatible with RTLIB::MEMCPY, or an empty
/// StringRef if fully unsupported.
- const char *getMemcpyName() const {
- if (const char *Memcpy = getLibcallName(RTLIB::MEMCPY))
- return Memcpy;
+ StringRef getMemcpyName() const {
+ RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
+ if (Memcpy != RTLIB::Unsupported)
+ return getLibcallImplName(Memcpy);
// Fallback to memmove if memcpy isn't available.
return getLibcallName(RTLIB::MEMMOVE);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 9fa96e7372961..96c9cde622b45 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -145,7 +145,7 @@ static bool lowerObjCCall(Function &F, RTLIB::LibcallImpl NewFn,
// FIXME: When RuntimeLibcalls is an analysis, check if the function is really
// supported, and go through RTLIB::Libcall.
- const char *NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn);
+ StringRef NewFnName = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(NewFn);
// If we haven't already looked up this function, check to see if the
// program already contains a function with this name.
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 0323b4d433b87..35d24c17bbd93 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1422,7 +1422,7 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
for (RTLIB::LibcallImpl Impl : LibcallImpls) {
if (Impl != RTLIB::Unsupported)
- LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl));
+ LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data());
}
return LibcallSymbols;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 4548a7520b3b4..45b0e7dc12263 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -533,8 +533,8 @@ struct StaticLibcallNameMap {
// different libcalls.
RTLIB::RuntimeLibcallsInfo RTCI(TT);
for (RTLIB::Libcall LC : RTLIB::libcalls()) {
- const char *NameLibcall = RTCI.getLibcallName(LC);
- if (NameLibcall != nullptr &&
+ StringRef NameLibcall = RTCI.getLibcallName(LC);
+ if (!NameLibcall.empty() &&
getRuntimeLibcallSignatures().Table[LC] != unsupported) {
assert(!Map.contains(NameLibcall) &&
"duplicate libcall names in name map");
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 540039b7d2cbd..0642d51cd2c21 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -30,7 +30,7 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
FunctionType *FuncTy =
FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true);
- const char *FuncName = RTLCI.getLibcallImplName(Impl);
+ StringRef FuncName = RTLCI.getLibcallImplName(Impl);
M.getOrInsertFunction(FuncName, FuncTy);
}
diff --git a/llvm/unittests/IR/RuntimeLibcallsTest.cpp b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
index 94ed56e92bd55..012316801859c 100644
--- a/llvm/unittests/IR/RuntimeLibcallsTest.cpp
+++ b/llvm/unittests/IR/RuntimeLibcallsTest.cpp
@@ -23,7 +23,7 @@ TEST(RuntimeLibcallsTest, LibcallImplByName) {
RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName("unsupported").empty());
for (RTLIB::LibcallImpl LC : RTLIB::libcall_impls()) {
- const char *Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LC);
+ StringRef Name = RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LC);
EXPECT_TRUE(is_contained(
RTLIB::RuntimeLibcallsInfo::lookupLibcallImplName(Name), LC));
}
From 1666c0de52c58e5c3a21f086469a6ebad0ea046d Mon Sep 17 00:00:00 2001
From: Brock Denson <brock.denson at virscient.com>
Date: Thu, 7 Aug 2025 10:09:25 -0500
Subject: [PATCH 51/53] [clang] unrecognized html tag causing undesirable
comment lexing. fixes #32680
---
clang/test/AST/ast-dump-comment.cpp | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 820e6e97f21e3..9659f9eec1adc 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -146,3 +146,12 @@ int Test_UnderscoreInSpecialCommand;
// CHECK: VarDecl{{.*}}Test_UnderscoreInSpecialCommand 'int'
// CHECK: InlineCommandComment{{.*}} Name="thread_safe" RenderNormal
// CHECK-NEXT: TextComment{{.*}} Text=" test for underscore in special command"
+
+/// \param[out] Aaa <summary>Short summary</summary>
+int Test_HTMLSummaryTag(int Aaa);
+// CHECK: FunctionDecl{{.*}}Test_HTMLSummaryTag
+// CHECK: ParamCommandComment{{.*}} [out] explicitly Param="Aaa"
+// CHECK-NEXT: ParagraphComment
+// CHECK: HTMLStartTagComment{{.*}} Name="summary"
+// CHECK-NEXT: TextComment{{.*}} Text="Short summary"
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="summary"
From 21d56815a15cbbba46041ef3a02758532889397a Mon Sep 17 00:00:00 2001
From: Brock Denson <brock.denson at virscient.com>
Date: Thu, 7 Aug 2025 10:09:25 -0500
Subject: [PATCH 52/53] [clang] unrecognized html tag causing undesirable
comment lexing. fixes #32680
---
clang/docs/ReleaseNotes.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index cecc5696cb95f..4b760d3a53bf2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -224,6 +224,7 @@ Bug Fixes to AST Handling
legal representation. This is fixed because ElaboratedTypes don't exist anymore. (#GH43179) (#GH68670) (#GH92757)
- Fix unrecognized html tag causing undesirable comment lexing (#GH152944)
- Fix comment lexing of special command names (#GH152943)
Miscellaneous Bug Fixes
^^^^^^^^^^^^^^^^^^^^^^^
From 5a4358abdab15177c97f0989e3e38b78237a160d Mon Sep 17 00:00:00 2001
From: Brock Denson <brock.denson at virscient.com>
Date: Fri, 15 Aug 2025 21:18:33 -0500
Subject: [PATCH 53/53] add tests for new tags
---
clang/test/AST/ast-dump-comment.cpp | 40 +++++++++++++++++++++++------
1 file changed, 32 insertions(+), 8 deletions(-)
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 9659f9eec1adc..b5dbe2e317d8c 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -147,11 +147,35 @@ int Test_UnderscoreInSpecialCommand;
// CHECK: InlineCommandComment{{.*}} Name="thread_safe" RenderNormal
// CHECK-NEXT: TextComment{{.*}} Text=" test for underscore in special command"
-/// \param[out] Aaa <summary>Short summary</summary>
-int Test_HTMLSummaryTag(int Aaa);
-// CHECK: FunctionDecl{{.*}}Test_HTMLSummaryTag
-// CHECK: ParamCommandComment{{.*}} [out] explicitly Param="Aaa"
-// CHECK-NEXT: ParagraphComment
-// CHECK: HTMLStartTagComment{{.*}} Name="summary"
-// CHECK-NEXT: TextComment{{.*}} Text="Short summary"
-// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="summary"
+/// <details>
+/// <summary>
+/// Summary
+/// </summary>
+/// <p>Details</p>
+/// </details>
+///
+/// Some <mark>highlighting</mark>
+///
+/// <figure>
+/// <img src="pic.jpg">
+/// <figcaption>Figure 1</figcaption>
+/// </figure>
+int Test_AdditionalHTMLTags(int Aaa);
+// CHECK: FunctionDecl{{.*}}Test_AdditionalHTMLTags 'int (int)'
+// CHECK: HTMLStartTagComment{{.*}} Name="details"
+// CHECK: HTMLStartTagComment{{.*}} Name="summary"
+// CHECK-NEXT: TextComment{{.*}} Text=" Summary"
+// CHECK: HTMLEndTagComment{{.*}} Name="summary"
+// CHECK: HTMLStartTagComment{{.*}} Name="p"
+// CHECK-NEXT: TextComment{{.*}} Text="Details"
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="p"
+// CHECK: HTMLEndTagComment{{.*}} Name="details"
+// CHECK: HTMLStartTagComment{{.*}} Name="mark"
+// CHECK-NEXT: TextComment{{.*}} Text="highlighting"
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="mark"
+// CHECK: HTMLStartTagComment{{.*}} Name="figure"
+// CHECK: HTMLStartTagComment{{.*}} Name="img" Attrs: "src="pic.jpg"
+// CHECK: HTMLStartTagComment{{.*}} Name="figcaption"
+// CHECK-NEXT: TextComment{{.*}} Text="Figure 1"
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="figcaption"
+// CHECK: HTMLEndTagComment{{.*}} Name="figure"