[llvm-branch-commits] [clang] [compiler-rt] [llvm] [mlir] [ThinLTO][Bitcode] Generate import type in bitcode (PR #87600)

Sun May 19 22:34:06 PDT 2024

https://github.com/minglotus-6 updated https://github.com/llvm/llvm-project/pull/87600

>From 001a785f664e3a16e61d1e350ea060b829f1856c Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Mon, 13 May 2024 20:51:25 -0700
Subject: [PATCH 01/44] update this patch as the second one

---
 llvm/include/llvm/Bitcode/BitcodeWriter.h     |  9 +++--
 .../llvm/LTO/legacy/ThinLTOCodeGenerator.h    |  5 ++-
 .../llvm/Transforms/IPO/FunctionImport.h      |  3 +-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     | 38 +++++++++++++++----
 llvm/lib/LTO/LTO.cpp                          |  7 +++-
 llvm/lib/LTO/ThinLTOCodeGenerator.cpp         | 11 ++++--
 llvm/lib/Transforms/IPO/FunctionImport.cpp    |  5 ++-
 .../ThinLTO/X86/import_callee_declaration.ll  | 13 +++----
 llvm/tools/llvm-lto/llvm-lto.cpp              |  5 ++-
 9 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h
index 248d33f4502ef..a343f0e057631 100644
--- a/llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -102,7 +102,8 @@ class raw_ostream;
 
     void writeIndex(
         const ModuleSummaryIndex *Index,
-        const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex);
+        const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+        const GVSummaryPtrSet *DecSummaries);
   };
 
   /// Write the specified module to the specified raw output stream.
@@ -147,10 +148,12 @@ class raw_ostream;
   /// where it will be written in a new bitcode block. This is used when
   /// writing the combined index file for ThinLTO. When writing a subset of the
   /// index for a distributed backend, provide the \p ModuleToSummariesForIndex
-  /// map.
+  /// map. \p DecSummaries specifies the set of summaries for which the
+  /// corresponding value should be imported as a declaration (prototype).
   void writeIndexToFile(const ModuleSummaryIndex &Index, raw_ostream &Out,
                         const std::map<std::string, GVSummaryMapTy>
-                            *ModuleToSummariesForIndex = nullptr);
+                            *ModuleToSummariesForIndex = nullptr,
+                        const GVSummaryPtrSet *DecSummaries = nullptr);
 
   /// If EmbedBitcode is set, save a copy of the llvm IR as data in the
   ///  __LLVM,__bitcode section (.llvmbc on non-MacOS).
diff --git a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index c450acda82ad0..f1337e82485c9 100644
--- a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -271,12 +271,13 @@ class ThinLTOCodeGenerator {
                          const lto::InputFile &File);
 
   /**
-   * Compute the list of summaries needed for importing into module.
+   * Compute the list of summaries and the subset of declaration summaries
+   * needed for importing into module.
    */
   void gatherImportedSummariesForModule(
       Module &Module, ModuleSummaryIndex &Index,
       std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
-      const lto::InputFile &File);
+      GVSummaryPtrSet &DecSummaries, const lto::InputFile &File);
 
   /**
    * Perform internalization. Index is updated to reflect linkage changes.
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 024bba8105b89..f8b98d5f81770 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -216,7 +216,8 @@ void gatherImportedSummariesForModule(
     StringRef ModulePath,
     const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
     const FunctionImporter::ImportMapTy &ImportList,
-    std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
+    std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
+    GVSummaryPtrSet &DecSummaries);
 
 /// Emit into \p OutputFilename the files module \p ModulePath will import from.
 std::error_code EmitImportsFiles(
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 6d01e3b4d8218..7b89424194f9b 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -428,6 +428,11 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
   /// The combined index to write to bitcode.
   const ModuleSummaryIndex &Index;
 
+  /// When writing combined summaries, provides the set of global value
+  /// summaries for which the value (function, function alias, etc) should be
+  /// imported as a declaration.
+  const GVSummaryPtrSet *DecSummaries = nullptr;
+
   /// When writing a subset of the index for distributed backends, client
   /// provides a map of modules to the corresponding GUIDs/summaries to write.
   const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex;
@@ -452,11 +457,16 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
   /// Constructs a IndexBitcodeWriter object for the given combined index,
   /// writing to the provided \p Buffer. When writing a subset of the index
   /// for a distributed backend, provide a \p ModuleToSummariesForIndex map.
+  /// If provided, \p DecSummaries specifies the set of summaries for
+  /// which the corresponding functions or aliased functions should be imported
+  /// as a declaration (but not definition) for each module.
   IndexBitcodeWriter(BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder,
                      const ModuleSummaryIndex &Index,
+                     const GVSummaryPtrSet *DecSummaries = nullptr,
                      const std::map<std::string, GVSummaryMapTy>
                          *ModuleToSummariesForIndex = nullptr)
       : BitcodeWriterBase(Stream, StrtabBuilder), Index(Index),
+        DecSummaries(DecSummaries),
         ModuleToSummariesForIndex(ModuleToSummariesForIndex) {
     // Assign unique value ids to all summaries to be written, for use
     // in writing out the call graph edges. Save the mapping from GUID
@@ -1202,7 +1212,8 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) {
 
 // Decode the flags for GlobalValue in the summary. See getDecodedGVSummaryFlags
 // in BitcodeReader.cpp.
-static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
+static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags,
+                                         bool ImportAsDecl = false) {
   uint64_t RawFlags = 0;
 
   RawFlags |= Flags.NotEligibleToImport; // bool
@@ -1217,7 +1228,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
 
   RawFlags |= (Flags.Visibility << 8); // 2 bits
 
-  RawFlags |= (Flags.ImportType << 10); // 1 bit
+  unsigned ImportType = Flags.ImportType | ImportAsDecl;
+  RawFlags |= (ImportType << 10); // 1 bit
 
   return RawFlags;
 }
@@ -4543,6 +4555,12 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  auto shouldImportValueAsDecl = [&](GlobalValueSummary *GVS) -> bool {
+    if (DecSummaries == nullptr)
+      return false;
+    return DecSummaries->contains(GVS);
+  };
+
   // The aliases are emitted as a post-pass, and will point to the value
   // id of the aliasee. Save them in a vector for post-processing.
   SmallVector<AliasSummary *, 64> Aliases;
@@ -4653,7 +4671,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     NameVals.push_back(*ValueId);
     assert(ModuleIdMap.count(FS->modulePath()));
     NameVals.push_back(ModuleIdMap[FS->modulePath()]);
-    NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
+    NameVals.push_back(
+        getEncodedGVSummaryFlags(FS->flags(), shouldImportValueAsDecl(FS)));
     NameVals.push_back(FS->instCount());
     NameVals.push_back(getEncodedFFlags(FS->fflags()));
     NameVals.push_back(FS->entryCount());
@@ -4702,7 +4721,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     NameVals.push_back(AliasValueId);
     assert(ModuleIdMap.count(AS->modulePath()));
     NameVals.push_back(ModuleIdMap[AS->modulePath()]);
-    NameVals.push_back(getEncodedGVSummaryFlags(AS->flags()));
+    NameVals.push_back(
+        getEncodedGVSummaryFlags(AS->flags(), shouldImportValueAsDecl(AS)));
     auto AliaseeValueId = SummaryToValueIdMap[&AS->getAliasee()];
     assert(AliaseeValueId);
     NameVals.push_back(AliaseeValueId);
@@ -5036,8 +5056,9 @@ void BitcodeWriter::writeModule(const Module &M,
 
 void BitcodeWriter::writeIndex(
     const ModuleSummaryIndex *Index,
-    const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
-  IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index,
+    const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+    const GVSummaryPtrSet *DecSummaries) {
+  IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index, DecSummaries,
                                  ModuleToSummariesForIndex);
   IndexWriter.write();
 }
@@ -5090,12 +5111,13 @@ void IndexBitcodeWriter::write() {
 // index for a distributed backend, provide a \p ModuleToSummariesForIndex map.
 void llvm::writeIndexToFile(
     const ModuleSummaryIndex &Index, raw_ostream &Out,
-    const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
+    const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+    const GVSummaryPtrSet *DecSummaries) {
   SmallVector<char, 0> Buffer;
   Buffer.reserve(256 * 1024);
 
   BitcodeWriter Writer(Buffer);
-  Writer.writeIndex(&Index, ModuleToSummariesForIndex);
+  Writer.writeIndex(&Index, ModuleToSummariesForIndex, DecSummaries);
   Writer.writeStrtab();
 
   Out.write((char *)&Buffer.front(), Buffer.size());
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 2bdfd03580145..79849f7a4ce40 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1399,10 +1399,12 @@ class lto::ThinBackendProc {
                   llvm::StringRef ModulePath,
                   const std::string &NewModulePath) {
     std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+    GVSummaryPtrSet DeclarationSummaries;
 
     std::error_code EC;
     gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
-                                     ImportList, ModuleToSummariesForIndex);
+                                     ImportList, ModuleToSummariesForIndex,
+                                     DeclarationSummaries);
 
     raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
                       sys::fs::OpenFlags::OF_None);
@@ -1410,7 +1412,8 @@ class lto::ThinBackendProc {
       return errorCodeToError(EC);
 
     // TODO: Serialize declaration bits to bitcode.
-    writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
+    writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
+                     &DeclarationSummaries);
 
     if (ShouldEmitImportsFiles) {
       EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 8f517eb50dc76..dc2d73b0130b5 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -766,7 +766,7 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
 void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
     Module &TheModule, ModuleSummaryIndex &Index,
     std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
-    const lto::InputFile &File) {
+    GVSummaryPtrSet &DecSummaries, const lto::InputFile &File) {
   auto ModuleCount = Index.modulePaths().size();
   auto ModuleIdentifier = TheModule.getModuleIdentifier();
 
@@ -796,7 +796,7 @@ void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
 
   llvm::gatherImportedSummariesForModule(
       ModuleIdentifier, ModuleToDefinedGVSummaries,
-      ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
+      ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
 }
 
 /**
@@ -832,10 +832,15 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName,
                            IsPrevailing(PrevailingCopy), ImportLists,
                            ExportLists);
 
+  // 'EmitImportsFiles' emits the list of modules from which to import from, and
+  // the set of keys in `ModuleToSummariesForIndex` should be a superset of keys
+  // in `ModuleToDecSummaries`, so no need to use `ModuleToDecSummaries` in
+  // `EmitImportFiles`.
+  GVSummaryPtrSet DecSummaries;
   std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
   llvm::gatherImportedSummariesForModule(
       ModuleIdentifier, ModuleToDefinedGVSummaries,
-      ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
+      ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
 
   std::error_code EC;
   if ((EC = EmitImportsFiles(ModuleIdentifier, OutputName,
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index e99fb554cd654..548b7c8bc4303 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1436,7 +1436,8 @@ void llvm::gatherImportedSummariesForModule(
     StringRef ModulePath,
     const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
     const FunctionImporter::ImportMapTy &ImportList,
-    std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+    std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
+    GVSummaryPtrSet &DecSummaries) {
   // Include all summaries from the importing module.
   ModuleToSummariesForIndex[std::string(ModulePath)] =
       ModuleToDefinedGVSummaries.lookup(ModulePath);
@@ -1451,7 +1452,7 @@ void llvm::gatherImportedSummariesForModule(
       assert(DS != DefinedGVSummaries.end() &&
              "Expected a defined summary for imported global value");
       if (Type == GlobalValueSummary::Declaration)
-        continue;
+        DecSummaries.insert(DS->second);
 
       SummariesForIndex[GUID] = DS->second;
     }
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
index b4ce10270e026..a9b30036fc435 100644
--- a/llvm/test/ThinLTO/X86/import_callee_declaration.ll
+++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
@@ -48,20 +48,19 @@
 ; First disassemble per-module summary and find out the GUID for {large_func, large_indirect_callee}.
 ;
 ; RUN: llvm-dis lib.bc -o - | FileCheck %s --check-prefix=LIB-DIS
+; LIB-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
 ; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
 ; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
-; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
 ;
-; Secondly disassemble main's combined summary and test that large callees are
-; not imported as declarations yet.
-; TODO: Serialize declaration bit and test declaration bits are correctly set.
+; Secondly disassemble main's combined summary and verify the import type of
+; these two GUIDs are declaration.
 ;
 ; RUN: llvm-dis main.bc.thinlto.bc -o - | FileCheck %s --check-prefix=MAIN-DIS
 ;
 ; MAIN-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
-; MAIN-DIS-NOT: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration)
+; MAIN-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), aliasee: [[LARGEINDIRECT]])))
 
 ; Run in-process ThinLTO and tests that
 ; 1. `callee` remains internalized even if the symbols of its callers
diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp
index f310097eec634..8218bd5a74ea3 100644
--- a/llvm/tools/llvm-lto/llvm-lto.cpp
+++ b/llvm/tools/llvm-lto/llvm-lto.cpp
@@ -692,8 +692,9 @@ class ThinLTOProcessing {
       // Build a map of module to the GUIDs and summary objects that should
       // be written to its index.
       std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+      GVSummaryPtrSet DecSummaries;
       ThinGenerator.gatherImportedSummariesForModule(
-          *TheModule, *Index, ModuleToSummariesForIndex, *Input);
+          *TheModule, *Index, ModuleToSummariesForIndex, DecSummaries, *Input);
 
       std::string OutputName = OutputFilename;
       if (OutputName.empty()) {
@@ -703,7 +704,7 @@ class ThinLTOProcessing {
       std::error_code EC;
       raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::OF_None);
       error(EC, "error opening the file '" + OutputName + "'");
-      writeIndexToFile(*Index, OS, &ModuleToSummariesForIndex);
+      writeIndexToFile(*Index, OS, &ModuleToSummariesForIndex, &DecSummaries);
     }
   }
 

>From 321f6aabb60ff057693a65976bba6c77592bd825 Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Mon, 13 May 2024 21:20:22 -0700
Subject: [PATCH 02/44] update stale comment and use 'DAG' for check lines

---
 llvm/lib/LTO/ThinLTOCodeGenerator.cpp             |  3 +--
 .../test/ThinLTO/X86/import_callee_declaration.ll | 15 ++++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index dc2d73b0130b5..b054b42b63777 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -834,8 +834,7 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName,
 
   // 'EmitImportsFiles' emits the list of modules from which to import from, and
   // the set of keys in `ModuleToSummariesForIndex` should be a superset of keys
-  // in `ModuleToDecSummaries`, so no need to use `ModuleToDecSummaries` in
-  // `EmitImportFiles`.
+  // in `DecSummaries`, so no need to use `DecSummaries` in `EmitImportFiles`.
   GVSummaryPtrSet DecSummaries;
   std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
   llvm::gatherImportedSummariesForModule(
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
index a9b30036fc435..3ff5125e37227 100644
--- a/llvm/test/ThinLTO/X86/import_callee_declaration.ll
+++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
@@ -51,6 +51,7 @@
 ; LIB-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
 ; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
 ; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
+; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
 ;
 ; Secondly disassemble main's combined summary and verify the import type of
 ; these two GUIDs are declaration.
@@ -82,13 +83,13 @@
 ; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
 ; RUN:   -r=lib.bc,calleeAddrs,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
 
-; IMPORTDUMP: Not importing function 11825436545918268459 callee from lib.cc
-; IMPORTDUMP: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
-; IMPORTDUMP: Is importing function definition 13568239288960714650 small_indirect_callee from lib.cc
-; IMPORTDUMP: Is importing function definition 6976996067367342685 small_func from lib.cc
-; IMPORTDUMP: Is importing function declaration 2418497564662708935 large_func from lib.cc
-; IMPORTDUMP: Not importing global 7680325410415171624 calleeAddrs from lib.cc
-; IMPORTDUMP: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
+; IMPORTDUMP-DAG: Not importing function 11825436545918268459 callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function definition 13568239288960714650 small_indirect_callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function definition 6976996067367342685 small_func from lib.cc
+; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc
+; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc
+; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
 
 ; RUN: llvm-dis in-process.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
 

>From 31d9bd23366cf34b5d9161c276ab91090ba0b95a Mon Sep 17 00:00:00 2001
From: mingmingl <mingmingl at google.com>
Date: Wed, 15 May 2024 12:16:42 -0700
Subject: [PATCH 03/44] add comment for 'DecSummaries' parameter in
 FunctionImport.h

---
 llvm/include/llvm/Transforms/IPO/FunctionImport.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index f8b98d5f81770..72a0823c6627b 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -212,6 +212,9 @@ bool convertToDeclaration(GlobalValue &GV);
 /// \p ModuleToSummariesForIndex will be populated with the needed summaries
 /// from each required module path. Use a std::map instead of StringMap to get
 /// stable order for bitcode emission.
+///
+/// \p DecSummaries will be populated with the subset of summary pointers
+/// that have 'declaration' import type among all summaries the module needs.
 void gatherImportedSummariesForModule(
     StringRef ModulePath,
     const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,

>From f87ed54e495eba7b9897654de4c17fbf101cb620 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sun, 19 May 2024 15:37:42 +0900
Subject: [PATCH 04/44] Reformat

---
 compiler-rt/lib/ctx_profile/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index ab7bf3241fd69..d69cdf56df8fd 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -26,4 +26,4 @@ add_compiler_rt_runtime(clang_rt.ctx_profile
   CFLAGS ${EXTRA_FLAGS}
   SOURCES ${CTX_PROFILE_SOURCES}
   ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
-  PARENT_TARGET ctx_profile)
\ No newline at end of file
+  PARENT_TARGET ctx_profile)

>From 9d15fc0060b584141674dddfedb06b0b58ad7aae Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic at gmail.com>
Date: Sun, 19 May 2024 15:41:03 +0900
Subject: [PATCH 05/44] Quick fix for a warning in clang_rt.ctx_profile
 [-Wgnu-anonymous-struct]

`__sanitizer_siginfo` has been introduced in D142117.
(llvmorg-16-init-17950-ged9ef9b4f248)
It is incompatible with -pedantic.

`clang_rt.ctx_profile` has been introduced in #92456.
---
 compiler-rt/cmake/config-ix.cmake          | 1 +
 compiler-rt/lib/ctx_profile/CMakeLists.txt | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index ba740af9e1d60..42edbe15edafb 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -127,6 +127,7 @@ check_cxx_compiler_flag("-Werror -Wframe-larger-than=512" COMPILER_RT_HAS_WFRAME
 check_cxx_compiler_flag("-Werror -Wglobal-constructors"   COMPILER_RT_HAS_WGLOBAL_CONSTRUCTORS_FLAG)
 check_cxx_compiler_flag("-Werror -Wc99-extensions"     COMPILER_RT_HAS_WC99_EXTENSIONS_FLAG)
 check_cxx_compiler_flag("-Werror -Wgnu"                COMPILER_RT_HAS_WGNU_FLAG)
+check_cxx_compiler_flag("-Werror -Wgnu-anonymous-struct" COMPILER_RT_HAS_WGNU_ANONYMOUS_STRUCT_FLAG)
 check_cxx_compiler_flag("-Werror -Wvariadic-macros"    COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG)
 check_cxx_compiler_flag("-Werror -Wunused-parameter"   COMPILER_RT_HAS_WUNUSED_PARAMETER_FLAG)
 check_cxx_compiler_flag("-Werror -Wcovered-switch-default" COMPILER_RT_HAS_WCOVERED_SWITCH_DEFAULT_FLAG)
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index d69cdf56df8fd..ce491fc7e8bf0 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -15,6 +15,9 @@ include_directories(../../include)
 # We don't use the C++ Standard Library here, so avoid including it by mistake.
 append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
 
+# __sanitizer_siginfo
+append_list_if(COMPILER_RT_HAS_WGNU_ANONYMOUS_STRUCT_FLAG -Wno-gnu-anonymous-struct EXTRA_FLAGS)
+
 if(COMPILER_RT_INCLUDE_TESTS)
   add_subdirectory(tests)
 endif()

>From b4ba3fe0068b2391e24ebf9a0ec6f56a8ac224b4 Mon Sep 17 00:00:00 2001
From: paperchalice <liujunchang97 at outlook.com>
Date: Sun, 19 May 2024 15:57:02 +0800
Subject: [PATCH 06/44] [NewPM][AMDGPU] Add CodeGenPassBuilder (#91040)

In order to test SelectionDAG for target AMDGPU, we need
CodeGenPassBuilder.
---
 .../AMDGPU/AMDGPUCodeGenPassBuilder.cpp       | 38 +++++++++++++++++++
 .../Target/AMDGPU/AMDGPUCodeGenPassBuilder.h  | 33 ++++++++++++++++
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  9 +++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h  |  6 +++
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |  3 ++
 .../Target/AMDGPU/R600CodeGenPassBuilder.cpp  | 33 ++++++++++++++++
 .../Target/AMDGPU/R600CodeGenPassBuilder.h    | 32 ++++++++++++++++
 llvm/lib/Target/AMDGPU/R600TargetMachine.cpp  |  9 +++++
 llvm/lib/Target/AMDGPU/R600TargetMachine.h    |  6 +++
 9 files changed, 169 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
 create mode 100644 llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
new file mode 100644
index 0000000000000..01ab61a0e4070
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp
@@ -0,0 +1,38 @@
+//===- lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.cpp ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCodeGenPassBuilder.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
+    AMDGPUTargetMachine &TM, const CGPassBuilderOption &Opts,
+    PassInstrumentationCallbacks *PIC)
+    : CodeGenPassBuilder(TM, Opts, PIC) {
+  Opt.RequiresCodeGenSCCOrder = true;
+  // Exceptions and StackMaps are not supported, so these passes will never do
+  // anything.
+  // Garbage collection is not supported.
+  disablePass<StackMapLivenessPass, FuncletLayoutPass,
+              ShadowStackGCLoweringPass>();
+}
+
+void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
+  // TODO: Add passes pre instruction selection.
+}
+
+void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
+                                             CreateMCStreamer) const {
+  // TODO: Add AsmPrinter.
+}
+
+Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
+  // TODO: Add instruction selector.
+  return Error::success();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
new file mode 100644
index 0000000000000..5f79e309703a3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h
@@ -0,0 +1,33 @@
+//===- lib/Target/AMDGPU/AMDGPUCodeGenPassBuilder.h -----------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCODEGENPASSBUILDER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCODEGENPASSBUILDER_H
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Passes/CodeGenPassBuilder.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+
+class AMDGPUCodeGenPassBuilder
+    : public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, AMDGPUTargetMachine> {
+public:
+  AMDGPUCodeGenPassBuilder(AMDGPUTargetMachine &TM,
+                           const CGPassBuilderOption &Opts,
+                           PassInstrumentationCallbacks *PIC);
+
+  void addPreISel(AddIRPass &addPass) const;
+  void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
+  Error addInstSelector(AddMachinePass &) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCODEGENPASSBUILDER_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b926..20329dea60275 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,6 +15,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUCodeGenPassBuilder.h"
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUIGroupLP.h"
@@ -646,6 +647,14 @@ parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
   return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
 }
 
+Error AMDGPUTargetMachine::buildCodeGenPipeline(
+    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
+    PassInstrumentationCallbacks *PIC) {
+  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
+  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
+}
+
 void AMDGPUTargetMachine::registerPassBuilderCallbacks(
     PassBuilder &PB, bool PopulateClassToPassNames) {
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 30ab388c7d52e..e48cb8fdc6576 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -52,6 +52,12 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
     return TLOF.get();
   }
 
+  Error buildCodeGenPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out,
+                             raw_pwrite_stream *DwoOut,
+                             CodeGenFileType FileType,
+                             const CGPassBuilderOption &Opts,
+                             PassInstrumentationCallbacks *PIC) override;
+
   void registerPassBuilderCallbacks(PassBuilder &PB,
                                     bool PopulateClassToPassNames) override;
   void registerDefaultAliasAnalyses(AAManager &) override;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 48325a0928f93..ead81b402eb76 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUAttributor.cpp
   AMDGPUCallLowering.cpp
+  AMDGPUCodeGenPassBuilder.cpp
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
@@ -119,6 +120,7 @@ add_llvm_target(AMDGPUCodeGen
   GCNVOPDUtils.cpp
   R600AsmPrinter.cpp
   R600ClauseMergePass.cpp
+  R600CodeGenPassBuilder.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
   R600ExpandSpecialInstrs.cpp
@@ -182,6 +184,7 @@ add_llvm_target(AMDGPUCodeGen
   GlobalISel
   HipStdPar
   IPO
+  IRPrinter
   MC
   MIRParser
   Passes
diff --git a/llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.cpp b/llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.cpp
new file mode 100644
index 0000000000000..a57b3aa0adb15
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.cpp
@@ -0,0 +1,33 @@
+//===-- R600CodeGenPassBuilder.cpp ------ Build R600 CodeGen pipeline -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600CodeGenPassBuilder.h"
+#include "R600TargetMachine.h"
+
+using namespace llvm;
+
+R600CodeGenPassBuilder::R600CodeGenPassBuilder(
+    R600TargetMachine &TM, const CGPassBuilderOption &Opts,
+    PassInstrumentationCallbacks *PIC)
+    : CodeGenPassBuilder(TM, Opts, PIC) {
+  Opt.RequiresCodeGenSCCOrder = true;
+}
+
+void R600CodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
+  // TODO: Add passes pre instruction selection.
+}
+
+void R600CodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
+                                           CreateMCStreamer) const {
+  // TODO: Add AsmPrinter.
+}
+
+Error R600CodeGenPassBuilder::addInstSelector(AddMachinePass &) const {
+  // TODO: Add instruction selector.
+  return Error::success();
+}
diff --git a/llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.h b/llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.h
new file mode 100644
index 0000000000000..be7c935c094d9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/R600CodeGenPassBuilder.h
@@ -0,0 +1,32 @@
+//===-- R600CodeGenPassBuilder.h -- Build R600 CodeGen pipeline -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600CODEGENPASSBUILDER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600CODEGENPASSBUILDER_H
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Passes/CodeGenPassBuilder.h"
+
+namespace llvm {
+
+class R600TargetMachine;
+
+class R600CodeGenPassBuilder
+    : public CodeGenPassBuilder<R600CodeGenPassBuilder, R600TargetMachine> {
+public:
+  R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts,
+                         PassInstrumentationCallbacks *PIC);
+
+  void addPreISel(AddIRPass &addPass) const;
+  void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
+  Error addInstSelector(AddMachinePass &) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600CODEGENPASSBUILDER_H
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 2461263866a96..c550cfaf06c10 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -15,6 +15,7 @@
 #include "R600TargetMachine.h"
 #include "AMDGPUTargetMachine.h"
 #include "R600.h"
+#include "R600CodeGenPassBuilder.h"
 #include "R600MachineScheduler.h"
 #include "R600TargetTransformInfo.h"
 #include "llvm/Transforms/Scalar.h"
@@ -144,3 +145,11 @@ void R600PassConfig::addPreEmitPass() {
 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new R600PassConfig(*this, PM);
 }
+
+Error R600TargetMachine::buildCodeGenPipeline(
+    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
+    PassInstrumentationCallbacks *PIC) {
+  R600CodeGenPassBuilder CGPB(*this, Opts, PIC);
+  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
+}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
index af8dcb8488679..29e370edef2c6 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
@@ -38,6 +38,12 @@ class R600TargetMachine final : public AMDGPUTargetMachine {
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
+  Error buildCodeGenPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out,
+                             raw_pwrite_stream *DwoOut,
+                             CodeGenFileType FileType,
+                             const CGPassBuilderOption &Opt,
+                             PassInstrumentationCallbacks *PIC) override;
+
   const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override;
 
   TargetTransformInfo getTargetTransformInfo(const Function &F) const override;

>From ef890572f379273da09db964b9ea1b67aa324762 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Sun, 19 May 2024 07:57:11 +0000
Subject: [PATCH 07/44] [gn build] Port b4ba3fe0068b

---
 llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index dad4f028236d8..c859b887828f5 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -138,6 +138,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUAtomicOptimizer.cpp",
     "AMDGPUAttributor.cpp",
     "AMDGPUCallLowering.cpp",
+    "AMDGPUCodeGenPassBuilder.cpp",
     "AMDGPUCodeGenPrepare.cpp",
     "AMDGPUCombinerHelper.cpp",
     "AMDGPUCtorDtorLowering.cpp",
@@ -206,6 +207,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "GCNVOPDUtils.cpp",
     "R600AsmPrinter.cpp",
     "R600ClauseMergePass.cpp",
+    "R600CodeGenPassBuilder.cpp",
     "R600ControlFlowFinalizer.cpp",
     "R600EmitClauseMarkers.cpp",
     "R600ExpandSpecialInstrs.cpp",

>From 9940620f6eab50deeaed0d976b2ea0afd007ba24 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sun, 19 May 2024 16:08:58 +0800
Subject: [PATCH 08/44] [GISel][RISCV] Legalize G_CONSTANT_FOLD_BARRIER
 (#89960)

This patch supports `G_CONSTANT_FOLD_BARRIER` on RISCV so that the
following inst seq can be compiled without crashing:
```
define i64 @xor_and_i64(i64 %x) {
entry:
  %y = and i64 %x, 16383
  %z = xor i64 %y, 16368
  ret i64 %z
}
```
---
 .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp |  2 +-
 .../legalizer/legalize-constbarrier-rv32.mir  | 51 +++++++++++
 .../legalizer/legalize-constbarrier-rv64.mir  | 87 +++++++++++++++++++
 3 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
 create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 38c1f9868d7db..adc68e9ee4a89 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -227,7 +227,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
   ConstantActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen);
 
   // TODO: transform illegal vector types into legal vector type
-  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_CONSTANT_FOLD_BARRIER})
       .legalFor({s32, sXLen, p0})
       .legalIf(typeIsLegalBoolVec(0, BoolVecTys, ST))
       .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
new file mode 100644
index 0000000000000..6b1fc2042e2b8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
@@ -0,0 +1,51 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+---
+name:            constbarrier_i32
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_i32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16368
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+    ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = G_CONSTANT i32 16368
+    %2:_(s32) = G_CONSTANT_FOLD_BARRIER %1
+    $x10 = COPY %2(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            constbarrier_nxv2i1
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_nxv2i1
+    ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+    ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMCLR_VL $x0
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(<vscale x 2 x s1>) = G_CONSTANT_FOLD_BARRIER [[VMCLR_VL]]
+    ; CHECK-NEXT: $v8 = COPY [[CONSTANT_FOLD_BARRIER]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s1) = G_CONSTANT i1 0
+    %2:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %1(s1)
+    %3:_(<vscale x 2 x s1>) = G_CONSTANT_FOLD_BARRIER %2
+    $v8 = COPY %3(<vscale x 2 x s1>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            constbarrier_nxv2i32
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_nxv2i32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(<vscale x 2 x s32>) = G_CONSTANT_FOLD_BARRIER [[SPLAT_VECTOR]]
+    ; CHECK-NEXT: $v8 = COPY [[CONSTANT_FOLD_BARRIER]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR %1(s32)
+    %3:_(<vscale x 2 x s32>) = G_CONSTANT_FOLD_BARRIER %2
+    $v8 = COPY %3(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
new file mode 100644
index 0000000000000..de6a82beee2ab
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
@@ -0,0 +1,87 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+---
+name:            constbarrier_i32
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_i32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16368
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[CONSTANT_FOLD_BARRIER]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = G_CONSTANT i32 16368
+    %2:_(s32) = G_CONSTANT_FOLD_BARRIER %1
+    %3:_(s64) = G_ANYEXT %2(s32)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            constbarrier_i64
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_i64
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16368
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s64) = G_CONSTANT_FOLD_BARRIER [[C]]
+    ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = G_CONSTANT i64 16368
+    %2:_(s64) = G_CONSTANT_FOLD_BARRIER %1
+    $x10 = COPY %2(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            constbarrier_nxv2i1
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_nxv2i1
+    ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+    ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMCLR_VL $x0
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(<vscale x 2 x s1>) = G_CONSTANT_FOLD_BARRIER [[VMCLR_VL]]
+    ; CHECK-NEXT: $v8 = COPY [[CONSTANT_FOLD_BARRIER]](<vscale x 2 x s1>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s1) = G_CONSTANT i1 0
+    %2:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %1(s1)
+    %3:_(<vscale x 2 x s1>) = G_CONSTANT_FOLD_BARRIER %2
+    $v8 = COPY %3(<vscale x 2 x s1>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            constbarrier_nxv2i32
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_nxv2i32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+    ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(<vscale x 2 x s32>) = G_CONSTANT_FOLD_BARRIER [[SPLAT_VECTOR]]
+    ; CHECK-NEXT: $v8 = COPY [[CONSTANT_FOLD_BARRIER]](<vscale x 2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(s32) = G_CONSTANT i32 0
+    %2:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR %1(s32)
+    %3:_(<vscale x 2 x s32>) = G_CONSTANT_FOLD_BARRIER %2
+    $v8 = COPY %3(<vscale x 2 x s32>)
+    PseudoRET implicit $v8
+
+...
+---
+name:            constbarrier_nxv2i64
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constbarrier_nxv2i64
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+    ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(<vscale x 2 x s64>) = G_CONSTANT_FOLD_BARRIER [[SPLAT_VECTOR]]
+    ; CHECK-NEXT: $v8m2 = COPY [[CONSTANT_FOLD_BARRIER]](<vscale x 2 x s64>)
+    ; CHECK-NEXT: PseudoRET implicit $v8m2
+    %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+    %1:_(s64) = G_CONSTANT i64 0
+    %2:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %1(s64)
+    %3:_(<vscale x 2 x s64>) = G_CONSTANT_FOLD_BARRIER %2(<vscale x 2 x s64>)
+    $v8m2 = COPY %3(<vscale x 2 x s64>)
+    PseudoRET implicit $v8m2
+
+...

>From 8b8a38a7b426fc724804602d7635134a0c63f08c Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sun, 19 May 2024 10:18:26 +0100
Subject: [PATCH 09/44] [VectorCombine] Additional extend tests for
 shuffleToIdentity. NFC

---
 .../AArch64/shuffletoidentity.ll              | 141 ++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index b58f92d709361..bb333941abf70 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -465,6 +465,125 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
   ret void
 }
 
+define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
+; CHECK-LABEL: @zext(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[AB1:%.*]] = zext <4 x i16> [[AB]] to <4 x i32>
+; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i16> [[AT]] to <4 x i32>
+; CHECK-NEXT:    [[BB1:%.*]] = zext <4 x i16> [[BB]] to <4 x i32>
+; CHECK-NEXT:    [[BT1:%.*]] = zext <4 x i16> [[BT]] to <4 x i32>
+; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]]
+; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
+; CHECK-NEXT:    ret void
+;
+  %ab = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %at = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %bb = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %bt = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %ab1 = zext <4 x i16> %ab to <4 x i32>
+  %at1 = zext <4 x i16> %at to <4 x i32>
+  %bb1 = zext <4 x i16> %bb to <4 x i32>
+  %bt1 = zext <4 x i16> %bt to <4 x i32>
+  %abb = add <4 x i32> %ab1, %bb1
+  %abt = add <4 x i32> %at1, %bt1
+  %r = shufflevector <4 x i32> %abb, <4 x i32> %abt, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %r, ptr %p
+  ret void
+}
+
+define void @sext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
+; CHECK-LABEL: @sext(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[AB1:%.*]] = sext <4 x i16> [[AB]] to <4 x i32>
+; CHECK-NEXT:    [[AT1:%.*]] = sext <4 x i16> [[AT]] to <4 x i32>
+; CHECK-NEXT:    [[BB1:%.*]] = sext <4 x i16> [[BB]] to <4 x i32>
+; CHECK-NEXT:    [[BT1:%.*]] = sext <4 x i16> [[BT]] to <4 x i32>
+; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]]
+; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
+; CHECK-NEXT:    ret void
+;
+  %ab = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %at = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %bb = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %bt = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %ab1 = sext <4 x i16> %ab to <4 x i32>
+  %at1 = sext <4 x i16> %at to <4 x i32>
+  %bb1 = sext <4 x i16> %bb to <4 x i32>
+  %bt1 = sext <4 x i16> %bt to <4 x i32>
+  %abb = add <4 x i32> %ab1, %bb1
+  %abt = add <4 x i32> %at1, %bt1
+  %r = shufflevector <4 x i32> %abb, <4 x i32> %abt, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %r, ptr %p
+  ret void
+}
+
+define void @szext(<8 x i32> %a, <8 x i32> %b, ptr %p) {
+; CHECK-LABEL: @szext(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[AB1:%.*]] = sext <4 x i32> [[AB]] to <4 x i64>
+; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i32> [[AT]] to <4 x i64>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i64> [[AB1]], <4 x i64> [[AT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[R]], ptr [[P:%.*]], align 64
+; CHECK-NEXT:    ret void
+;
+  %ab = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %at = shufflevector <8 x i32> %a, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %ab1 = sext <4 x i32> %ab to <4 x i64>
+  %at1 = zext <4 x i32> %at to <4 x i64>
+  %r = shufflevector <4 x i64> %ab1, <4 x i64> %at1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i64> %r, ptr %p
+  ret void
+}
+
+define void @zext_types(<8 x i16> %a, <8 x i32> %b, ptr %p) {
+; CHECK-LABEL: @zext_types(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[AB1:%.*]] = zext <4 x i16> [[AB]] to <4 x i64>
+; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i32> [[AT]] to <4 x i64>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i64> [[AB1]], <4 x i64> [[AT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[R]], ptr [[P:%.*]], align 64
+; CHECK-NEXT:    ret void
+;
+  %ab = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %at = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %ab1 = zext <4 x i16> %ab to <4 x i64>
+  %at1 = zext <4 x i32> %at to <4 x i64>
+  %r = shufflevector <4 x i64> %ab1, <4 x i64> %at1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i64> %r, ptr %p
+  ret void
+}
+
+define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) {
+; CHECK-LABEL: @trunc(
+; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i64> [[A]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[ABB1:%.*]] = trunc <4 x i64> [[AB]] to <4 x i32>
+; CHECK-NEXT:    [[ABT1:%.*]] = trunc <4 x i64> [[AT]] to <4 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
+; CHECK-NEXT:    ret void
+;
+  %ab = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %at = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %abb1 = trunc <4 x i64> %ab to <4 x i32>
+  %abt1 = trunc <4 x i64> %at to <4 x i32>
+  %r = shufflevector <4 x i32> %abb1, <4 x i32> %abt1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %r, ptr %p
+  ret void
+}
+
 define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @intrinsics_minmax(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]])
@@ -624,4 +743,26 @@ entry:
   ret void
 }
 
+define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: @singleop(
+; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i8> [[A:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[B1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[A2:%.*]] = zext <4 x i8> [[A1]] to <4 x i16>
+; CHECK-NEXT:    [[B2:%.*]] = zext <4 x i8> [[B1]] to <4 x i16>
+; CHECK-NEXT:    [[AB:%.*]] = add <4 x i16> [[A2]], [[B2]]
+; CHECK-NEXT:    [[T:%.*]] = trunc <4 x i16> [[AB]] to <4 x i8>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[T]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i8> [[R]]
+;
+  %a1 = shufflevector <4 x i8> %a, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %b1 = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  %a2 = zext <4 x i8> %a1 to <4 x i16>
+  %b2 = zext <4 x i8> %b1 to <4 x i16>
+  %ab = add <4 x i16> %a2, %b2
+  %t = trunc <4 x i16> %ab to <4 x i8>
+  %r = shufflevector <4 x i8> %t, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i8> %r
+}
+
+
 declare void @use(<4 x i8>)

>From 689bba1eec31fa236e2febaa4bcf46bc89ba432b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sun, 19 May 2024 10:25:01 +0100
Subject: [PATCH 10/44] [DAG] canCreateUndefOrPoison - merge
 INSERT_VECTOR_ELT/EXTRACT_VECTOR_ELT cases. NFC.

The only difference is the operand index for the element index variable.
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 247f52370e4c1..6a4ff741af10a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5241,17 +5241,12 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
     // Check if we demand any upper (undef) elements.
     return !PoisonOnly && DemandedElts.ugt(1);
 
+  case ISD::INSERT_VECTOR_ELT:
   case ISD::EXTRACT_VECTOR_ELT: {
     // Ensure that the element index is in bounds.
     EVT VecVT = Op.getOperand(0).getValueType();
-    KnownBits KnownIdx = computeKnownBits(Op.getOperand(1), Depth + 1);
-    return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
-  }
-
-  case ISD::INSERT_VECTOR_ELT:{
-    // Ensure that the element index is in bounds.
-    EVT VecVT = Op.getOperand(0).getValueType();
-    KnownBits KnownIdx = computeKnownBits(Op.getOperand(2), Depth + 1);
+    SDValue Idx = Op.getOperand(Opcode == ISD::INSERT_VECTOR_ELT ? 2 : 1);
+    KnownBits KnownIdx = computeKnownBits(Idx, Depth + 1);
     return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
   }
 

>From 7fc524fe080a69e79bd1ce8925e680350b7e9d44 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Sun, 19 May 2024 02:43:55 -0700
Subject: [PATCH 11/44] [ctx_profile] Pass lib path into test

Fixes build after cfe9deb1353021a1c1fe4731ec3e90f702dbd43d on
https://lab.llvm.org/buildbot/#/builders/37/builds/34828
---
 .../test/ctx_profile/TestCases/generate-context.cpp        | 2 +-
 compiler-rt/test/ctx_profile/lit.cfg.py                    | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
index 981d6170091c5..797b871860655 100644
--- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
+++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
@@ -5,7 +5,7 @@
 // RUN: cp %llvm_src/include/llvm/ProfileData/CtxInstrContextNode.h %t_include/
 //
 // Compile with ctx instrumentation "on". We treat "theRoot" as callgraph root.
-// RUN: %clangxx %s -lclang_rt.ctx_profile -I%t_include -O2 -o %t.bin -mllvm -profile-context-root=theRoot
+// RUN: %clangxx %s %ctxprofilelib -I%t_include -O2 -o %t.bin -mllvm -profile-context-root=theRoot
 //
 // Run the binary, and observe the profile fetch handler's output.
 // RUN: %t.bin | FileCheck %s
diff --git a/compiler-rt/test/ctx_profile/lit.cfg.py b/compiler-rt/test/ctx_profile/lit.cfg.py
index bf62093601f11..3034fadbb7a61 100644
--- a/compiler-rt/test/ctx_profile/lit.cfg.py
+++ b/compiler-rt/test/ctx_profile/lit.cfg.py
@@ -33,3 +33,10 @@ def get_required_attr(config, attr_name):
 config.substitutions.append(
     ("%clangxx ", " ".join([config.clang] + config.cxx_mode_flags) + " -ldl -lpthread ")
 )
+
+config.substitutions.append(
+    (
+        "%ctxprofilelib",
+        "-L%s -lclang_rt.ctx_profile%s" % (config.compiler_rt_libdir, config.target_suffix)
+    )
+)

>From e0217ee7829cf49bc0caa8b814f6acc4c4b0836d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sun, 19 May 2024 11:06:02 +0100
Subject: [PATCH 12/44] [DAG] canCreateUndefOrPoison - only compute
 extract/insert vector elt index knownbits when not poison

We were calling computeKnownBits to determine the bounds of the element index without ensuring that it wasn't poison, meaning if we did freeze the index, isGuaranteedNotToBeUndefOrPoison would then fail as we can't call computeKnownBits through FREEZE for potentially poison values.

Fixes #92569
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  8 +++--
 llvm/test/CodeGen/X86/pr92569.ll              | 29 +++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr92569.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 6a4ff741af10a..2e1f4b7e5b374 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5246,8 +5246,12 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
     // Ensure that the element index is in bounds.
     EVT VecVT = Op.getOperand(0).getValueType();
     SDValue Idx = Op.getOperand(Opcode == ISD::INSERT_VECTOR_ELT ? 2 : 1);
-    KnownBits KnownIdx = computeKnownBits(Idx, Depth + 1);
-    return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
+    if (isGuaranteedNotToBeUndefOrPoison(Idx, DemandedElts, PoisonOnly,
+                                         Depth + 1)) {
+      KnownBits KnownIdx = computeKnownBits(Idx, Depth + 1);
+      return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
+    }
+    return true;
   }
 
   case ISD::VECTOR_SHUFFLE: {
diff --git a/llvm/test/CodeGen/X86/pr92569.ll b/llvm/test/CodeGen/X86/pr92569.ll
new file mode 100644
index 0000000000000..f91063089e3a9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr92569.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @PR92569(i64 %arg, <8 x i8> %arg1) {
+; CHECK-LABEL: PR92569:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testq %rdi, %rdi
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %cond.false
+; CHECK-NEXT:    rep bsfq %rdi, %rax
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    movl $64, %eax
+; CHECK-NEXT:  .LBB0_3: # %cond.end
+; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movzbl -24(%rsp,%rax), %eax
+; CHECK-NEXT:    movl %eax, 0
+; CHECK-NEXT:    retq
+  %cttz = call i64 @llvm.cttz.i64(i64 %arg, i1 false)
+  %trunc = trunc i64 %cttz to i8
+  %lshr = lshr i8 %trunc, 3
+  %extractelement = extractelement <8 x i8> %arg1, i8 %lshr
+  %freeze = freeze i8 %extractelement
+  %zext = zext i8 %freeze to i32
+  store i32 %zext, ptr addrspace(1) null, align 4
+  ret void
+}

>From 9f5c8de3864b0be27a8b36cd891c5a28a3acfd27 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sun, 19 May 2024 11:30:20 +0100
Subject: [PATCH 13/44] [DAG] visitAVG - rewrite "fold (avgfloor x, 0) -> x >>
 1" to use SDPatternMatch

No need for this to be vector specific, and it's more likely that scalar cases will appear after #92096
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2b1dec8205b73..bf85212e6a92e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5211,30 +5211,28 @@ SDValue DAGCombiner::visitAVG(SDNode *N) {
       !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
 
-  if (VT.isVector()) {
+  if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
-    // fold (avgfloor x, 0) -> x >> 1
-    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
-      if (Opcode == ISD::AVGFLOORS)
-        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
-      if (Opcode == ISD::AVGFLOORU)
-        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
-    }
-  }
-
   // fold (avg x, undef) -> x
   if (N0.isUndef())
     return N1;
   if (N1.isUndef())
     return N0;
 
-  // Fold (avg x, x) --> x
+  // fold (avg x, x) --> x
   if (N0 == N1 && Level >= AfterLegalizeTypes)
     return N0;
 
-  // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+  // fold (avgfloor x, 0) -> x >> 1
+  SDValue X;
+  if (sd_match(N, m_c_BinOp(ISD::AVGFLOORS, m_Value(X), m_Zero())))
+    return DAG.getNode(ISD::SRA, DL, VT, X,
+                       DAG.getShiftAmountConstant(1, VT, DL));
+  if (sd_match(N, m_c_BinOp(ISD::AVGFLOORU, m_Value(X), m_Zero())))
+    return DAG.getNode(ISD::SRL, DL, VT, X,
+                       DAG.getShiftAmountConstant(1, VT, DL));
 
   return SDValue();
 }

>From 7273ad123850a7b44c0625d098ebb49153bf855a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sun, 19 May 2024 11:49:51 +0100
Subject: [PATCH 14/44] [DAG] visitABD - rewrite "(abd x, 0)" folds to use
 SDPatternMatch

No need for this to be vector specific, and it's more likely that scalar cases will appear after #92576
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bf85212e6a92e..8607b50175359 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5253,24 +5253,25 @@ SDValue DAGCombiner::visitABD(SDNode *N) {
       !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
 
-  if (VT.isVector()) {
+  if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
-    // fold (abds x, 0) -> abs x
-    // fold (abdu x, 0) -> x
-    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
-      if (Opcode == ISD::ABDS)
-        return DAG.getNode(ISD::ABS, DL, VT, N0);
-      if (Opcode == ISD::ABDU)
-        return N0;
-    }
-  }
-
   // fold (abd x, undef) -> 0
   if (N0.isUndef() || N1.isUndef())
     return DAG.getConstant(0, DL, VT);
 
+  SDValue X;
+
+  // fold (abds x, 0) -> abs x
+  if (sd_match(N, m_c_BinOp(ISD::ABDS, m_Value(X), m_Zero())) &&
+      (!LegalOperations || hasOperation(ISD::ABS, VT)))
+    return DAG.getNode(ISD::ABS, DL, VT, X);
+
+  // fold (abdu x, 0) -> x
+  if (sd_match(N, m_c_BinOp(ISD::ABDU, m_Value(X), m_Zero())))
+    return X;
+
   // fold (abds x, y) -> (abdu x, y) iff both args are known positive
   if (Opcode == ISD::ABDS && hasOperation(ISD::ABDU, VT) &&
       DAG.SignBitIsZero(N0) && DAG.SignBitIsZero(N1))

>From ed9007d0d219726db01f211e9c9ab72fbfe4ecb1 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Sun, 19 May 2024 04:29:11 -0700
Subject: [PATCH 15/44] Revert "[Bounds-Safety] Temporarily relax a
 `counted_by` attribute restriction on flexible array members"

Together with 0ec3b972e58bcbcdc1bebe1696ea37f2931287c3
breaks https://lab.llvm.org/buildbot/#/builders/5/builds/43403

Issue #92687

This reverts commit cef6387e52578366c2332275dad88b9953b55336.
---
 clang/include/clang/Basic/DiagnosticGroups.td   |  4 ----
 .../include/clang/Basic/DiagnosticSemaKinds.td  |  8 +-------
 clang/lib/Sema/SemaDeclAttr.cpp                 | 17 ++---------------
 clang/test/Sema/attr-counted-by-vla.c           |  9 +++------
 4 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 4fad4d1a0eca7..4cb4f3d999f7a 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1447,10 +1447,6 @@ def FunctionMultiVersioning
 
 def NoDeref : DiagGroup<"noderef">;
 
-// -fbounds-safety and bounds annotation related warnings
-def BoundsSafetyCountedByEltTyUnknownSize :
-  DiagGroup<"bounds-safety-counted-by-elt-type-unknown-size">;
-
 // A group for cross translation unit static analysis related warnings.
 def CrossTU : DiagGroup<"ctu">;
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 1efa3af121c10..8e6596410c5d0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6552,7 +6552,7 @@ def err_counted_by_attr_refer_to_union : Error<
 def note_flexible_array_counted_by_attr_field : Note<
   "field %0 declared here">;
 def err_counted_by_attr_pointee_unknown_size : Error<
-  "'counted_by' %select{cannot|should not}3 be applied to %select{"
+  "'counted_by' cannot be applied to %select{"
     "a pointer with pointee|" // pointer
     "an array with element}0" // array
   " of unknown size because %1 is %select{"
@@ -6561,14 +6561,8 @@ def err_counted_by_attr_pointee_unknown_size : Error<
     "a function type|"     // CountedByInvalidPointeeTypeKind::FUNCTION
     // CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER
     "a struct type with a flexible array member"
-    "%select{|. This will be an error in a future compiler version}3"
-    ""
   "}2">;
 
-def warn_counted_by_attr_elt_type_unknown_size :
-  Warning<err_counted_by_attr_pointee_unknown_size.Summary>,
-  InGroup<BoundsSafetyCountedByEltTyUnknownSize>;
-
 let CategoryName = "ARC Semantic Issue" in {
 
 // ARC-mode diagnostics.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e816ea3647a7c..c8b71631076ba 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8687,7 +8687,6 @@ static bool CheckCountedByAttrOnField(
   // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
   // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
   // when `FieldTy->isArrayType()`.
-  bool ShouldWarn = false;
   if (PointeeTy->isIncompleteType()) {
     InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
   } else if (PointeeTy->isSizelessType()) {
@@ -8695,25 +8694,13 @@ static bool CheckCountedByAttrOnField(
   } else if (PointeeTy->isFunctionType()) {
     InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
   } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
-    if (FieldTy->isArrayType()) {
-      // This is a workaround for the Linux kernel that has already adopted
-      // `counted_by` on a FAM where the pointee is a struct with a FAM. This
-      // should be an error because computing the bounds of the array cannot be
-      // done correctly without manually traversing every struct object in the
-      // array at runtime. To allow the code to be built this error is
-      // downgraded to a warning.
-      ShouldWarn = true;
-    }
     InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
   }
 
   if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
-    unsigned DiagID = ShouldWarn
-                          ? diag::warn_counted_by_attr_elt_type_unknown_size
-                          : diag::err_counted_by_attr_pointee_unknown_size;
-    S.Diag(FD->getBeginLoc(), DiagID)
+    S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_pointee_unknown_size)
         << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
-        << (ShouldWarn ? 1 : 0) << FD->getSourceRange();
+        << FD->getSourceRange();
     return true;
   }
 
diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c
index b25f719f3b95a..3de6bd55e2d8e 100644
--- a/clang/test/Sema/attr-counted-by-vla.c
+++ b/clang/test/Sema/attr-counted-by-vla.c
@@ -173,24 +173,21 @@ struct has_annotated_VLA {
 
 struct buffer_of_structs_with_unnannotated_vla {
   int count;
-  // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
-  // expected-warning at +1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+  // expected-error at +1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member}}
   struct has_unannotated_VLA Arr[] __counted_by(count);
 };
 
 
 struct buffer_of_structs_with_annotated_vla {
   int count;
-  // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
-  // expected-warning at +1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+  // expected-error at +1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member}}
   struct has_annotated_VLA Arr[] __counted_by(count);
 };
 
 struct buffer_of_const_structs_with_annotated_vla {
   int count;
-  // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
   // Make sure the `const` qualifier is printed when printing the element type.
-  // expected-warning at +1{{'counted_by' should not be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+  // expected-error at +1{{'counted_by' cannot be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member}}
   const struct has_annotated_VLA Arr[] __counted_by(count);
 };
 

>From 6447abe067c8088a5cc093fe872719374e174068 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Sun, 19 May 2024 04:30:22 -0700
Subject: [PATCH 16/44] Revert "[BoundsSafety] Allow 'counted_by' attribute on
 pointers in structs in C (#90786)"

Memory leak: https://lab.llvm.org/buildbot/#/builders/5/builds/43403

Issue #92687

This reverts commit 0ec3b972e58bcbcdc1bebe1696ea37f2931287c3.
---
 clang/docs/ReleaseNotes.rst                   |  21 +-
 clang/include/clang/AST/Type.h                |   1 -
 clang/include/clang/Basic/Attr.td             |   3 +-
 .../clang/Basic/DiagnosticSemaKinds.td        |  17 +-
 clang/include/clang/Parse/Parser.h            |   7 +-
 clang/include/clang/Sema/Sema.h               |   3 +-
 clang/lib/AST/Type.cpp                        |  10 -
 clang/lib/Parse/ParseDecl.cpp                 | 104 +------
 clang/lib/Parse/ParseObjc.cpp                 |  10 +-
 clang/lib/Sema/SemaDeclAttr.cpp               |  82 ++----
 clang/lib/Sema/SemaType.cpp                   |   6 +-
 clang/lib/Sema/TreeTransform.h                |   2 +-
 .../attr-counted-by-late-parsed-struct-ptrs.c |  45 ----
 clang/test/AST/attr-counted-by-struct-ptrs.c  | 117 --------
 .../Sema/attr-counted-by-late-parsed-off.c    |  26 --
 .../attr-counted-by-late-parsed-struct-ptrs.c | 254 ------------------
 ...tr-counted-by-struct-ptrs-sizeless-types.c |  17 --
 clang/test/Sema/attr-counted-by-struct-ptrs.c | 224 ---------------
 .../Sema/attr-counted-by-vla-sizeless-types.c |  11 -
 clang/test/Sema/attr-counted-by-vla.c         | 193 -------------
 clang/test/Sema/attr-counted-by.c             | 112 ++++++++
 21 files changed, 148 insertions(+), 1117 deletions(-)
 delete mode 100644 clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
 delete mode 100644 clang/test/AST/attr-counted-by-struct-ptrs.c
 delete mode 100644 clang/test/Sema/attr-counted-by-late-parsed-off.c
 delete mode 100644 clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
 delete mode 100644 clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
 delete mode 100644 clang/test/Sema/attr-counted-by-struct-ptrs.c
 delete mode 100644 clang/test/Sema/attr-counted-by-vla-sizeless-types.c
 delete mode 100644 clang/test/Sema/attr-counted-by-vla.c
 create mode 100644 clang/test/Sema/attr-counted-by.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2f83f5c6d54e9..7af5869d21768 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -317,8 +317,7 @@ New Compiler Flags
 
 - ``-fexperimental-late-parse-attributes`` enables an experimental feature to
   allow late parsing certain attributes in specific contexts where they would
-  not normally be late parsed. Currently this allows late parsing the
-  `counted_by` attribute in C. See `Attribute Changes in Clang`_.
+  not normally be late parsed.
 
 - ``-fseparate-named-sections`` uses separate unique sections for global
   symbols in named special sections (i.e. symbols annotated with
@@ -407,24 +406,6 @@ Attribute Changes in Clang
 - The ``clspv_libclc_builtin`` attribute has been added to allow clspv
   (`OpenCL-C to Vulkan SPIR-V compiler <https://github.com/google/clspv>`_) to identify functions coming from libclc
   (`OpenCL-C builtin library <https://libclc.llvm.org>`_).
-- The ``counted_by`` attribute is now allowed on pointers that are members of a
-  struct in C.
-
-- The ``counted_by`` attribute can now be late parsed in C when
-  ``-fexperimental-late-parse-attributes`` is passed but only when attribute is
-  used in the declaration attribute position. This allows using the
-  attribute on existing code where it previously impossible to do so without
-  re-ordering struct field declarations would break ABI as shown below.
-
-  .. code-block:: c
-
-     struct BufferTy {
-       /* Refering to `count` requires late parsing */
-       char* buffer __counted_by(count);
-       /* Swapping `buffer` and `count` to avoid late parsing would break ABI */
-       size_t count;
-     };
-
 
 Improvements to Clang's diagnostics
 -----------------------------------
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index c7a8e785913b3..da3834f19ca04 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2515,7 +2515,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isRecordType() const;
   bool isClassType() const;
   bool isStructureType() const;
-  bool isStructureTypeWithFlexibleArrayMember() const;
   bool isObjCBoxableRecordType() const;
   bool isInterfaceType() const;
   bool isStructureOrClassType() const;
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 7a7721239a28f..38ee8356583be 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2229,8 +2229,7 @@ def TypeNullUnspecified : TypeAttr {
 def CountedBy : DeclOrTypeAttr {
   let Spellings = [Clang<"counted_by">];
   let Subjects = SubjectList<[Field], ErrorDiag>;
-  let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel", 1>];
-  let LateParsed = LateAttrParseExperimentalExt;
+  let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel">];
   let ParseArgumentsAsUnevaluated = 1;
   let Documentation = [CountedByDocs];
   let LangOpts = [COnly];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 8e6596410c5d0..09b1874f9fddd 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6533,10 +6533,8 @@ def warn_superclass_variable_sized_type_not_at_end : Warning<
 
 def err_flexible_array_count_not_in_same_struct : Error<
   "'counted_by' field %0 isn't within the same struct as the flexible array">;
-def err_counted_by_attr_not_on_ptr_or_flexible_array_member : Error<
-  "'counted_by' only applies to pointers or C99 flexible array members">;
-def err_counted_by_attr_on_array_not_flexible_array_member : Error<
-  "'counted_by' on arrays only applies to C99 flexible array members">;
+def err_counted_by_attr_not_on_flexible_array_member : Error<
+  "'counted_by' only applies to C99 flexible array members">;
 def err_counted_by_attr_refer_to_itself : Error<
   "'counted_by' cannot refer to the flexible array member %0">;
 def err_counted_by_must_be_in_structure : Error<
@@ -6551,17 +6549,6 @@ def err_counted_by_attr_refer_to_union : Error<
   "'counted_by' argument cannot refer to a union member">;
 def note_flexible_array_counted_by_attr_field : Note<
   "field %0 declared here">;
-def err_counted_by_attr_pointee_unknown_size : Error<
-  "'counted_by' cannot be applied to %select{"
-    "a pointer with pointee|" // pointer
-    "an array with element}0" // array
-  " of unknown size because %1 is %select{"
-    "an incomplete type|"  // CountedByInvalidPointeeTypeKind::INCOMPLETE
-    "a sizeless type|"     // CountedByInvalidPointeeTypeKind::SIZELESS
-    "a function type|"     // CountedByInvalidPointeeTypeKind::FUNCTION
-    // CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER
-    "a struct type with a flexible array member"
-  "}2">;
 
 let CategoryName = "ARC Semantic Issue" in {
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index af50164a8f93f..1e796e828b10a 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1645,8 +1645,6 @@ class Parser : public CodeCompletionHandler {
                                bool EnterScope, bool OnDefinition);
   void ParseLexedAttribute(LateParsedAttribute &LA,
                            bool EnterScope, bool OnDefinition);
-  void ParseLexedCAttribute(LateParsedAttribute &LA,
-                            ParsedAttributes *OutAttrs = nullptr);
   void ParseLexedMethodDeclarations(ParsingClass &Class);
   void ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM);
   void ParseLexedMethodDefs(ParsingClass &Class);
@@ -2533,8 +2531,7 @@ class Parser : public CodeCompletionHandler {
 
   void ParseStructDeclaration(
       ParsingDeclSpec &DS,
-      llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
-      LateParsedAttrList *LateFieldAttrs = nullptr);
+      llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback);
 
   DeclGroupPtrTy ParseTopLevelStmtDecl();
 
@@ -3112,8 +3109,6 @@ class Parser : public CodeCompletionHandler {
                                  SourceLocation ScopeLoc,
                                  ParsedAttr::Form Form);
 
-  void DistributeCLateParsedAttrs(Decl *Dcl, LateParsedAttrList *LateAttrs);
-
   void ParseBoundsAttribute(IdentifierInfo &AttrName,
                             SourceLocation AttrNameLoc, ParsedAttributes &Attrs,
                             IdentifierInfo *ScopeName, SourceLocation ScopeLoc,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d4d4a82525a02..b16a304960d3f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11396,8 +11396,7 @@ class Sema final : public SemaBase {
   QualType BuildMatrixType(QualType T, Expr *NumRows, Expr *NumColumns,
                            SourceLocation AttrLoc);
 
-  QualType BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
-                                                  Expr *CountExpr);
+  QualType BuildCountAttributedArrayType(QualType WrappedTy, Expr *CountExpr);
 
   QualType BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
                                  SourceLocation AttrLoc);
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index f69a8f80a6393..e31741cd44240 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -632,16 +632,6 @@ bool Type::isStructureType() const {
   return false;
 }
 
-bool Type::isStructureTypeWithFlexibleArrayMember() const {
-  const auto *RT = getAs<RecordType>();
-  if (!RT)
-    return false;
-  const auto *Decl = RT->getDecl();
-  if (!Decl->isStruct())
-    return false;
-  return Decl->hasFlexibleArrayMember();
-}
-
 bool Type::isObjCBoxableRecordType() const {
   if (const auto *RT = getAs<RecordType>())
     return RT->getDecl()->hasAttr<ObjCBoxableAttr>();
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 8405b44685ae4..2ce8fa98089f6 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -3288,19 +3288,6 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
   }
 }
 
-void Parser::DistributeCLateParsedAttrs(Decl *Dcl,
-                                        LateParsedAttrList *LateAttrs) {
-  assert(Dcl && "Dcl cannot be null");
-
-  if (!LateAttrs)
-    return;
-
-  for (auto *LateAttr : *LateAttrs) {
-    if (LateAttr->Decls.empty())
-      LateAttr->addDecl(Dcl);
-  }
-}
-
 /// Bounds attributes (e.g., counted_by):
 ///   AttrName '(' expression ')'
 void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
@@ -4838,14 +4825,13 @@ static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS,
 ///
 void Parser::ParseStructDeclaration(
     ParsingDeclSpec &DS,
-    llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
-    LateParsedAttrList *LateFieldAttrs) {
+    llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback) {
 
   if (Tok.is(tok::kw___extension__)) {
     // __extension__ silences extension warnings in the subexpression.
     ExtensionRAIIObject O(Diags);  // Use RAII to do this.
     ConsumeToken();
-    return ParseStructDeclaration(DS, FieldsCallback, LateFieldAttrs);
+    return ParseStructDeclaration(DS, FieldsCallback);
   }
 
   // Parse leading attributes.
@@ -4910,12 +4896,10 @@ void Parser::ParseStructDeclaration(
     }
 
     // If attributes exist after the declarator, parse them.
-    MaybeParseGNUAttributes(DeclaratorInfo.D, LateFieldAttrs);
+    MaybeParseGNUAttributes(DeclaratorInfo.D);
 
     // We're done with this declarator;  invoke the callback.
-    Decl *Field = FieldsCallback(DeclaratorInfo);
-    if (Field)
-      DistributeCLateParsedAttrs(Field, LateFieldAttrs);
+    FieldsCallback(DeclaratorInfo);
 
     // If we don't have a comma, it is either the end of the list (a ';')
     // or an error, bail out.
@@ -4926,69 +4910,6 @@ void Parser::ParseStructDeclaration(
   }
 }
 
-/// Finish parsing an attribute for which parsing was delayed.
-/// This will be called at the end of parsing a class declaration
-/// for each LateParsedAttribute. We consume the saved tokens and
-/// create an attribute with the arguments filled in. We add this
-/// to the Attribute list for the decl.
-void Parser::ParseLexedCAttribute(LateParsedAttribute &LA,
-                                  ParsedAttributes *OutAttrs) {
-  // Create a fake EOF so that attribute parsing won't go off the end of the
-  // attribute.
-  Token AttrEnd;
-  AttrEnd.startToken();
-  AttrEnd.setKind(tok::eof);
-  AttrEnd.setLocation(Tok.getLocation());
-  AttrEnd.setEofData(LA.Toks.data());
-  LA.Toks.push_back(AttrEnd);
-
-  // Append the current token at the end of the new token stream so that it
-  // doesn't get lost.
-  LA.Toks.push_back(Tok);
-  PP.EnterTokenStream(LA.Toks, /*DisableMacroExpansion=*/true,
-                      /*IsReinject=*/true);
-  // Drop the current token and bring the first cached one. It's the same token
-  // as when we entered this function.
-  ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
-
-  ParsedAttributes Attrs(AttrFactory);
-
-  assert(LA.Decls.size() <= 1 &&
-         "late field attribute expects to have at most one declaration.");
-
-  // Dispatch based on the attribute and parse it
-  const AttributeCommonInfo::Form ParsedForm = ParsedAttr::Form::GNU();
-  IdentifierInfo *ScopeName = nullptr;
-  const ParsedAttr::Kind AttrKind =
-      ParsedAttr::getParsedKind(&LA.AttrName, /*ScopeName=*/ScopeName,
-                                /*SyntaxUsed=*/ParsedForm.getSyntax());
-  switch (AttrKind) {
-  case ParsedAttr::Kind::AT_CountedBy:
-    ParseBoundsAttribute(LA.AttrName, LA.AttrNameLoc, Attrs,
-                         /*ScopeName=*/ScopeName, SourceLocation(),
-                         /*Form=*/ParsedForm);
-    break;
-  default:
-    llvm_unreachable("Unhandled late parsed attribute");
-  }
-
-  for (auto *D : LA.Decls)
-    Actions.ActOnFinishDelayedAttribute(getCurScope(), D, Attrs);
-
-  // Due to a parsing error, we either went over the cached tokens or
-  // there are still cached tokens left, so we skip the leftover tokens.
-  while (Tok.isNot(tok::eof))
-    ConsumeAnyToken();
-
-  // Consume the fake EOF token if it's there
-  if (Tok.is(tok::eof) && Tok.getEofData() == AttrEnd.getEofData())
-    ConsumeAnyToken();
-
-  if (OutAttrs) {
-    OutAttrs->takeAllFrom(Attrs);
-  }
-}
-
 /// ParseStructUnionBody
 ///       struct-contents:
 ///         struct-declaration-list
@@ -5012,11 +4933,6 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
   ParseScope StructScope(this, Scope::ClassScope|Scope::DeclScope);
   Actions.ActOnTagStartDefinition(getCurScope(), TagDecl);
 
-  // `LateAttrParseExperimentalExtOnly=true` requests that only attributes
-  // marked with `LateAttrParseExperimentalExt` are late parsed.
-  LateParsedAttrList LateFieldAttrs(/*PSoon=*/false,
-                                    /*LateAttrParseExperimentalExtOnly=*/true);
-
   // While we still have something to read, read the declarations in the struct.
   while (!tryParseMisplacedModuleImport() && Tok.isNot(tok::r_brace) &&
          Tok.isNot(tok::eof)) {
@@ -5067,19 +4983,18 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
     }
 
     if (!Tok.is(tok::at)) {
-      auto CFieldCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
+      auto CFieldCallback = [&](ParsingFieldDeclarator &FD) {
         // Install the declarator into the current TagDecl.
         Decl *Field =
             Actions.ActOnField(getCurScope(), TagDecl,
                                FD.D.getDeclSpec().getSourceRange().getBegin(),
                                FD.D, FD.BitfieldSize);
         FD.complete(Field);
-        return Field;
       };
 
       // Parse all the comma separated declarators.
       ParsingDeclSpec DS(*this);
-      ParseStructDeclaration(DS, CFieldCallback, &LateFieldAttrs);
+      ParseStructDeclaration(DS, CFieldCallback);
     } else { // Handle @defs
       ConsumeToken();
       if (!Tok.isObjCAtKeyword(tok::objc_defs)) {
@@ -5120,12 +5035,7 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
 
   ParsedAttributes attrs(AttrFactory);
   // If attributes exist after struct contents, parse them.
-  MaybeParseGNUAttributes(attrs, &LateFieldAttrs);
-
-  // Late parse field attributes if necessary.
-  assert(!getLangOpts().CPlusPlus);
-  for (auto *LateAttr : LateFieldAttrs)
-    ParseLexedCAttribute(*LateAttr);
+  MaybeParseGNUAttributes(attrs);
 
   SmallVector<Decl *, 32> FieldDecls(TagDecl->fields());
 
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 6a2088a73c55b..89f4acbd25e49 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -780,16 +780,16 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
       }
 
       bool addedToDeclSpec = false;
-      auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
+      auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) {
         if (FD.D.getIdentifier() == nullptr) {
           Diag(AtLoc, diag::err_objc_property_requires_field_name)
               << FD.D.getSourceRange();
-          return nullptr;
+          return;
         }
         if (FD.BitfieldSize) {
           Diag(AtLoc, diag::err_objc_property_bitfield)
               << FD.D.getSourceRange();
-          return nullptr;
+          return;
         }
 
         // Map a nullability property attribute to a context-sensitive keyword
@@ -818,7 +818,6 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
             MethodImplKind);
 
         FD.complete(Property);
-        return Property;
       };
 
       // Parse all the comma separated declarators.
@@ -2014,7 +2013,7 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
       continue;
     }
 
-    auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
+    auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) {
       assert(getObjCDeclContext() == interfaceDecl &&
              "Ivar should have interfaceDecl as its decl context");
       // Install the declarator into the interface decl.
@@ -2025,7 +2024,6 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
       if (Field)
         AllIvarDecls.push_back(Field);
       FD.complete(Field);
-      return Field;
     };
 
     // Parse all the comma separated declarators.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index c8b71631076ba..30776ff537fb5 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8633,82 +8633,31 @@ static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
   return RD;
 }
 
-enum class CountedByInvalidPointeeTypeKind {
-  INCOMPLETE,
-  SIZELESS,
-  FUNCTION,
-  FLEXIBLE_ARRAY_MEMBER,
-  VALID,
-};
-
-static bool CheckCountedByAttrOnField(
-    Sema &S, FieldDecl *FD, Expr *E,
-    llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
-  // Check the context the attribute is used in
-
+static bool
+CheckCountExpr(Sema &S, FieldDecl *FD, Expr *E,
+               llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
   if (FD->getParent()->isUnion()) {
     S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_in_union)
         << FD->getSourceRange();
     return true;
   }
 
-  const auto FieldTy = FD->getType();
-  if (!FieldTy->isArrayType() && !FieldTy->isPointerType()) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_counted_by_attr_not_on_ptr_or_flexible_array_member)
-        << FD->getLocation();
+  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+    S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
+        << E->getSourceRange();
     return true;
   }
 
   LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
       LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-  if (FieldTy->isArrayType() &&
-      !Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FieldTy,
-                                       StrictFlexArraysLevel, true)) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_counted_by_attr_on_array_not_flexible_array_member)
-        << FD->getLocation();
-    return true;
-  }
 
-  CountedByInvalidPointeeTypeKind InvalidTypeKind =
-      CountedByInvalidPointeeTypeKind::VALID;
-  QualType PointeeTy;
-  int SelectPtrOrArr = 0;
-  if (FieldTy->isPointerType()) {
-    PointeeTy = FieldTy->getPointeeType();
-    SelectPtrOrArr = 0;
-  } else {
-    assert(FieldTy->isArrayType());
-    const ArrayType *AT = S.getASTContext().getAsArrayType(FieldTy);
-    PointeeTy = AT->getElementType();
-    SelectPtrOrArr = 1;
-  }
-  // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
-  // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
-  // when `FieldTy->isArrayType()`.
-  if (PointeeTy->isIncompleteType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
-  } else if (PointeeTy->isSizelessType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
-  } else if (PointeeTy->isFunctionType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
-  } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
-  }
-
-  if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
-    S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_pointee_unknown_size)
-        << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
-        << FD->getSourceRange();
-    return true;
-  }
-
-  // Check the expression
-
-  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
-    S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
-        << E->getSourceRange();
+  if (!Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FD->getType(),
+                                       StrictFlexArraysLevel, true)) {
+    // The "counted_by" attribute must be on a flexible array member.
+    SourceRange SR = FD->getLocation();
+    S.Diag(SR.getBegin(),
+           diag::err_counted_by_attr_not_on_flexible_array_member)
+        << SR;
     return true;
   }
 
@@ -8771,11 +8720,10 @@ static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
 
   llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
-  if (CheckCountedByAttrOnField(S, FD, CountExpr, Decls))
+  if (CheckCountExpr(S, FD, CountExpr, Decls))
     return;
 
-  QualType CAT =
-      S.BuildCountAttributedArrayOrPointerType(FD->getType(), CountExpr);
+  QualType CAT = S.BuildCountAttributedArrayType(FD->getType(), CountExpr);
   FD->setType(CAT);
 }
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index ef0b6b701a52c..c19c8cc34dd3b 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9345,9 +9345,9 @@ BuildTypeCoupledDecls(Expr *E,
   Decls.push_back(TypeCoupledDeclRefInfo(CountDecl, /*IsDref*/ false));
 }
 
-QualType Sema::BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
-                                                      Expr *CountExpr) {
-  assert(WrappedTy->isIncompleteArrayType() || WrappedTy->isPointerType());
+QualType Sema::BuildCountAttributedArrayType(QualType WrappedTy,
+                                             Expr *CountExpr) {
+  assert(WrappedTy->isIncompleteArrayType());
 
   llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
   BuildTypeCoupledDecls(CountExpr, Decls);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 29444f0edc2ae..b10e5ba65eb1c 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7344,7 +7344,7 @@ QualType TreeTransform<Derived>::TransformCountAttributedType(
   if (getDerived().AlwaysRebuild() || InnerTy != OldTy->desugar() ||
       OldCount != NewCount) {
     // Currently, CountAttributedType can only wrap incomplete array types.
-    Result = SemaRef.BuildCountAttributedArrayOrPointerType(InnerTy, NewCount);
+    Result = SemaRef.BuildCountAttributedArrayType(InnerTy, NewCount);
   }
 
   TLB.push<CountAttributedTypeLoc>(Result);
diff --git a/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
deleted file mode 100644
index a585a45eeff03..0000000000000
--- a/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_known {
-  int field;
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-struct on_member_pointer_complete_ty {
-  struct size_known *buf __counted_by(count);
-  int count;
-};
-// CHECK-LABEL: struct on_member_pointer_complete_ty definition
-// CHECK-NEXT: |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT: `-FieldDecl {{.*}} referenced count 'int'
-
-struct on_pointer_anon_count {
-  struct size_known *buf __counted_by(count);
-  struct {
-    int count;
-  };
-};
-
-// CHECK-LABEL: struct on_pointer_anon_count definition
-// CHECK-NEXT:  |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:  |-RecordDecl {{.*}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.*}} count 'int'
-// CHECK-NEXT:  |-FieldDecl {{.*}} implicit 'struct on_pointer_anon_count::(anonymous at {{.*}})'
-// CHECK-NEXT:  `-IndirectFieldDecl {{.*}} implicit referenced count 'int'
-// CHECK-NEXT:    |-Field {{.*}} '' 'struct on_pointer_anon_count::(anonymous at {{.*}})'
-// CHECK-NEXT:    `-Field {{.*}} 'count' 'int'
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute and is **not** late parsed resulting in the `count`
-// field being unavailable.
-//
-// See `clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c` for test
-// cases.
diff --git a/clang/test/AST/attr-counted-by-struct-ptrs.c b/clang/test/AST/attr-counted-by-struct-ptrs.c
deleted file mode 100644
index 79a453d239cd5..0000000000000
--- a/clang/test/AST/attr-counted-by-struct-ptrs.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_unknown;
-struct size_known {
-  int field;
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty definition
-// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_member_pointer_complete_ty {
-  int count;
-  struct size_known * buf __counted_by(count);
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  |-RecordDecl {{.+}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:  |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH:.+]])'
-// CHECK-NEXT:  `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:    |-Field {{.+}} '' 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH]])'
-// CHECK-NEXT:    `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_pointer_anon_buf {
-  int count;
-  struct {
-    struct size_known *buf __counted_by(count);
-  };
-};
-
-struct on_pointer_anon_count {
-  struct {
-    int count;
-  };
-  struct size_known *buf __counted_by(count);
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty_ty_pos definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_member_pointer_complete_ty_ty_pos {
-  int count;
-  struct size_known *__counted_by(count) buf;
-};
-
-// TODO: This should be forbidden but isn't due to counted_by being treated as a
-// declaration attribute. The attribute ends up on the outer most pointer
-// (allowed by sema) even though syntactically its supposed to be on the inner
-// pointer (would not allowed by sema due to pointee being a function type).
-// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_fn_ptr_ty_ty_pos_inner definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} fn_ptr 'void (** __counted_by(count))(void)':'void (**)(void)'
-struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
-  int count;
-  void (* __counted_by(count) * fn_ptr)(void);
-};
-
-// FIXME: The generated AST here is wrong. The attribute should be on the inner
-// pointer.
-// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_inner definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
-struct on_nested_pointer_inner {
-  int count;
-  // TODO: This should be disallowed because in the `-fbounds-safety` model
-  // `__counted_by` can only be nested when used in function parameters.
-  struct size_known *__counted_by(count) *buf;
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_outer definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
-struct on_nested_pointer_outer {
-  int count;
-  struct size_known **__counted_by(count) buf;
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf_ty_pos definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  |-RecordDecl {{.+}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:  |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2:.+]])'
-// CHECK-NEXT:  `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:    |-Field {{.+}} '' 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2]])'
-// CHECK-NEXT:    `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_pointer_anon_buf_ty_pos {
-  int count;
-  struct {
-    struct size_known * __counted_by(count) buf;
-  };
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_count_ty_pos definition
-// CHECK-NEXT:  |-RecordDecl {{.+}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.+}} count 'int'
-// CHECK-NEXT:  |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3:.+]])'
-// CHECK-NEXT:  |-IndirectFieldDecl {{.+}} implicit referenced count 'int'
-// CHECK-NEXT:  | |-Field {{.+}} '' 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3]])'
-// CHECK-NEXT:  | `-Field {{.+}} 'count' 'int'
-struct on_pointer_anon_count_ty_pos {
-  struct {
-    int count;
-  };
-  struct size_known *__counted_by(count) buf;
-};
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-off.c b/clang/test/Sema/attr-counted-by-late-parsed-off.c
deleted file mode 100644
index 34f51d10c0838..0000000000000
--- a/clang/test/Sema/attr-counted-by-late-parsed-off.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify %s
-// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fsyntax-only -verify %s
-
-// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify=ok %s
-// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fsyntax-only -verify=ok %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_known { int dummy; };
-
-#ifdef NEEDS_LATE_PARSING
-struct on_decl {
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *buf __counted_by(count);
-  int count;
-};
-
-#else
-
-// ok-no-diagnostics
-struct on_decl {
-  int count;
-  struct size_known *buf __counted_by(count);
-};
-
-#endif
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
deleted file mode 100644
index 9ff3b080f6576..0000000000000
--- a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
+++ /dev/null
@@ -1,254 +0,0 @@
-// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_unknown;
-struct size_known {
-  int field;
-};
-
-typedef void(*fn_ptr_ty)(void);
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-struct on_member_pointer_complete_ty {
-  struct size_known * buf __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_incomplete_ty {
-  struct size_unknown * buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
-  int count;
-};
-
-struct on_member_pointer_const_incomplete_ty {
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
-  const struct size_unknown * buf __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_void_ty {
-  void* buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty {
-  // buffer of `count` function pointers is allowed
-  void (**fn_ptr)(void) __counted_by(count);
-  int count;
-};
-
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty {
-  // buffer of `count` function pointers is allowed
-  fn_ptr_ty* fn_ptr __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_fn_ty {
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  void (*fn_ptr)(void) __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ty {
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  fn_ptr_ty fn_ptr __counted_by(count);
-  int count;
-};
-
-struct has_unannotated_vla {
-  int count;
-  int buffer[];
-};
-
-struct on_member_pointer_struct_with_vla {
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
-  struct has_unannotated_vla* objects __counted_by(count);
-  int count;
-};
-
-struct has_annotated_vla {
-  int count;
-  int buffer[] __counted_by(count);
-};
-
-// Currently prevented because computing the size of `objects` at runtime would
-// require an O(N) walk of `objects` to take into account the length of the VLA
-// in each struct instance.
-struct on_member_pointer_struct_with_annotated_vla {
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
-  struct has_annotated_vla* objects __counted_by(count);
-  int count;
-};
-
-struct on_pointer_anon_buf {
-  // TODO: Support referring to parent scope
-  struct {
-    // expected-error@+1{{use of undeclared identifier 'count'}}
-    struct size_known *buf __counted_by(count);
-  };
-  int count;
-};
-
-struct on_pointer_anon_count {
-  struct size_known *buf __counted_by(count);
-  struct {
-    int count;
-  };
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute and is **not** late parsed resulting in the `count`
-// field being unavailable.
-
-struct on_member_pointer_complete_ty_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *__counted_by(count) buf;
-  int count;
-};
-
-struct on_member_pointer_incomplete_ty_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_unknown * __counted_by(count) buf;
-  int count;
-};
-
-struct on_member_pointer_const_incomplete_ty_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  const struct size_unknown * __counted_by(count) buf;
-  int count;
-};
-
-struct on_member_pointer_void_ty_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being an incomplete type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void *__counted_by(count) buf;
-  int count;
-};
-
-// -
-
-struct on_member_pointer_fn_ptr_ty_pos {
-  // TODO: buffer of `count` function pointers should be allowed
-  // but fails because this isn't late parsed.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (** __counted_by(count) fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
-  // TODO: buffer of `count` function pointers should be allowed
-  // but fails because this isn't late parsed.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  fn_ptr_ty* __counted_by(count) fn_ptr;
-  int count;
-};
-
-struct on_member_pointer_fn_ty_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a function type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (* __counted_by(count) fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ty_pos {
-  // TODO: buffer of `count` function pointers should be allowed
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (** __counted_by(count) fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_typedef_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a function type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  fn_ptr_ty __counted_by(count) fn_ptr;
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a function type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (* __counted_by(count) * fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_struct_with_vla_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a struct type with a VLA.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct has_unannotated_vla *__counted_by(count) objects;
-  int count;
-};
-
-struct on_member_pointer_struct_with_annotated_vla_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a struct type with a VLA.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct has_annotated_vla* __counted_by(count) objects;
-  int count;
-};
-
-struct on_nested_pointer_inner {
-  // TODO: This should be disallowed because in the `-fbounds-safety` model
-  // `__counted_by` can only be nested when used in function parameters.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *__counted_by(count) *buf;
-  int count;
-};
-
-struct on_nested_pointer_outer {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known **__counted_by(count) buf;
-  int count;
-};
-
-struct on_pointer_anon_buf_ty_pos {
-  struct {
-    // TODO: Support referring to parent scope
-    // expected-error@+1{{use of undeclared identifier 'count'}}
-    struct size_known * __counted_by(count) buf;
-  };
-  int count;
-};
-
-struct on_pointer_anon_count_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *__counted_by(count) buf;
-  struct {
-    int count;
-  };
-};
-
-//==============================================================================
-// __counted_by on struct non-pointer members
-//==============================================================================
-
-struct on_pod_ty {
-  // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
-  int wrong_ty __counted_by(count);
-  int count;
-};
-
-struct on_void_ty {
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{field has incomplete type 'void'}}
-  void wrong_ty __counted_by(count);
-  int count;
-};
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
deleted file mode 100644
index 9b0f2eafb13c2..0000000000000
--- a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
+++ /dev/null
@@ -1,17 +0,0 @@
-// __SVInt8_t is specific to ARM64 so specify that in the target triple
-// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct on_sizeless_pointee_ty {
-    int count;
-    // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because '__SVInt8_t' is a sizeless type}}
-    __SVInt8_t* member __counted_by(count);
-};
-
-struct on_sizeless_ty {
-    int count;
-    // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-    // expected-error@+1{{field has sizeless type '__SVInt8_t'}}
-    __SVInt8_t member __counted_by(count);
-};
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c
deleted file mode 100644
index cd2bfe36938b2..0000000000000
--- a/clang/test/Sema/attr-counted-by-struct-ptrs.c
+++ /dev/null
@@ -1,224 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_unknown;
-struct size_known {
-  int field;
-};
-
-typedef void(*fn_ptr_ty)(void);
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-struct on_member_pointer_complete_ty {
-  int count;
-  struct size_known * buf __counted_by(count);
-};
-
-struct on_member_pointer_incomplete_ty {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
-  struct size_unknown * buf __counted_by(count);
-};
-
-struct on_member_pointer_const_incomplete_ty {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
-  const struct size_unknown * buf __counted_by(count);
-};
-
-struct on_member_pointer_void_ty {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
-  void* buf __counted_by(count);
-};
-
-struct on_member_pointer_fn_ptr_ty {
-  int count;
-  // buffer of `count` function pointers is allowed
-  void (**fn_ptr)(void) __counted_by(count);
-};
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty {
-  int count;
-  // buffer of `count` function pointers is allowed
-  fn_ptr_ty* fn_ptr __counted_by(count);
-};
-
-struct on_member_pointer_fn_ty {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  void (*fn_ptr)(void) __counted_by(count);
-};
-
-struct on_member_pointer_fn_ptr_ty_ty {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  fn_ptr_ty fn_ptr __counted_by(count);
-};
-
-struct has_unannotated_vla {
-  int count;
-  int buffer[];
-};
-
-struct on_member_pointer_struct_with_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
-  struct has_unannotated_vla* objects __counted_by(count);
-};
-
-struct has_annotated_vla {
-  int count;
-  int buffer[] __counted_by(count);
-};
-
-// Currently prevented because computing the size of `objects` at runtime would
-// require an O(N) walk of `objects` to take into account the length of the VLA
-// in each struct instance.
-struct on_member_pointer_struct_with_annotated_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
-  struct has_annotated_vla* objects __counted_by(count);
-};
-
-struct on_pointer_anon_buf {
-  int count;
-  struct {
-    struct size_known *buf __counted_by(count);
-  };
-};
-
-struct on_pointer_anon_count {
-  struct {
-    int count;
-  };
-  struct size_known *buf __counted_by(count);
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute
-
-struct on_member_pointer_complete_ty_ty_pos {
-  int count;
-  struct size_known *__counted_by(count) buf;
-};
-
-struct on_member_pointer_incomplete_ty_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
-  struct size_unknown * __counted_by(count) buf;
-};
-
-struct on_member_pointer_const_incomplete_ty_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
-  const struct size_unknown * __counted_by(count) buf;
-};
-
-struct on_member_pointer_void_ty_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
-  void *__counted_by(count) buf;
-};
-
-// -
-
-struct on_member_pointer_fn_ptr_ty_pos {
-  int count;
-  // buffer of `count` function pointers is allowed
-  void (** __counted_by(count) fn_ptr)(void);
-};
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
-  int count;
-  // buffer of `count` function pointers is allowed
-  fn_ptr_ty* __counted_by(count) fn_ptr;
-};
-
-struct on_member_pointer_fn_ty_ty_pos {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  void (* __counted_by(count) fn_ptr)(void);
-};
-
-struct on_member_pointer_fn_ptr_ty_ty_pos {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  fn_ptr_ty __counted_by(count) fn_ptr;
-};
-
-// TODO: This should be forbidden but isn't due to counted_by being treated
-// as a declaration attribute.
-struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
-  int count;
-  void (* __counted_by(count) * fn_ptr)(void);
-};
-
-struct on_member_pointer_struct_with_vla_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
-  struct has_unannotated_vla *__counted_by(count) objects;
-};
-
-// Currently prevented because computing the size of `objects` at runtime would
-// require an O(N) walk of `objects` to take into account the length of the VLA
-// in each struct instance.
-struct on_member_pointer_struct_with_annotated_vla_ty_pos {
-  int count;
-  // expected-error@+1{{counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
-  struct has_annotated_vla* __counted_by(count) objects;
-};
-
-struct on_nested_pointer_inner {
-  // TODO: This should be disallowed because in the `-fbounds-safety` model
-  // `__counted_by` can only be nested when used in function parameters.
-  int count;
-  struct size_known *__counted_by(count) *buf;
-};
-
-struct on_nested_pointer_outer {
-  int count;
-  struct size_known **__counted_by(count) buf;
-};
-
-struct on_pointer_anon_buf_ty_pos {
-  int count;
-  struct {
-    struct size_known * __counted_by(count) buf;
-  };
-};
-
-struct on_pointer_anon_count_ty_pos {
-  struct {
-    int count;
-  };
-  struct size_known *__counted_by(count) buf;
-};
-
-//==============================================================================
-// __counted_by on struct non-pointer members
-//==============================================================================
-
-struct on_pod_ty {
-  int count;
-  // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
-  int wrong_ty __counted_by(count);
-};
-
-struct on_void_ty {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{field has incomplete type 'void'}}
-  void wrong_ty __counted_by(count);
-};
diff --git a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
deleted file mode 100644
index 31c0007501c48..0000000000000
--- a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// __SVInt8_t is specific to ARM64 so specify that in the target triple
-// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct on_sizeless_elt_ty {
-    int count;
-    // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-    // expected-error@+1{{array has sizeless element type '__SVInt8_t'}}
-    __SVInt8_t arr[] __counted_by(count);
-};
diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c
deleted file mode 100644
index 3de6bd55e2d8e..0000000000000
--- a/clang/test/Sema/attr-counted-by-vla.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct bar;
-
-struct not_found {
-  int count;
-  struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
-};
-
-struct no_found_count_not_in_substruct {
-  unsigned long flags;
-  unsigned char count; // expected-note {{'count' declared here}}
-  struct A {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
-  } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct {
-  unsigned char count; // expected-note {{'count' declared here}}
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
-  } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct_2 {
-  struct {
-    unsigned char count; // expected-note {{'count' declared here}}
-  };
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
-  } a;
-};
-
-struct not_found_count_in_other_unnamed_substruct {
-  struct {
-    unsigned char count;
-  } a1;
-
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-  };
-};
-
-struct not_found_count_in_other_substruct {
-  struct _a1 {
-    unsigned char count;
-  } a1;
-
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-  };
-};
-
-struct not_found_count_in_other_substruct_2 {
-  struct _a2 {
-    unsigned char count;
-  } a2;
-
-  int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-};
-
-struct not_found_suggest {
-  int bork;
-  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
-};
-
-int global; // expected-note {{'global' declared here}}
-
-struct found_outside_of_struct {
-  int bork;
-  struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
-};
-
-struct self_referrential {
-  int bork;
-  struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
-};
-
-struct non_int_count {
-  double dbl_count;
-  struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct array_of_ints_count {
-  int integers[2];
-  struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct not_a_fam {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct bar' is an incomplete type}}
-  struct bar *non_fam __counted_by(count);
-};
-
-struct not_a_c99_fam {
-  int count;
-  struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' on arrays only applies to C99 flexible array members}}
-};
-
-struct annotated_with_anon_struct {
-  unsigned long flags;
-  struct {
-    unsigned char count;
-    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
-  };
-};
-
-//==============================================================================
-// __counted_by on a struct VLA with element type that has unknown size
-//==============================================================================
-
-struct size_unknown; // expected-note 2{{forward declaration of 'struct size_unknown'}}
-struct on_member_arr_incomplete_ty_ty_pos {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{array has incomplete element type 'struct size_unknown'}}
-  struct size_unknown buf[] __counted_by(count);
-};
-
-struct on_member_arr_incomplete_const_ty_ty_pos {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{array has incomplete element type 'const struct size_unknown'}}
-  const struct size_unknown buf[] __counted_by(count);
-};
-
-struct on_member_arr_void_ty_ty_pos {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{array has incomplete element type 'void'}}
-  void buf[] __counted_by(count);
-};
-
-typedef void(fn_ty)(int);
-
-struct on_member_arr_fn_ptr_ty {
-  int count;
-  // An Array of function pointers is allowed
-  fn_ty* buf[] __counted_by(count);
-};
-
-struct on_member_arr_fn_ty {
-  int count;
-  // An array of functions is not allowed.
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{'buf' declared as array of functions of type 'fn_ty' (aka 'void (int)')}}
-  fn_ty buf[] __counted_by(count);
-};
-
-
-// `buffer_of_structs_with_unnannotated_vla`,
-// `buffer_of_structs_with_annotated_vla`, and
-// `buffer_of_const_structs_with_annotated_vla` are currently prevented because
-// computing the size of `Arr` at runtime would require an O(N) walk of `Arr`
-// elements to take into account the length of the VLA in each struct instance.
-
-struct has_unannotated_VLA {
-  int count;
-  char buffer[];
-};
-
-struct has_annotated_VLA {
-  int count;
-  char buffer[] __counted_by(count);
-};
-
-struct buffer_of_structs_with_unnannotated_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member}}
-  struct has_unannotated_VLA Arr[] __counted_by(count);
-};
-
-
-struct buffer_of_structs_with_annotated_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member}}
-  struct has_annotated_VLA Arr[] __counted_by(count);
-};
-
-struct buffer_of_const_structs_with_annotated_vla {
-  int count;
-  // Make sure the `const` qualifier is printed when printing the element type.
-  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member}}
-  const struct has_annotated_VLA Arr[] __counted_by(count);
-};
-
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
new file mode 100644
index 0000000000000..d5d4ebf557392
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by.c
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define __counted_by(f)  __attribute__((counted_by(f)))
+
+struct bar;
+
+struct not_found {
+  int count;
+  struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
+};
+
+struct no_found_count_not_in_substruct {
+  unsigned long flags;
+  unsigned char count; // expected-note {{'count' declared here}}
+  struct A {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct {
+  unsigned char count; // expected-note {{'count' declared here}}
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct_2 {
+  struct {
+    unsigned char count; // expected-note {{'count' declared here}}
+  };
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_in_other_unnamed_substruct {
+  struct {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct {
+  struct _a1 {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct_2 {
+  struct _a2 {
+    unsigned char count;
+  } a2;
+
+  int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+};
+
+struct not_found_suggest {
+  int bork;
+  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
+};
+
+int global; // expected-note {{'global' declared here}}
+
+struct found_outside_of_struct {
+  int bork;
+  struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
+};
+
+struct self_referrential {
+  int bork;
+  struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
+};
+
+struct non_int_count {
+  double dbl_count;
+  struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct array_of_ints_count {
+  int integers[2];
+  struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct not_a_fam {
+  int count;
+  struct bar *non_fam __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
+};
+
+struct not_a_c99_fam {
+  int count;
+  struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
+};
+
+struct annotated_with_anon_struct {
+  unsigned long flags;
+  struct {
+    unsigned char count;
+    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
+  };
+};

>From c587483da0b50efa04146fde205da1d16731e12e Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Sun, 19 May 2024 06:21:40 -0700
Subject: [PATCH 17/44] Revert "[Bounds-Safety] Fix
 `pragma-attribute-supported-attributes-list.test`"

Issue #92687

This reverts commit 112eadd55f06bee15caadff688ea0b45acbfa804.
---
 clang/test/Misc/pragma-attribute-supported-attributes-list.test | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 99732694f72a5..fd0e6d71baa80 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -63,6 +63,7 @@
 // CHECK-NEXT: CoroOnlyDestroyWhenComplete (SubjectMatchRule_record)
 // CHECK-NEXT: CoroReturnType (SubjectMatchRule_record)
 // CHECK-NEXT: CoroWrapper (SubjectMatchRule_function)
+// CHECK-NEXT: CountedBy (SubjectMatchRule_field)
 // CHECK-NEXT: DLLExport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
 // CHECK-NEXT: DLLImport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
 // CHECK-NEXT: Destructor (SubjectMatchRule_function)

>From 10edb4991c12738e60843d55cd9edbf6d702d9eb Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu at amd.com>
Date: Sun, 19 May 2024 16:59:03 +0300
Subject: [PATCH 18/44] [Clang][CodeGen] Start migrating away from assuming the
 Default AS is 0 (#88182)

At the moment, Clang is rather liberal in assuming that 0 (and by extension unqualified) is always a safe default. This does not work for targets that actually use a different value for the default / generic AS (for example, the SPIRV obtained from HIPSPV or SYCL). This patch is a first, fairly safe step towards trying to clear things up by querying a module's default AS from the target, rather than assuming it's 0, alongside fixing a few places where things break / we encode the 0 == DefaultAS assumption. A bunch of existing tests are extended to check for non-zero default AS usage.
---
 clang/lib/CodeGen/CGException.cpp             |   5 +-
 clang/lib/CodeGen/CGExprCXX.cpp               |   7 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |   3 +-
 clang/lib/CodeGen/CodeGenTypeCache.h          |   2 +-
 .../CodeGenCXX/dynamic-cast-address-space.cpp | 123 ++++++++++++++++--
 clang/test/CodeGenCXX/eh.cpp                  |   6 +-
 clang/test/CodeGenCXX/nrvo.cpp                |   4 +-
 .../template-param-objects-address-space.cpp  |  10 ++
 ...w-expression-typeinfo-in-address-space.cpp |   2 +
 .../try-catch-with-address-space.cpp          |   7 +-
 .../typeid-cxx11-with-address-space.cpp       |   4 +
 .../CodeGenCXX/typeid-with-address-space.cpp  |  11 ++
 .../typeinfo-with-address-space.cpp           |   7 +
 .../vtable-assume-load-address-space.cpp      | 110 ++++++++++------
 ...e-pointer-initialization-address-space.cpp |   7 +
 clang/test/CodeGenCXX/vtt-address-space.cpp   |   7 +
 clang/test/CodeGenCXX/wasm-eh.cpp             |   4 +-
 llvm/examples/ExceptionDemo/ExceptionDemo.cpp |   2 +-
 llvm/include/llvm/IR/Intrinsics.td            |   4 +-
 .../WebAssembly/lower-em-exceptions.ll        |   6 +-
 .../GVNHoist/infinite-loop-indirect.ll        |   6 +-
 llvm/test/Transforms/Inline/inline_invoke.ll  |  10 +-
 .../Transforms/LICM/scalar-promote-unwind.ll  |   6 +-
 .../LowerTypeTests/cfi-unwind-direct-call.ll  |   6 +-
 .../Transforms/NewGVN/2011-09-07-TypeIdFor.ll |  14 +-
 .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td   |   2 +-
 mlir/test/Target/LLVMIR/Import/intrinsic.ll   |   4 +-
 .../test/Target/LLVMIR/llvmir-intrinsics.mlir |   2 +-
 28 files changed, 283 insertions(+), 98 deletions(-)

diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp
index 34f289334a7df..8acda3f2eb864 100644
--- a/clang/lib/CodeGen/CGException.cpp
+++ b/clang/lib/CodeGen/CGException.cpp
@@ -1052,7 +1052,8 @@ static void emitWasmCatchPadBlock(CodeGenFunction &CGF,
   CGF.Builder.CreateStore(Exn, CGF.getExceptionSlot());
   llvm::CallInst *Selector = CGF.Builder.CreateCall(GetSelectorFn, CPI);
 
-  llvm::Function *TypeIDFn = CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for);
+  llvm::Function *TypeIDFn =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for, {CGF.VoidPtrTy});
 
   // If there's only a single catch-all, branch directly to its handler.
   if (CatchScope.getNumHandlers() == 1 &&
@@ -1137,7 +1138,7 @@ static void emitCatchDispatchBlock(CodeGenFunction &CGF,
 
   // Select the right handler.
   llvm::Function *llvm_eh_typeid_for =
-    CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for);
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for, {CGF.VoidPtrTy});
   llvm::Type *argTy = llvm_eh_typeid_for->getArg(0)->getType();
   LangAS globAS = CGF.CGM.GetGlobalVarAddressSpace(nullptr);
 
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index c18c36d3f3f32..0cfdb7effe470 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -2216,7 +2216,12 @@ static llvm::Value *EmitTypeidFromVTable(CodeGenFunction &CGF, const Expr *E,
 }
 
 llvm::Value *CodeGenFunction::EmitCXXTypeidExpr(const CXXTypeidExpr *E) {
-  llvm::Type *PtrTy = llvm::PointerType::getUnqual(getLLVMContext());
+  // Ideally, we would like to use GlobalsInt8PtrTy here, however, we cannot,
+  // primarily because the result of applying typeid is a value of type
+  // type_info, which is declared & defined by the standard library
+  // implementation and expects to operate on the generic (default) AS.
+  // https://reviews.llvm.org/D157452 has more context, and a possible solution.
+  llvm::Type *PtrTy = Int8PtrTy;
   LangAS GlobAS = CGM.GetGlobalVarAddressSpace(nullptr);
 
   auto MaybeASCast = [=](auto &&TypeInfo) {
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 489c08a4d4819..227813ad44e8b 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -368,7 +368,8 @@ CodeGenModule::CodeGenModule(ASTContext &C,
   IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth());
   IntPtrTy = llvm::IntegerType::get(LLVMContext,
     C.getTargetInfo().getMaxPointerWidth());
-  Int8PtrTy = llvm::PointerType::get(LLVMContext, 0);
+  Int8PtrTy = llvm::PointerType::get(LLVMContext,
+                                     C.getTargetAddressSpace(LangAS::Default));
   const llvm::DataLayout &DL = M.getDataLayout();
   AllocaInt8PtrTy =
       llvm::PointerType::get(LLVMContext, DL.getAllocaAddrSpace());
diff --git a/clang/lib/CodeGen/CodeGenTypeCache.h b/clang/lib/CodeGen/CodeGenTypeCache.h
index 083d69214fb3c..e273ebe3b060f 100644
--- a/clang/lib/CodeGen/CodeGenTypeCache.h
+++ b/clang/lib/CodeGen/CodeGenTypeCache.h
@@ -51,7 +51,7 @@ struct CodeGenTypeCache {
     llvm::IntegerType *PtrDiffTy;
   };
 
-  /// void*, void** in address space 0
+  /// void*, void** in the target's default address space (often 0)
   union {
     llvm::PointerType *UnqualPtrTy;
     llvm::PointerType *VoidPtrTy;
diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index 83a408984b760..3d5e32516c7af 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -1,24 +1,127 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --no-generate-body-for-unused-prefixes --version 4
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+
 struct A { virtual void f(); };
 struct B : A { };
 
-// CHECK: {{define.*@_Z1fP1A}}
-// CHECK-SAME:  personality ptr @__gxx_personality_v0
 B fail;
+//.
+// CHECK: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8
+// CHECK: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8
+// CHECK: @_ZTI1A = external addrspace(1) constant ptr addrspace(1)
+// CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)]
+// CHECK: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1
+// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8
+// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8
+// WITH-NONZERO-DEFAULT-AS: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZTI1A = external addrspace(1) constant ptr addrspace(1)
+// WITH-NONZERO-DEFAULT-AS: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)]
+// WITH-NONZERO-DEFAULT-AS: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1
+// WITH-NONZERO-DEFAULT-AS: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8
+//.
+// CHECK-LABEL: define dso_local noundef nonnull align 8 dereferenceable(8) ptr @_Z1fP1A(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[EXN_SLOT:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call ptr @__dynamic_cast(ptr [[TMP0]], ptr addrspace(1) @_ZTI1A, ptr addrspace(1) @_ZTI1B, i64 0) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null
+// CHECK-NEXT:    br i1 [[TMP2]], label [[DYNAMIC_CAST_BAD_CAST:%.*]], label [[DYNAMIC_CAST_END:%.*]]
+// CHECK:       dynamic_cast.bad_cast:
+// CHECK-NEXT:    invoke void @__cxa_bad_cast() #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+// CHECK:       invoke.cont:
+// CHECK-NEXT:    unreachable
+// CHECK:       dynamic_cast.end:
+// CHECK-NEXT:    br label [[TRY_CONT:%.*]]
+// CHECK:       lpad:
+// CHECK-NEXT:    [[TMP3:%.*]] = landingpad { ptr, i32 }
+// CHECK-NEXT:            catch ptr null
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0
+// CHECK-NEXT:    store ptr [[TMP4]], ptr addrspace(5) [[EXN_SLOT]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 1
+// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(5) [[EHSELECTOR_SLOT]], align 4
+// CHECK-NEXT:    br label [[CATCH:%.*]]
+// CHECK:       catch:
+// CHECK-NEXT:    [[EXN:%.*]] = load ptr, ptr addrspace(5) [[EXN_SLOT]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = call ptr @__cxa_begin_catch(ptr [[EXN]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @__cxa_end_catch()
+// CHECK-NEXT:    br label [[TRY_CONT]]
+// CHECK:       try.cont:
+// CHECK-NEXT:    ret ptr addrspacecast (ptr addrspace(1) @fail to ptr)
+//
+// WITH-NONZERO-DEFAULT-AS-LABEL: define spir_func noundef align 8 dereferenceable(8) ptr addrspace(4) @_Z1fP1A(
+// WITH-NONZERO-DEFAULT-AS-SAME: ptr addrspace(4) noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+// WITH-NONZERO-DEFAULT-AS-NEXT:  entry:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[RETVAL:%.*]] = alloca ptr addrspace(4), align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[EXN_SLOT:%.*]] = alloca ptr addrspace(4), align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// WITH-NONZERO-DEFAULT-AS-NEXT:    store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP1:%.*]] = call spir_func ptr addrspace(4) @__dynamic_cast(ptr addrspace(4) [[TMP0]], ptr addrspace(1) @_ZTI1A, ptr addrspace(1) @_ZTI1B, i64 0) #[[ATTR3:[0-9]+]]
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP2:%.*]] = icmp eq ptr addrspace(4) [[TMP1]], null
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br i1 [[TMP2]], label [[DYNAMIC_CAST_BAD_CAST:%.*]], label [[DYNAMIC_CAST_END:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       dynamic_cast.bad_cast:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    invoke spir_func void @__cxa_bad_cast() #[[ATTR4:[0-9]+]]
+// WITH-NONZERO-DEFAULT-AS-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       invoke.cont:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    unreachable
+// WITH-NONZERO-DEFAULT-AS:       dynamic_cast.end:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br label [[TRY_CONT:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       lpad:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP3:%.*]] = landingpad { ptr addrspace(4), i32 }
+// WITH-NONZERO-DEFAULT-AS-NEXT:            catch ptr addrspace(4) null
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP4:%.*]] = extractvalue { ptr addrspace(4), i32 } [[TMP3]], 0
+// WITH-NONZERO-DEFAULT-AS-NEXT:    store ptr addrspace(4) [[TMP4]], ptr [[EXN_SLOT]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP5:%.*]] = extractvalue { ptr addrspace(4), i32 } [[TMP3]], 1
+// WITH-NONZERO-DEFAULT-AS-NEXT:    store i32 [[TMP5]], ptr [[EHSELECTOR_SLOT]], align 4
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br label [[CATCH:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       catch:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[EXN:%.*]] = load ptr addrspace(4), ptr [[EXN_SLOT]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP6:%.*]] = call spir_func ptr addrspace(4) @__cxa_begin_catch(ptr addrspace(4) [[EXN]]) #[[ATTR3]]
+// WITH-NONZERO-DEFAULT-AS-NEXT:    call spir_func void @__cxa_end_catch()
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br label [[TRY_CONT]]
+// WITH-NONZERO-DEFAULT-AS:       try.cont:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    ret ptr addrspace(4) addrspacecast (ptr addrspace(1) @fail to ptr addrspace(4))
+//
 const B& f(A *a) {
   try {
-    // CHECK: call ptr @__dynamic_cast
-    // CHECK: br i1
-    // CHECK: invoke void @__cxa_bad_cast() [[NR:#[0-9]+]]
     dynamic_cast<const B&>(*a);
   } catch (...) {
-    // CHECK:      landingpad { ptr, i32 }
-    // CHECK-NEXT:   catch ptr null
   }
   return fail;
 }
 
-// CHECK: declare ptr @__dynamic_cast(ptr, ptr addrspace(1), ptr addrspace(1), i64) [[NUW_RO:#[0-9]+]]
 
-// CHECK: attributes [[NUW_RO]] = { nounwind willreturn memory(read) }
-// CHECK: attributes [[NR]] = { noreturn }
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR3]] = { nounwind }
+// CHECK: attributes #[[ATTR4]] = { noreturn }
+//.
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// WITH-NONZERO-DEFAULT-AS: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// WITH-NONZERO-DEFAULT-AS: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGenCXX/eh.cpp b/clang/test/CodeGenCXX/eh.cpp
index 5c592a96e27b7..f174b5d84fdf5 100644
--- a/clang/test/CodeGenCXX/eh.cpp
+++ b/clang/test/CodeGenCXX/eh.cpp
@@ -81,7 +81,7 @@ namespace test5 {
 // CHECK:      invoke void @__cxa_throw(ptr [[EXNOBJ]], ptr @_ZTIN5test51AE, ptr @_ZN5test51AD1Ev) [[NR]]
 // CHECK-NEXT:   to label {{%.*}} unwind label %[[HANDLER:[^ ]*]]
 //      :    [[HANDLER]]:  (can't check this in Release-Asserts builds)
-// CHECK:      {{%.*}} = call i32 @llvm.eh.typeid.for(ptr @_ZTIN5test51AE)
+// CHECK:      {{%.*}} = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIN5test51AE)
 }
 
 namespace test6 {
@@ -96,7 +96,7 @@ namespace test6 {
 
 // PR7127
 namespace test7 {
-// CHECK-LABEL:      define{{.*}} i32 @_ZN5test73fooEv() 
+// CHECK-LABEL:      define{{.*}} i32 @_ZN5test73fooEv()
 // CHECK-SAME:  personality ptr @__gxx_personality_v0
   int foo() {
 // CHECK:      [[CAUGHTEXNVAR:%.*]] = alloca ptr
@@ -119,7 +119,7 @@ namespace test7 {
 // CHECK-NEXT: store i32 [[SELECTOR]], ptr [[SELECTORVAR]]
 // CHECK-NEXT: br label
 // CHECK:      [[SELECTOR:%.*]] = load i32, ptr [[SELECTORVAR]]
-// CHECK-NEXT: [[T0:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+// CHECK-NEXT: [[T0:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
 // CHECK-NEXT: icmp eq i32 [[SELECTOR]], [[T0]]
 // CHECK-NEXT: br i1
 // CHECK:      [[T0:%.*]] = load ptr, ptr [[CAUGHTEXNVAR]]
diff --git a/clang/test/CodeGenCXX/nrvo.cpp b/clang/test/CodeGenCXX/nrvo.cpp
index 33dc4cf9dbc8d..23ac04511514d 100644
--- a/clang/test/CodeGenCXX/nrvo.cpp
+++ b/clang/test/CodeGenCXX/nrvo.cpp
@@ -628,7 +628,7 @@ void may_throw();
 // CHECK-EH-03-NEXT:    br label [[CATCH_DISPATCH:%.*]]
 // CHECK-EH-03:       catch.dispatch:
 // CHECK-EH-03-NEXT:    [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK-EH-03-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTI1X) #[[ATTR7]]
+// CHECK-EH-03-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI1X) #[[ATTR7]]
 // CHECK-EH-03-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[SEL]], [[TMP3]]
 // CHECK-EH-03-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 // CHECK-EH-03:       catch:
@@ -707,7 +707,7 @@ void may_throw();
 // CHECK-EH-11-NEXT:    br label [[CATCH_DISPATCH:%.*]]
 // CHECK-EH-11:       catch.dispatch:
 // CHECK-EH-11-NEXT:    [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK-EH-11-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTI1X) #[[ATTR6]]
+// CHECK-EH-11-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI1X) #[[ATTR6]]
 // CHECK-EH-11-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[SEL]], [[TMP3]]
 // CHECK-EH-11-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 // CHECK-EH-11:       catch:
diff --git a/clang/test/CodeGenCXX/template-param-objects-address-space.cpp b/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
index b54dcfe77934e..b3733decdb550 100644
--- a/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
+++ b/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -std=c++20 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-unknown-unknown -fsycl-is-device -std=c++20 %s -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct S { char buf[32]; };
 template<S s> constexpr const char *begin() { return s.buf; }
@@ -8,25 +9,34 @@ extern const void *callee(const S*);
 template<S s> constexpr const void* observable_addr() { return callee(&s); }
 
 // CHECK: [[HELLO:@_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE]]
+// WITH-NONZERO-DEFAULT-AS: [[HELLO:@_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE]]
 // CHECK-SAME: = linkonce_odr addrspace(1) constant { <{ [11 x i8], [21 x i8] }> } { <{ [11 x i8], [21 x i8] }> <{ [11 x i8] c"hello world", [21 x i8] zeroinitializer }> }, comdat
 
 // CHECK: @p
 // CHECK-SAME: addrspace(1) global ptr addrspacecast (ptr addrspace(1) [[HELLO]] to ptr)
+// WITH-NONZERO-DEFAULT-AS: addrspace(1) global ptr addrspace(4) addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4))
 const char *p = begin<S{"hello world"}>();
 
 // CHECK: @q
 // CHECK-SAME: addrspace(1) global ptr addrspacecast (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) [[HELLO]], i64 11) to ptr)
+// WITH-NONZERO-DEFAULT-AS: addrspace(1) global ptr addrspace(4) addrspacecast (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) [[HELLO]], i64 11) to ptr addrspace(4))
 const char *q = end<S{"hello world"}>();
 
 const void *(*r)() = &retval<S{"hello world"}>;
 
 // CHECK: @s
 // CHECK-SAME: addrspace(1) global ptr null
+// WITH-NONZERO-DEFAULT-AS: addrspace(1) global ptr addrspace(4) null
 const void *s = observable_addr<S{"hello world"}>();
 
 // CHECK: define linkonce_odr noundef ptr @_Z6retvalIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} noundef ptr addrspace(4) @_Z6retvalIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
 // CHECK: ret ptr addrspacecast (ptr addrspace(1) [[HELLO]] to ptr)
+// WITH-NONZERO-DEFAULT-AS: ret ptr addrspace(4) addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4))
 
 // CHECK: define linkonce_odr noundef ptr @_Z15observable_addrIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} noundef ptr addrspace(4) @_Z15observable_addrIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
 // CHECK: %call = call noundef ptr @_Z6calleePK1S(ptr noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr))
+// WITH-NONZERO-DEFAULT-AS: %call = call {{.*}} noundef ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4)))
 // CHECK: declare noundef ptr @_Z6calleePK1S(ptr noundef)
+// WITH-NONZERO-DEFAULT-AS: declare {{.*}} noundef ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef)
diff --git a/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp b/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
index d8c23d427e67a..3acbdd8fd97ee 100644
--- a/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
+++ b/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct X {
   ~X();
@@ -15,3 +16,4 @@ void f() {
 }
 
 // CHECK: declare void @__cxa_throw(ptr, ptr addrspace(1), ptr)
+// WITH-NONZERO-DEFAULT-AS: declare{{.*}} void @__cxa_throw(ptr addrspace(4), ptr addrspace(1), ptr addrspace(4))
diff --git a/clang/test/CodeGenCXX/try-catch-with-address-space.cpp b/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
index 279d29f50fd41..412ac6c287258 100644
--- a/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct X { };
 
@@ -10,7 +11,8 @@ void f() {
     // CHECK: ptr addrspace(1) @_ZTI1X
   } catch (const X x) {
     // CHECK: catch ptr addrspace(1) @_ZTI1X
-    // CHECK: call i32 @llvm.eh.typeid.for(ptr addrspacecast (ptr addrspace(1) @_ZTI1X to ptr))
+    // CHECK: call i32 @llvm.eh.typeid.for.p0(ptr addrspacecast (ptr addrspace(1) @_ZTI1X to ptr))
+    // WITH-NONZERO-DEFAULT-AS: call i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTI1X to ptr addrspace(4)))
   }
 }
 
@@ -20,6 +22,7 @@ void h() {
     // CHECK: ptr addrspace(1) @_ZTIPKc
   } catch (char const(&)[4]) {
     // CHECK: catch ptr addrspace(1) @_ZTIA4_c
-    // CHECK: call i32 @llvm.eh.typeid.for(ptr addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr))
+    // CHECK: call i32 @llvm.eh.typeid.for.p0(ptr addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr))
+    // WITH-NONZERO-DEFAULT-AS: call i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr addrspace(4)))
   }
 }
diff --git a/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp b/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
index c4e7d36acff13..f6dc38ec9f292 100644
--- a/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 #include <typeinfo>
 
 namespace Test1 {
@@ -19,14 +20,17 @@ struct B : virtual A {};
 struct C { int n; };
 
 // CHECK: @_ZN5Test15itemsE ={{.*}} constant [4 x {{.*}}] [{{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr), {{.*}} @_ZN5Test19make_implINS_1AEEEPvv {{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr), {{.*}} @_ZN5Test19make_implINS_1BEEEPvv {{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11CE to ptr), {{.*}} @_ZN5Test19make_implINS_1CEEEPvv {{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIi to ptr), {{.*}} @_ZN5Test19make_implIiEEPvv }]
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test15itemsE ={{.*}} addrspace(1) constant [4 x {{.*}}] [{{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implINS_1AEEEPvv {{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implINS_1BEEEPvv {{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11CE to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implINS_1CEEEPvv {{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implIiEEPvv }]
 extern constexpr Item items[] = {
   item<A>("A"), item<B>("B"), item<C>("C"), item<int>("int")
 };
 
 // CHECK: @_ZN5Test11xE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test11xE ={{.*}} addrspace(1) constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr addrspace(4)), align 8
 constexpr auto &x = items[0].ti;
 
 // CHECK: @_ZN5Test11yE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test11yE ={{.*}} addrspace(1) constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr addrspace(4)), align 8
 constexpr auto &y = typeid(B{});
 
 }
diff --git a/clang/test/CodeGenCXX/typeid-with-address-space.cpp b/clang/test/CodeGenCXX/typeid-with-address-space.cpp
index b439770a8b631..98af17f4fc888 100644
--- a/clang/test/CodeGenCXX/typeid-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeid-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 #include <typeinfo>
 
 namespace Test1 {
@@ -7,19 +8,23 @@ namespace Test1 {
 struct A { virtual void f(); };
 
 // CHECK: @_ZN5Test16int_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIi to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test16int_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), align 8
 const std::type_info &int_ti = typeid(int);
 
 // CHECK: @_ZN5Test14A_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test14A_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr addrspace(4)), align 8
 const std::type_info &A_ti = typeid(const volatile A &);
 
 volatile char c;
 
 // CHECK: @_ZN5Test14c_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIc to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test14c_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIc to ptr addrspace(4)), align 8
 const std::type_info &c_ti = typeid(c);
 
 extern const double &d;
 
 // CHECK: @_ZN5Test14d_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTId to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test14d_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTId to ptr addrspace(4)), align 8
 const std::type_info &d_ti = typeid(d);
 
 extern A &a;
@@ -28,18 +33,24 @@ extern A &a;
 const std::type_info &a_ti = typeid(a);
 
 // CHECK: @_ZN5Test18A10_c_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIA10_c to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test18A10_c_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIA10_c to ptr addrspace(4)), align 8
 const std::type_info &A10_c_ti = typeid(char const[10]);
 
 // CHECK-LABEL: define{{.*}} ptr @_ZN5Test11fEv
 // CHECK-SAME:  personality ptr @__gxx_personality_v0
+// WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} ptr addrspace(4) @_ZN5Test11fEv
+// WITH-NONZERO-DEFAULT-AS-SAME:  personality ptr @__gxx_personality_v0
 const char *f() {
   try {
     // CHECK: br i1
     // CHECK: invoke void @__cxa_bad_typeid() [[NR:#[0-9]+]]
+    // WITH-NONZERO-DEFAULT-AS: invoke{{.*}} void @__cxa_bad_typeid() [[NR:#[0-9]+]]
     return typeid(*static_cast<A *>(0)).name();
   } catch (...) {
     // CHECK:      landingpad { ptr, i32 }
     // CHECK-NEXT:   catch ptr null
+    // WITH-NONZERO-DEFAULT-AS:      landingpad { ptr addrspace(4), i32 }
+    // WITH-NONZERO-DEFAULT-AS-NEXT:   catch ptr addrspace(4) null
   }
 
   return 0;
diff --git a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
index 80f6ab0903e51..350303cc6e9b3 100644
--- a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s -check-prefix=AS
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -o - | FileCheck %s -check-prefix=NONZERO-DEFAULT-AS
 // RUN: %clang_cc1 -I%S %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s -check-prefix=NO-AS
 #include <typeinfo>
 
@@ -25,24 +26,30 @@ class B : A {
 
 unsigned long Fn(B& b) {
 // AS: %call = call noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr), ptr {{.*}} %2)
+// NONZERO-DEFAULT-AS: %call = call{{.*}} noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr addrspace(4)), ptr addrspace(4) {{.*}} %2)
 // NO-AS: %call = call noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr {{.*}} @_ZTISt9type_info, ptr {{.*}} %2)
     if (typeid(std::type_info) == typeid(b))
         return 42;
 // AS: %call2 = call noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr), ptr {{.*}} %5)
+// NONZERO-DEFAULT-AS: %call2 = call{{.*}} noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), ptr addrspace(4) {{.*}} %5)
 // NO-AS: %call2 = call noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr {{.*}} @_ZTIi, ptr {{.*}} %5)
     if (typeid(int) != typeid(b))
         return 1712;
 // AS: %call5 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr))
+// NONZERO-DEFAULT-AS: %call5 = call{{.*}} noundef ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr addrspace(4)))
 // NO-AS: %call5 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} @_ZTI1A)
 // AS: %call7 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} %8)
+// NONZERO-DEFAULT-AS: %call7 = call{{.*}} noundef ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} %8)
 // NO-AS: %call7 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} %8)
     if (typeid(A).name() == typeid(b).name())
         return 0;
 // AS: %call11 = call noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr {{.*}} %11, ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr))
+// NONZERO-DEFAULT-AS: %call11 = call{{.*}} noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr addrspace(4) {{.*}} %11, ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr addrspace(4)))
 // NO-AS:   %call11 = call noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr {{.*}} %11, ptr {{.*}} @_ZTIf)
     if (typeid(b).before(typeid(float)))
         return 1;
 // AS: %call15 = call noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr {{.*}} %14)
+// NONZERO-DEFAULT-AS: %call15 = call{{.*}} noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr addrspace(4) {{.*}} %14)
 // NO-AS: %call15 = call noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr {{.*}} %14)
     return typeid(b).hash_code();
 }
diff --git a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
index d765fe94d9b08..ecafa99d8be00 100644
--- a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
@@ -1,14 +1,17 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -emit-llvm -o %t.ms.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
 // FIXME: Assume load should not require -fstrict-vtable-pointers
 
 // RUN: FileCheck --check-prefix=CHECK1 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK2 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK3 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK4 --input-file=%t.ll %s
-// RUN: FileCheck --check-prefix=CHECK5 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK-MS --input-file=%t.ms.ll %s
 // RUN: FileCheck --check-prefix=CHECK6 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK7 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK8 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK9 --input-file=%t.ll %s
 namespace test1 {
 
 struct A {
@@ -23,8 +26,8 @@ struct B : A {
 void g(A *a) { a->foo(); }
 
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooAEv()
-// CHECK1: call void @_ZN5test11AC1Ev(ptr
-// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK1: call{{.*}} void @_ZN5test11AC1Ev(ptr {{((addrspace(4)){0,1})}}
+// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11AE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
@@ -35,8 +38,8 @@ void fooA() {
 }
 
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooBEv()
-// CHECK1: call void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
-// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK1: call{{.*}} void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
+// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11BE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
@@ -46,7 +49,7 @@ void fooB() {
   g(&b);
 }
 // there should not be any assumes in the ctor that calls base ctor
-// CHECK1-LABEL: define linkonce_odr void @_ZN5test11BC2Ev(ptr
+// CHECK1-LABEL: define linkonce_odr{{.*}} void @_ZN5test11BC2Ev(ptr
 // CHECK1-NOT: @llvm.assume(
 // CHECK1-LABEL: {{^}}}
 }
@@ -69,17 +72,17 @@ void g(A *a) { a->foo(); }
 void h(B *b) { b->bar(); }
 
 // CHECK2-LABEL: define{{.*}} void @_ZN5test24testEv()
-// CHECK2: call void @_ZN5test21CC1Ev(ptr
+// CHECK2: call{{.*}} void @_ZN5test21CC1Ev(ptr
 // CHECK2: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{.*}}
 // CHECK2: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 0, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP]])
 
-// CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 8
-// CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %[[ADD_PTR]]
+// CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}, i64 8
+// CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%[[ADD_PTR]]
 // CHECK2: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 1, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP2]])
 
-// CHECK2: call void @_ZN5test21gEPNS_1AE(
+// CHECK2: call{{.*}} void @_ZN5test21gEPNS_1AE(
 // CHECK2-LABEL: {{^}}}
 
 void test() {
@@ -106,7 +109,7 @@ struct C : virtual A, B {
 void g(B *a) { a->foo(); }
 
 // CHECK3-LABEL: define{{.*}} void @_ZN5test34testEv()
-// CHECK3: call void @_ZN5test31CC1Ev(ptr
+// CHECK3: call{{.*}} void @_ZN5test31CC1Ev(ptr
 // CHECK3: %[[CMP:.*]] = icmp eq ptr addrspace(1) %{{.*}}, getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test31CE, i32 0, i32 0, i32 3)
 // CHECK3: call void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
@@ -134,12 +137,12 @@ struct C : B {
 void g(C *c) { c->foo(); }
 
 // CHECK4-LABEL: define{{.*}} void @_ZN5test44testEv()
-// CHECK4: call void @_ZN5test41CC1Ev(ptr
-// CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK4: call{{.*}} void @_ZN5test41CC1Ev(ptr
+// CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK4: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP]]
 
-// CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK4: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: {{^}}}
@@ -150,6 +153,27 @@ void test() {
 }
 } // test4
 
+namespace testMS {
+
+struct __declspec(novtable) S {
+  virtual void foo();
+};
+
+void g(S &s) { s.foo(); }
+
+// if struct has novtable specifier, then we can't generate assumes
+// CHECK-MS-LABEL: define dso_local void @"?test@testMS@@YAXXZ"()
+// CHECK-MS: call x86_thiscallcc noundef ptr @"??0S@testMS@@QAE@XZ"(
+// CHECK-MS-NOT: @llvm.assume
+// CHECK-MS-LABEL: {{^}}}
+
+void test() {
+  S s;
+  g(s);
+}
+
+} // testMS
+
 namespace test6 {
 struct A {
   A();
@@ -161,17 +185,17 @@ struct B : A {
 };
 // FIXME: Because A's vtable is external, and no virtual functions are hidden,
 // it's safe to generate assumption loads.
-// CHECK5-LABEL: define{{.*}} void @_ZN5test61gEv()
-// CHECK5: call void @_ZN5test61AC1Ev(
-// CHECK5-NOT: call void @llvm.assume(
+// CHECK6-LABEL: define{{.*}} void @_ZN5test61gEv()
+// CHECK6: call{{.*}} void @_ZN5test61AC1Ev(
+// CHECK6-NOT: call void @llvm.assume(
 
 // We can't emit assumption loads for B, because if we would refer to vtable
 // it would refer to functions that will not be able to find (like implicit
 // inline destructor).
 
-// CHECK5-LABEL:   call void @_ZN5test61BC1Ev(
-// CHECK5-NOT: call void @llvm.assume(
-// CHECK5-LABEL: {{^}}}
+// CHECK6-LABEL:   call{{.*}} void @_ZN5test61BC1Ev(
+// CHECK6-NOT: call void @llvm.assume(
+// CHECK6-LABEL: {{^}}}
 void g() {
   A *a = new A;
   B *b = new B;
@@ -180,7 +204,7 @@ void g() {
 
 namespace test7 {
 // Because A's key function is defined here, vtable is generated in this TU
-// CHECK6: @_ZTVN5test71AE ={{.*}} unnamed_addr addrspace(1) constant
+// CHECK7: @_ZTVN5test71AE ={{.*}} unnamed_addr addrspace(1) constant
 struct A {
   A();
   virtual void foo();
@@ -188,10 +212,10 @@ struct A {
 };
 void A::foo() {}
 
-// CHECK6-LABEL: define{{.*}} void @_ZN5test71gEv()
-// CHECK6: call void @_ZN5test71AC1Ev(
-// CHECK6: call void @llvm.assume(
-// CHECK6-LABEL: {{^}}}
+// CHECK7-LABEL: define{{.*}} void @_ZN5test71gEv()
+// CHECK7: call{{.*}} void @_ZN5test71AC1Ev(
+// CHECK7: call void @llvm.assume(
+// CHECK7-LABEL: {{^}}}
 void g() {
   A *a = new A();
   a->bar();
@@ -205,14 +229,14 @@ struct A {
   virtual void bar();
 };
 
-// CHECK7-DAG: @_ZTVN5test81BE = available_externally unnamed_addr addrspace(1) constant
+// CHECK8-DAG: @_ZTVN5test81BE = available_externally unnamed_addr addrspace(1) constant
 struct B : A {
   B();
   void foo();
   void bar();
 };
 
-// CHECK7-DAG: @_ZTVN5test81CE = linkonce_odr unnamed_addr addrspace(1) constant
+// CHECK8-DAG: @_ZTVN5test81CE = linkonce_odr unnamed_addr addrspace(1) constant
 struct C : A {
   C();
   void bar();
@@ -227,14 +251,14 @@ struct D : A {
 };
 void D::bar() {}
 
-// CHECK7-DAG: @_ZTVN5test81EE = linkonce_odr unnamed_addr addrspace(1) constant
+// CHECK8-DAG: @_ZTVN5test81EE = linkonce_odr unnamed_addr addrspace(1) constant
 struct E : A {
   E();
 };
 
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81bEv()
-// CHECK7: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81bEv()
+// CHECK8: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void b() {
   B b;
   b.bar();
@@ -243,26 +267,26 @@ void b() {
 // FIXME: C has inline virtual functions which prohibits as from generating
 // assumption loads, but because vtable is generated in this TU (key function
 // defined here) it would be correct to refer to it.
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81cEv()
-// CHECK7-NOT: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81cEv()
+// CHECK8-NOT: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void c() {
   C c;
   c.bar();
 }
 
 // FIXME: We could generate assumption loads here.
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81dEv()
-// CHECK7-NOT: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81dEv()
+// CHECK8-NOT: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void d() {
   D d;
   d.bar();
 }
 
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81eEv()
-// CHECK7: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81eEv()
+// CHECK8: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void e() {
   E e;
   e.bar();
@@ -276,9 +300,9 @@ struct S {
   __attribute__((visibility("hidden"))) virtual void doStuff();
 };
 
-// CHECK8-LABEL: define{{.*}} void @_ZN5test94testEv()
-// CHECK8-NOT: @llvm.assume(
-// CHECK8: }
+// CHECK9-LABEL: define{{.*}} void @_ZN5test94testEv()
+// CHECK9-NOT: @llvm.assume(
+// CHECK9: }
 void test() {
   S *s = new S();
   s->doStuff();
diff --git a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
index a3f12f0ebfc87..876d0845cc515 100644
--- a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct Field {
   Field();
@@ -24,6 +25,7 @@ struct A : Base {
 // CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev(
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} void @_ZN1AC2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 A::A() { }
 
 // CHECK-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr {{[^,]*}} %this) unnamed_addr
@@ -31,6 +33,7 @@ A::A() { }
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 A::~A() { }
 
 struct B : Base {
@@ -43,18 +46,22 @@ void f() { B b; }
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC1Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN1BC2Ev(
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BC1Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD1Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN1BD2Ev(
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BD1Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
 // CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BC2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BD2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
diff --git a/clang/test/CodeGenCXX/vtt-address-space.cpp b/clang/test/CodeGenCXX/vtt-address-space.cpp
index 24f4e2a755da0..4c3d0a534611c 100644
--- a/clang/test/CodeGenCXX/vtt-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtt-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 // This is the sample from the C++ Itanium ABI, p2.6.2.
 namespace Test {
@@ -25,3 +26,9 @@ namespace Test {
 // CHECK: define linkonce_odr void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %this, ptr addrspace(1) noundef %vtt)
 // CHECK: define linkonce_odr void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
 // CHECK: define linkonce_odr void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
+// WITH-NONZERO-DEFAULT-AS: call {{.*}} void @_ZN4Test2V2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(20) %2, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 11))
+// WITH-NONZERO-DEFAULT-AS: call {{.*}} void @_ZN4Test2C1C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %this1, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 1))
+// WITH-NONZERO-DEFAULT-AS: call {{.*}} void @_ZN4Test2C2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %3, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 3))
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} void @_ZN4Test2V2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(20) %this, ptr addrspace(1) noundef %vtt)
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} void @_ZN4Test2C1C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %this, ptr addrspace(1) noundef %vtt)
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} void @_ZN4Test2C2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %this, ptr addrspace(1) noundef %vtt)
diff --git a/clang/test/CodeGenCXX/wasm-eh.cpp b/clang/test/CodeGenCXX/wasm-eh.cpp
index af023f52191b9..1b17498ba9ce9 100644
--- a/clang/test/CodeGenCXX/wasm-eh.cpp
+++ b/clang/test/CodeGenCXX/wasm-eh.cpp
@@ -34,7 +34,7 @@ void test0() {
 // CHECK-NEXT:   %[[EXN:.*]] = call ptr @llvm.wasm.get.exception(token %[[CATCHPAD]])
 // CHECK-NEXT:   store ptr %[[EXN]], ptr %exn.slot
 // CHECK-NEXT:   %[[SELECTOR:.*]] = call i32 @llvm.wasm.get.ehselector(token %[[CATCHPAD]])
-// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #7
+// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #7
 // CHECK-NEXT:   %[[MATCHES:.*]] = icmp eq i32 %[[SELECTOR]], %[[TYPEID]]
 // CHECK-NEXT:   br i1 %[[MATCHES]], label %[[CATCH_INT_BB:.*]], label %[[CATCH_FALLTHROUGH_BB:.*]]
 
@@ -51,7 +51,7 @@ void test0() {
 // CHECK-NEXT:   br label %[[TRY_CONT_BB:.*]]
 
 // CHECK: [[CATCH_FALLTHROUGH_BB]]
-// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTId) #7
+// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTId) #7
 // CHECK-NEXT:   %[[MATCHES:.*]] = icmp eq i32 %[[SELECTOR]], %[[TYPEID]]
 // CHECK-NEXT:   br i1 %[[MATCHES]], label %[[CATCH_FLOAT_BB:.*]], label %[[RETHROW_BB:.*]]
 
diff --git a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp
index 0afc6b30d140e..fdee76cb96146 100644
--- a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -1865,7 +1865,7 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
 
   // llvm.eh.typeid.for intrinsic
 
-  getDeclaration(&module, llvm::Intrinsic::eh_typeid_for);
+  getDeclaration(&module, llvm::Intrinsic::eh_typeid_for, builder.getPtrTy());
 }
 
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 78f0dbec863e9..3019f68083d42 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1371,7 +1371,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
 
 // The result of eh.typeid.for depends on the enclosing function, but inside a
 // given function it is 'const' and may be CSE'd etc.
-def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
+def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem]>;
 
 def int_eh_return_i32 : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>;
 def int_eh_return_i64 : Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty]>;
@@ -1730,7 +1730,7 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic<
 
 ///===-------------------------- Other Intrinsics --------------------------===//
 //
-// TODO: We should introduce a new memory kind fo traps (and other side effects 
+// TODO: We should introduce a new memory kind fo traps (and other side effects
 //       we only model to keep things alive).
 def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrInaccessibleMemOnly,
                IntrWriteMem]>, ClangBuiltin<"__builtin_trap">;
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
index d17a5b419e351..f6b36c56c6d3d 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
@@ -44,7 +44,7 @@ lpad:                                             ; preds = %entry
 ; CHECK-NEXT: %[[CDR:.*]] = extractvalue { ptr, i32 } %[[IVI2]], 1
 
 catch.dispatch:                                   ; preds = %lpad
-  %3 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %3 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %matches = icmp eq i32 %2, %3
   br i1 %matches, label %catch1, label %catch
 ; CHECK: catch.dispatch:
@@ -139,7 +139,7 @@ lpad:                                             ; preds = %entry
   br label %catch.dispatch
 
 catch.dispatch:                                   ; preds = %lpad
-  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %matches = icmp eq i32 %3, %4
   br i1 %matches, label %catch1, label %catch
 
@@ -162,7 +162,7 @@ declare void @foo(i32)
 declare ptr @bar(i8, i8)
 
 declare i32 @__gxx_personality_v0(...)
-declare i32 @llvm.eh.typeid.for(ptr)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
 declare ptr @__cxa_begin_catch(ptr)
 declare void @__cxa_end_catch()
 declare void @__cxa_call_unexpected(ptr)
diff --git a/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll b/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll
index aef55af81dcac..a7e6ff30d8b2f 100644
--- a/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll
+++ b/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll
@@ -292,7 +292,7 @@ define i32 @foo2(ptr nocapture readonly %i) local_unnamed_addr personality ptr @
 ; CHECK-NEXT:    [[BC1:%.*]] = add i32 [[TMP0]], 10
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #[[ATTR1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #[[ATTR1]]
 ; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[BC7:%.*]] = add i32 [[TMP0]], 10
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) #[[ATTR1]]
@@ -340,7 +340,7 @@ lpad:
   %bc1 = add i32 %0, 10
   %3 = extractvalue { ptr, i32 } %2, 0
   %4 = extractvalue { ptr, i32 } %2, 1
-  %5 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #2
+  %5 = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #2
   %matches = icmp eq i32 %4, %5
   %bc7 = add i32 %0, 10
   %6 = tail call ptr @__cxa_begin_catch(ptr %3) #2
@@ -383,7 +383,7 @@ declare void @__cxa_throw(ptr, ptr, ptr) local_unnamed_addr
 declare i32 @__gxx_personality_v0(...)
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.typeid.for(ptr) #1
+declare i32 @llvm.eh.typeid.for.p0(ptr) #1
 
 declare ptr @__cxa_begin_catch(ptr) local_unnamed_addr
 
diff --git a/llvm/test/Transforms/Inline/inline_invoke.ll b/llvm/test/Transforms/Inline/inline_invoke.ll
index 89c56447c07bd..5441e2a9e63b9 100644
--- a/llvm/test/Transforms/Inline/inline_invoke.ll
+++ b/llvm/test/Transforms/Inline/inline_invoke.ll
@@ -19,7 +19,7 @@ declare void @use(i32) nounwind
 
 declare void @opaque()
 
-declare i32 @llvm.eh.typeid.for(ptr) nounwind
+declare i32 @llvm.eh.typeid.for.p0(ptr) nounwind
 
 declare i32 @__gxx_personality_v0(...)
 
@@ -74,7 +74,7 @@ lpad:                                             ; preds = %entry
             catch ptr @_ZTIi
   %eh.exc = extractvalue { ptr, i32 } %exn, 0
   %eh.selector = extractvalue { ptr, i32 } %exn, 1
-  %0 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) nounwind
+  %0 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) nounwind
   %1 = icmp eq i32 %eh.selector, %0
   br i1 %1, label %catch, label %eh.resume
 
@@ -109,7 +109,7 @@ eh.resume:
 ; CHECK-NEXT: phi { ptr, i32 } [
 ; CHECK-NEXT: extractvalue { ptr, i32 }
 ; CHECK-NEXT: extractvalue { ptr, i32 }
-; CHECK-NEXT: call i32 @llvm.eh.typeid.for(
+; CHECK-NEXT: call i32 @llvm.eh.typeid.for.p0(
 
 
 ;; Test 1 - Correctly handle phis in outer landing pads.
@@ -133,7 +133,7 @@ lpad:
             catch ptr @_ZTIi
   %eh.exc = extractvalue { ptr, i32 } %exn, 0
   %eh.selector = extractvalue { ptr, i32 } %exn, 1
-  %0 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) nounwind
+  %0 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) nounwind
   %1 = icmp eq i32 %eh.selector, %0
   br i1 %1, label %catch, label %eh.resume
 
@@ -212,7 +212,7 @@ eh.resume:
 ; CHECK-NEXT: [[EXNJ1:%.*]] = phi { ptr, i32 } [ [[EXNJ2]], %[[LPAD_JOIN2]] ], [ [[LPADVAL1]], %[[RESUME1]] ]
 ; CHECK-NEXT: extractvalue { ptr, i32 } [[EXNJ1]], 0
 ; CHECK-NEXT: [[SELJ1:%.*]] = extractvalue { ptr, i32 } [[EXNJ1]], 1
-; CHECK-NEXT: [[T:%.*]] = call i32 @llvm.eh.typeid.for(
+; CHECK-NEXT: [[T:%.*]] = call i32 @llvm.eh.typeid.for.p0(
 ; CHECK-NEXT: icmp eq i32 [[SELJ1]], [[T]]
 
 ; CHECK:      call void @use(i32 [[XJ1]])
diff --git a/llvm/test/Transforms/LICM/scalar-promote-unwind.ll b/llvm/test/Transforms/LICM/scalar-promote-unwind.ll
index be11722d2d567..f7829c4d6e4d9 100644
--- a/llvm/test/Transforms/LICM/scalar-promote-unwind.ll
+++ b/llvm/test/Transforms/LICM/scalar-promote-unwind.ll
@@ -304,7 +304,7 @@ define void @loop_within_tryblock() personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
 ; CHECK-NEXT:    br label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
 ; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 ; CHECK:       catch:
@@ -355,7 +355,7 @@ lpad:
   br label %catch.dispatch
 
 catch.dispatch:
-  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #3
+  %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #3
   %matches = icmp eq i32 %3, %4
   br i1 %matches, label %catch, label %eh.resume
 
@@ -564,6 +564,6 @@ declare ptr @__cxa_begin_catch(ptr)
 
 declare void @__cxa_end_catch()
 
-declare i32 @llvm.eh.typeid.for(ptr)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
 
 declare void @f() uwtable
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
index 3e1f8b97e98b8..4d5055cc5a760 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
@@ -65,7 +65,7 @@ lpad:                                             ; preds = %cfi.cont
   %1 = landingpad { ptr, i32 }
   catch ptr @_ZTIi
   %2 = extractvalue { ptr, i32 } %1, 1
-  %3 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTIi) #5
+  %3 = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi) #5
   %matches = icmp eq i32 %2, %3
   br i1 %matches, label %catch, label %eh.resume
 
@@ -90,7 +90,7 @@ declare void @__cfi_slowpath(i64, ptr) local_unnamed_addr
 declare i32 @__gxx_personality_v0(...)
 
 ; Function Attrs: nofree nosync nounwind memory(none)
-declare i32 @llvm.eh.typeid.for(ptr) #2
+declare i32 @llvm.eh.typeid.for.p0(ptr) #2
 
 declare ptr @__cxa_begin_catch(ptr) local_unnamed_addr
 
@@ -181,7 +181,7 @@ attributes #8 = { noreturn nounwind }
 ; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { ptr, i32 }
 ; CHECK-NEXT:            catch ptr @_ZTIi
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTIi) #[[ATTR6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi) #[[ATTR6]]
 ; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 ; CHECK:       catch:
diff --git a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
index 675e7da26a105..afd7610b71624 100644
--- a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
+++ b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
@@ -10,7 +10,7 @@ declare void @_Z4barv()
 
 declare void @_Z7cleanupv()
 
-declare i32 @llvm.eh.typeid.for(ptr) nounwind readonly
+declare i32 @llvm.eh.typeid.for.p0(ptr) nounwind readonly
 
 declare ptr @__cxa_begin_catch(ptr) nounwind
 
@@ -32,11 +32,11 @@ define void @_Z3foov() uwtable personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:            catch ptr @_ZTIb
 ; CHECK-NEXT:    [[EXC_PTR2_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0
 ; CHECK-NEXT:    [[FILTER3_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
-; CHECK-NEXT:    [[TYPEID_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+; CHECK-NEXT:    [[TYPEID_I:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID_I]]
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PPAD:%.*]], label [[NEXT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TYPEID1_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+; CHECK-NEXT:    [[TYPEID1_I:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIb)
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID1_I]]
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[PPAD2:%.*]], label [[NEXT2:%.*]]
 ; CHECK:       ppad:
@@ -77,12 +77,12 @@ lpad:                                             ; preds = %entry
   catch ptr @_ZTIb
   %exc_ptr2.i = extractvalue { ptr, i32 } %0, 0
   %filter3.i = extractvalue { ptr, i32 } %0, 1
-  %typeid.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %typeid.i = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %1 = icmp eq i32 %filter3.i, %typeid.i
   br i1 %1, label %ppad, label %next
 
 next:                                             ; preds = %lpad
-  %typeid1.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+  %typeid1.i = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIb)
   %2 = icmp eq i32 %filter3.i, %typeid1.i
   br i1 %2, label %ppad2, label %next2
 
@@ -98,12 +98,12 @@ ppad2:                                            ; preds = %next
 
 next2:                                            ; preds = %next
   call void @_Z7cleanupv()
-  %typeid = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %typeid = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %4 = icmp eq i32 %filter3.i, %typeid
   br i1 %4, label %ppad3, label %next3
 
 next3:                                            ; preds = %next2
-  %typeid1 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+  %typeid1 = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIb)
   %5 = icmp eq i32 %filter3.i, %typeid1
   br i1 %5, label %ppad4, label %unwind
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index bd347d0cf6308..57af89f5dbf8d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -635,7 +635,7 @@ def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend", [0]>,
 // Exception handling intrinsics.
 //
 
-def LLVM_EhTypeidForOp : LLVM_OneResultIntrOp<"eh.typeid.for"> {
+def LLVM_EhTypeidForOp : LLVM_OneResultIntrOp<"eh.typeid.for", [], [0]> {
     let arguments = (ins LLVM_AnyPointer:$type_info);
     let assemblyFormat = "$type_info attr-dict `:` functional-type(operands, results)";
 }
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index e43024ff868e5..9a5528002ef5e 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -732,7 +732,7 @@ define void @coro_promise(ptr %0, i32 %1, i1 %2) {
 ; CHECK-LABEL:  llvm.func @eh_typeid_for
 define void @eh_typeid_for(ptr %0) {
   ; CHECK: llvm.intr.eh.typeid.for %{{.*}} : (!llvm.ptr) -> i32
-  %2 = call i32 @llvm.eh.typeid.for(ptr %0)
+  %2 = call i32 @llvm.eh.typeid.for.p0(ptr %0)
   ret void
 }
 
@@ -1082,7 +1082,7 @@ declare i1 @llvm.coro.end(ptr, i1, token)
 declare ptr @llvm.coro.free(token, ptr nocapture readonly)
 declare void @llvm.coro.resume(ptr)
 declare ptr @llvm.coro.promise(ptr nocapture, i32, i1)
-declare i32 @llvm.eh.typeid.for(ptr)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
 declare ptr @llvm.stacksave.p0()
 declare ptr addrspace(1) @llvm.stacksave.p1()
 declare void @llvm.stackrestore.p0(ptr)
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index 238c3e4263cb0..1e533aeacfb49 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -724,7 +724,7 @@ llvm.func @coro_promise(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : i1) {
 
 // CHECK-LABEL: @eh_typeid_for
 llvm.func @eh_typeid_for(%arg0 : !llvm.ptr) {
-    // CHECK: call i32 @llvm.eh.typeid.for
+    // CHECK: call i32 @llvm.eh.typeid.for.p0
     %0 = llvm.intr.eh.typeid.for %arg0 : (!llvm.ptr) -> i32
     llvm.return
 }

>From 0c7d268ba72767b70c7bf0bc8ae6422c509f94d8 Mon Sep 17 00:00:00 2001
From: aengelke <engelke at in.tum.de>
Date: Sun, 19 May 2024 16:38:53 +0200
Subject: [PATCH 19/44] [CodeGen][SDAG] Skip preferred extend at O0 (#92643)

This is a pure optimization to avoid redundant extensions, but iterating
over all users is expensive, so don't do this at -O0.
---
 llvm/include/llvm/CodeGen/SelectionDAG.h               | 1 +
 llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 979ef8033eb5e..ed6962685f7b0 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -469,6 +469,7 @@ class SelectionDAG {
   MachineFunction &getMachineFunction() const { return *MF; }
   const Pass *getPass() const { return SDAGISelPass; }
 
+  CodeGenOptLevel getOptLevel() const { return OptLevel; }
   const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
   const TargetMachine &getTarget() const { return TM; }
   const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 8fb6b11b8805c..35f840201e4ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -222,8 +222,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
         if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
           InitializeRegForValue(&I);
 
-      // Decide the preferred extend type for a value.
-      PreferredExtendType[&I] = getPreferredExtendForValue(&I);
+      // Decide the preferred extend type for a value. This iterates over all
+      // users and therefore isn't cheap, so don't do this at O0.
+      if (DAG->getOptLevel() != CodeGenOptLevel::None)
+        PreferredExtendType[&I] = getPreferredExtendForValue(&I);
     }
   }
 

>From 9e4ef0dee18c0c99325e8d56f16c149020e89d37 Mon Sep 17 00:00:00 2001
From: aengelke <engelke at in.tum.de>
Date: Sun, 19 May 2024 16:39:19 +0200
Subject: [PATCH 20/44] [CodeGen][SDAG] Track returntwice in lowering info
 (#92640)

This saves an extra iteration over all the instructions of the function.
---
 llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 4 ++++
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp     | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 35f840201e4ba..de22d230b1c32 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -214,6 +214,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
           if (CI->isMustTailCall() && Fn->isVarArg())
             MF->getFrameInfo().setHasMustTailInVarArgFunc(true);
         }
+
+        // Determine if there is a call to setjmp in the machine function.
+        if (Call->hasFnAttr(Attribute::ReturnsTwice))
+          MF->setExposesReturnsTwice(true);
       }
 
       // Mark values used outside their block as exported, by allocating
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index b5694c955b8c8..8addaf1ae3e54 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -680,9 +680,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     }
   }
 
-  // Determine if there is a call to setjmp in the machine function.
-  MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());
-
   // Determine if floating point is used for msvc
   computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI());
 

>From eab92cb7f33be16a6a17549182e9237112b7a183 Mon Sep 17 00:00:00 2001
From: Nhat Nguyen <nhat7203 at gmail.com>
Date: Sun, 19 May 2024 10:57:11 -0400
Subject: [PATCH 21/44] [llvm] Add KnownBits implementations for avgFloor and
 avgCeil (#86445)

This PR addresses issue #84640.
---
 llvm/include/llvm/Support/KnownBits.h         | 12 +++++++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 29 +++++++++++------
 llvm/lib/Support/KnownBits.cpp                | 31 +++++++++++++++++++
 llvm/unittests/Support/KnownBitsTest.cpp      | 12 +++++++
 4 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
index 9b7f405b62564..ba4a5f01036ca 100644
--- a/llvm/include/llvm/Support/KnownBits.h
+++ b/llvm/include/llvm/Support/KnownBits.h
@@ -354,6 +354,18 @@ struct KnownBits {
   /// Compute knownbits resulting from llvm.usub.sat(LHS, RHS)
   static KnownBits usub_sat(const KnownBits &LHS, const KnownBits &RHS);
 
+  /// Compute knownbits resulting from APIntOps::avgFloorS
+  static KnownBits avgFloorS(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute knownbits resulting from APIntOps::avgFloorU
+  static KnownBits avgFloorU(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute knownbits resulting from APIntOps::avgCeilS
+  static KnownBits avgCeilS(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute knownbits resulting from APIntOps::avgCeilU
+  static KnownBits avgCeilU(const KnownBits &LHS, const KnownBits &RHS);
+
   /// Compute known bits resulting from multiplying LHS and RHS.
   static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS,
                        bool NoUndefSelfMultiply = false);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2e1f4b7e5b374..72685a2d77216 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3468,19 +3468,28 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       Known = KnownBits::mulhs(Known, Known2);
     break;
   }
-  case ISD::AVGFLOORU:
-  case ISD::AVGCEILU:
-  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU: {
+    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::avgFloorU(Known, Known2);
+    break;
+  }
+  case ISD::AVGCEILU: {
+    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::avgCeilU(Known, Known2);
+    break;
+  }
+  case ISD::AVGFLOORS: {
+    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::avgFloorS(Known, Known2);
+    break;
+  }
   case ISD::AVGCEILS: {
-    bool IsCeil = Opcode == ISD::AVGCEILU || Opcode == ISD::AVGCEILS;
-    bool IsSigned = Opcode == ISD::AVGFLOORS || Opcode == ISD::AVGCEILS;
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = IsSigned ? Known.sext(BitWidth + 1) : Known.zext(BitWidth + 1);
-    Known2 = IsSigned ? Known2.sext(BitWidth + 1) : Known2.zext(BitWidth + 1);
-    KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
-    Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
-    Known = Known.extractBits(BitWidth, 1);
+    Known = KnownBits::avgCeilS(Known, Known2);
     break;
   }
   case ISD::SELECT:
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index fe47884f3e55a..d6012a8eea8a6 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -774,6 +774,37 @@ KnownBits KnownBits::usub_sat(const KnownBits &LHS, const KnownBits &RHS) {
   return computeForSatAddSub(/*Add*/ false, /*Signed*/ false, LHS, RHS);
 }
 
+static KnownBits avgCompute(KnownBits LHS, KnownBits RHS, bool IsCeil,
+                            bool IsSigned) {
+  unsigned BitWidth = LHS.getBitWidth();
+  LHS = IsSigned ? LHS.sext(BitWidth + 1) : LHS.zext(BitWidth + 1);
+  RHS = IsSigned ? RHS.sext(BitWidth + 1) : RHS.zext(BitWidth + 1);
+  KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
+  LHS = KnownBits::computeForAddCarry(LHS, RHS, Carry);
+  LHS = LHS.extractBits(BitWidth, 1);
+  return LHS;
+}
+
+KnownBits KnownBits::avgFloorS(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ false,
+                    /* IsSigned */ true);
+}
+
+KnownBits KnownBits::avgFloorU(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ false,
+                    /* IsSigned */ false);
+}
+
+KnownBits KnownBits::avgCeilS(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ true,
+                    /* IsSigned */ true);
+}
+
+KnownBits KnownBits::avgCeilU(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ true,
+                    /* IsSigned */ false);
+}
+
 KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
                          bool NoUndefSelfMultiply) {
   unsigned BitWidth = LHS.getBitWidth();
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index d740707027166..824cf7501fd44 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -501,6 +501,18 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       "mulhu", KnownBits::mulhu,
       [](const APInt &N1, const APInt &N2) { return APIntOps::mulhu(N1, N2); },
       /*CheckOptimality=*/false);
+
+  testBinaryOpExhaustive("avgFloorS", KnownBits::avgFloorS, APIntOps::avgFloorS,
+                         false);
+
+  testBinaryOpExhaustive("avgFloorU", KnownBits::avgFloorU, APIntOps::avgFloorU,
+                         false);
+
+  testBinaryOpExhaustive("avgCeilU", KnownBits::avgCeilU, APIntOps::avgCeilU,
+                         false);
+
+  testBinaryOpExhaustive("avgCeilS", KnownBits::avgCeilS, APIntOps::avgCeilS,
+                         false);
 }
 
 TEST(KnownBitsTest, UnaryExhaustive) {

>From c1c1567d60983298a0db0efefd78899985464f19 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 19 May 2024 17:35:42 +0200
Subject: [PATCH 22/44] SimplifyLibCalls: Permit pow(2, x) -> ldexp(1, x) fold
 for vectors (#92532)

---
 .../lib/Transforms/Utils/SimplifyLibCalls.cpp |  7 +-
 .../Transforms/InstCombine/pow-to-ldexp.ll    | 69 ++++++-------------
 2 files changed, 24 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c9567b740026b..eb1224abf00e2 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2087,15 +2087,16 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
 
   AttributeList NoAttrs; // Attributes are only meaningful on the original call
 
+  const bool UseIntrinsic = Pow->doesNotAccessMemory();
+
   // pow(2.0, itofp(x)) -> ldexp(1.0, x)
-  // TODO: This does not work for vectors because there is no ldexp intrinsic.
-  if (!Ty->isVectorTy() && match(Base, m_SpecificFP(2.0)) &&
+  if ((UseIntrinsic || !Ty->isVectorTy()) && match(Base, m_SpecificFP(2.0)) &&
       (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
       hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
     if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) {
       Constant *One = ConstantFP::get(Ty, 1.0);
 
-      if (Pow->doesNotAccessMemory()) {
+      if (UseIntrinsic) {
         return copyFlags(*Pow, B.CreateIntrinsic(Intrinsic::ldexp,
                                                  {Ty, ExpoI->getType()},
                                                  {One, ExpoI}, Pow, "exp2"));
diff --git a/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll b/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
index 27249dd5d72ae..b61f8809bd259 100644
--- a/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
+++ b/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
@@ -144,16 +144,10 @@ define half @pow_sitofp_f16_const_base_2(i32 %x) {
 }
 
 define <2 x float> @pow_sitofp_v2f32_const_base_2(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x float> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x float>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <2 x float> @llvm.pow.v2f32(<2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x float> [[POW]]
+; LDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x float> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
@@ -205,15 +199,10 @@ define <2 x float> @pow_sitofp_v2f32_const_base_mixed_2(<2 x i32> %x) {
 }
 
 define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call nsz afn <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x float> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call nsz afn <2 x float> @llvm.powi.v2f32.v2i32(<2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x i32> [[X]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x float> [[POW]]
+; LDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call nsz afn <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x float> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
@@ -227,16 +216,10 @@ define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(<2 x i32> %x) {
 }
 
 define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(<vscale x 4 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
-; LDEXP-EXP2-SAME: <vscale x 4 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <vscale x 4 x float> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
-; LDEXP-NOEXP2-SAME: <vscale x 4 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <vscale x 4 x float> [[POW]]
+; LDEXP-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
+; LDEXP-SAME: <vscale x 4 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[X]])
+; LDEXP-NEXT:    ret <vscale x 4 x float> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
 ; NOLDEXP-SAME: <vscale x 4 x i32> [[X:%.*]]) {
@@ -250,16 +233,10 @@ define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(<vscale x 4 x i32>
 }
 
 define <2 x half> @pow_sitofp_v2f16_const_base_2(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> <half 0xH3C00, half 0xH3C00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x half> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x half>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <2 x half> @llvm.pow.v2f16(<2 x half> <half 0xH4000, half 0xH4000>, <2 x half> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x half> [[POW]]
+; LDEXP-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> <half 0xH3C00, half 0xH3C00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x half> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
@@ -273,16 +250,10 @@ define <2 x half> @pow_sitofp_v2f16_const_base_2(<2 x i32> %x) {
 }
 
 define <2 x double> @pow_sitofp_v2f64_const_base_2(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x double> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x double>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <2 x double> @llvm.pow.v2f64(<2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x double> [[POW]]
+; LDEXP-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x double> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {

>From b050048d35f6580fb427e6de9063444aa85625c6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 19 May 2024 16:45:23 +0100
Subject: [PATCH 23/44] [VPlan] Simplify (X && Y) || (X && !Y) -> X. (#89386)

Simplify a common pattern generated for masks when folding the tail.

PR: https://github.com/llvm/llvm-project/pull/89386
---
 .../Transforms/Vectorize/VPlanPatternMatch.h  |  8 +++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 15 +++++++++++++-
 .../LoopVectorize/AArch64/masked-call.ll      |  9 +++------
 .../AArch64/scalable-strict-fadd.ll           |  3 +--
 .../LoopVectorize/RISCV/uniform-load-store.ll | 20 ++++---------------
 .../Transforms/LoopVectorize/uniform-blend.ll |  5 +----
 .../unused-blend-mask-for-first-operand.ll    | 12 ++---------
 .../vplan-sink-scalars-and-merge.ll           | 19 +++++-------------
 8 files changed, 37 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 50b08bbb7ebf7..56cbaa4201297 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -270,9 +270,15 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
 
 template <typename Op0_t, typename Op1_t>
 inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
-m_Or(const Op0_t &Op0, const Op1_t &Op1) {
+m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
 }
+
+template <typename Op0_t, typename Op1_t>
+inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>
+m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1);
+}
 } // namespace VPlanPatternMatch
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c0eb6d710ad34..4c968c2834b10 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -935,6 +935,19 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
 #endif
   }
 
+  // Simplify (X && Y) || (X && !Y) -> X.
+  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
+  // recipes to be visited during simplification.
+  VPValue *X, *Y, *X1, *Y1;
+  if (match(&R,
+            m_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                       m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+      X == X1 && Y == Y1) {
+    R.getVPSingleValue()->replaceAllUsesWith(X);
+    return;
+  }
+
   if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
                             m_Mul(m_SpecificInt(1), m_VPValue(A)))))
     return R.getVPSingleValue()->replaceAllUsesWith(A);
@@ -1402,7 +1415,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
         // for dependence analysis). Instead, replace it with an equivalent Add.
         // This is possible as all users of the disjoint OR only access lanes
         // where the operands are disjoint or poison otherwise.
-        if (match(RecWithFlags, m_Or(m_VPValue(A), m_VPValue(B))) &&
+        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
             RecWithFlags->isDisjoint()) {
           VPBuilder Builder(RecWithFlags);
           VPInstruction *New = Builder.createOverflowingOp(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index b91579106261a..d335ac4b69709 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -223,10 +223,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT:    [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP9]])
 ; TFCOMMON-NEXT:    [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFCOMMON-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT:    [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP12]]
 ; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP10]]
 ; TFCOMMON-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP13]])
+; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT:    [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -272,16 +271,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP14]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFA_INTERLEAVE-NEXT:    [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP19]], <vscale x 2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = or <vscale x 2 x i1> [[TMP15]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = or <vscale x 2 x i1> [[TMP16]], [[TMP22]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP17]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP18]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]]
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[TMP24]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ddc004657ed5b..bcf8096f1b738 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1241,9 +1241,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
 ; CHECK-ORDERED-TF-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 1ce4cb928e808..ee70f4aa35850 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -462,13 +462,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> poison)
-; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = or <vscale x 2 x i1> [[TMP13]], [[TMP15]]
 ; TF-SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[TMP17]])
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -510,13 +507,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
 ; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP4]]
 ; TF-FIXEDLEN-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; TF-FIXEDLEN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
+; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; TF-FIXEDLEN-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
@@ -1296,12 +1290,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
 ; TF-SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 2 x i1> [[TMP13]])
-; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = or <vscale x 2 x i1> [[TMP13]], [[TMP16]]
 ; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[TMP17]])
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1344,12 +1335,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
 ; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP2]])
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP5]]
 ; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; TF-FIXEDLEN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
+; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; TF-FIXEDLEN-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 19cbcac6090c6..f33ec1419b114 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -86,11 +86,8 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[MASK1]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> undef
-; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[TMP8]], <4 x i64> [[PREDPHI]], <4 x i64> undef
+; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[MASK1]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
index 0f7bd3d71feb4..d79b4a7cefc25 100644
--- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
@@ -172,8 +172,6 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[Y]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -184,14 +182,8 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP11]], ptr [[B]], ptr poison
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP9]], ptr [[B]], ptr poison
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[PREDPHI]], align 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT3]], <4 x i16> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 1e60e57a5409d..ae5879bb2bae9 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -361,15 +361,12 @@ define void @pred_cfg1(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   EMIT vp<[[NOT:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, vp<[[NOT]]>
-; CHECK-NEXT:   EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]>, vp<[[MASK3]]>
 ; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT:   pred.store.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[OR]]>
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK1]]>
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
@@ -462,16 +459,13 @@ define void @pred_cfg2(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   EMIT vp<[[NOT:%.+]]> = not ir<%c.0>
-; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, vp<[[NOT]]>
-; CHECK-NEXT:   EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]>, vp<[[MASK3]]>
 ; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
-; CHECK-NEXT:   EMIT vp<[[MASK4:%.+]]> = logical-and vp<[[OR]]>, ir<%c.1>
+; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT:   pred.store.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK4]]>
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK3]]>
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
@@ -570,16 +564,13 @@ define void @pred_cfg3(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   EMIT vp<[[NOT:%.+]]> = not ir<%c.0>
-; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, vp<[[NOT]]>
-; CHECK-NEXT:   EMIT vp<[[MASK4:%.+]]> = or vp<[[MASK2]]>, vp<[[MASK3]]>
 ; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
-; CHECK-NEXT:   EMIT vp<[[MASK5:%.+]]> = logical-and vp<[[MASK4]]>, ir<%c.0>
+; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT:   pred.store.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK5]]>
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK3]]>
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:

>From 643f36184bd3d9a95cbfd608af6f1cccc69e0187 Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas at microsoft.com>
Date: Sun, 19 May 2024 09:27:56 -0700
Subject: [PATCH 24/44] HLSL availability diagnostics design doc (#92207)

Design document for the HLSL availability diagnostic modes

Fixes microsoft/hlsl-specs#190

---------

Co-authored-by: Xiang Li <python3kgae at outlook.com>
---
 clang/docs/HLSL/AvailabilityDiagnostics.rst | 137 ++++++++++++++++++++
 clang/docs/HLSL/HLSLDocs.rst                |   1 +
 2 files changed, 138 insertions(+)
 create mode 100644 clang/docs/HLSL/AvailabilityDiagnostics.rst

diff --git a/clang/docs/HLSL/AvailabilityDiagnostics.rst b/clang/docs/HLSL/AvailabilityDiagnostics.rst
new file mode 100644
index 0000000000000..bb9d02f21dde6
--- /dev/null
+++ b/clang/docs/HLSL/AvailabilityDiagnostics.rst
@@ -0,0 +1,137 @@
+=============================
+HLSL Availability Diagnostics
+=============================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+HLSL availability diagnostics emit errors or warnings when unavailable shader APIs are used. Unavailable shader APIs are APIs that are exposed in HLSL code but are not available in the target shader stage or shader model version.
+
+There are three modes of HLSL availability diagnostic:
+
+#. **Default mode** - compiler emits an error when an unavailable API is found in code that is reachable from the shader entry point function or from an exported library function (when compiling a shader library).
+
+#. **Relaxed mode** - same as default mode except the compiler emits a warning. This mode is enabled by ``-Wno-error=hlsl-availability``.
+
+#. **Strict mode** - compiler emits an error when an unavailable API is found in parsed code regardless of whether it can be reached from the shader entry point or exported functions, or not. This mode is enabled by ``-fhlsl-strict-availability``.
+
+Implementation Details
+======================
+
+Environment Parameter
+---------------------
+
+In order to encode API availability based on the shader model version and shader model stage a new ``environment`` parameter was added to the existing Clang ``availability`` attribute.
+
+The values allowed for this parameter are a subset of values allowed as the ``llvm::Triple`` environment component. If the environment parameter is present, the declared availability attribute applies only to targets with the same platform and environment.
+
+Default and Relaxed Diagnostic Modes
+------------------------------------
+
+These modes are implemented in the ``DiagnoseHLSLAvailability`` class in ``SemaHLSL.cpp``, which is invoked after the whole translation unit is parsed (from ``Sema::ActOnEndOfTranslationUnit``). The implementation iterates over all shader entry points and exported library functions in the translation unit and performs an AST traversal of each function body.
+
+When a reference to another function or member method is found (``DeclRefExpr`` or ``MemberExpr``) and it has a body, the AST of the referenced function is also scanned. This chain of AST traversals will reach all of the code that is reachable from the initial shader entry point or exported library function and avoids the need to generate a call graph.
+
+All shader APIs have an availability attribute that specifies the shader model version (and environment, if applicable) when this API was first introduced. When a reference to a function without a definition is found and it has an availability attribute, the version of the attribute is checked against the target shader model version and shader stage (if shader stage context is known), and an appropriate diagnostic is generated as needed.
+
+All shader entry functions have ``HLSLShaderAttr`` attribute that specifies what type of shader this function represents. However, for exported library functions the target shader stage is unknown, so in this case the HLSL API availability will be only checked against the shader model version. It means that for exported library functions the diagnostic of APIs with availability specific to shader stage will be deferred until DXIL linking time.
+
+A list of functions that were already scanned is kept in order to avoid duplicate scans and diagnostics (see ``DiagnoseHLSLAvailability::ScannedDecls``). It might happen that a shader library has multiple shader entry points for different shader stages that all call into the same shared function. It is therefore important to record not just that a function has been scanned, but also in which shader stage context. This is done by using an ``llvm::DenseMap`` that maps ``FunctionDecl *`` to an ``unsigned`` bitmap that represents a set of shader stages (or environments) the function has been scanned for. The ``N``'th bit in the set is set if the function has been scanned in a shader environment whose ``HLSLShaderAttr::ShaderType`` integer value equals ``N``.
+
+The emitted diagnostic messages belong to the ``hlsl-availability`` diagnostic group and are reported as errors by default. With the ``-Wno-error=hlsl-availability`` flag they become warnings, which enables the relaxed HLSL diagnostic mode.
+
+Strict Diagnostic Mode
+----------------------
+
+When strict HLSL availability diagnostic mode is enabled the compiler must report all HLSL API availability issues regardless of code reachability. The implementation of this mode takes advantage of an existing diagnostic scan in ``DiagnoseUnguardedAvailability`` class which is already traversing AST of each function as soon as the function body has been parsed. For HLSL, this pass was only slightly modified, such as making sure diagnostic messages are in the ``hlsl-availability`` group and that availability checks based on shader stage are not included if the shader stage context is unknown.
+
+If the compilation target is a shader library, only availability based on shader model version can be diagnosed during this scan. To diagnose availability based on shader stage, the compiler needs to run the AST traversals implemented in ``DiagnoseHLSLAvailability`` at the end of the translation unit as described above.
+
+As a result, availability based on specific shader stage will only be diagnosed in code that is reachable from a shader entry point or library export function. It also means that function bodies might be scanned multiple times. When that happens, care should be taken not to produce duplicated diagnostics.
+
+========
+Examples
+========
+
+**Note**
+For the example below, the ``WaveActiveCountBits`` API function became available in shader model 6.0 and ``WaveMultiPrefixSum`` in shader model 6.5.
+
+The availability of ``ddx`` function depends on a shader stage. It is available for pixel shaders in shader model 2.1 and higher, for compute, mesh and amplification shaders in shader model 6.6 and higher. For any other shader stages it is not available.
+
+Compute shader example
+======================
+
+.. code-block:: c++
+
+   float unusedFunction(float f) {
+     return ddx(f);
+   }
+
+   [numthreads(4, 4, 1)]
+   void main(uint3 threadId : SV_DispatchThreadId) {
+     float f1 = ddx(threadId.x);
+     float f2 = WaveActiveCountBits(threadId.y == 1.0);
+   }
+
+When compiled as compute shader for shader model version 5.0, Clang will emit the following error by default:
+
+.. code-block:: console
+
+   <>:7:13: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+   <>:8:13: error: 'WaveActiveCountBits' is only available on Shader Model 6.0 or newer
+
+With relaxed diagnostic mode these errors will become warnings.
+
+With strict diagnostic mode, in addition to the 2 errors above Clang will also emit an error for the ``ddx`` call in ``unusedFunction``:
+
+.. code-block:: console
+
+   <>:2:9: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+   <>:7:13: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+   <>:8:13: error: 'WaveActiveCountBits' is only available on Shader Model 6.0 or newer
+
+Shader library example
+======================
+
+.. code-block:: c++
+
+   float myFunction(float f) {
+     return ddx(f);
+   }
+
+   float unusedFunction(float f) {
+     return WaveMultiPrefixSum(f, 1.0);
+   }
+
+   [shader("compute")]
+   [numthreads(4, 4, 1)]
+   void main(uint3 threadId : SV_DispatchThreadId) {
+      float f = 3;
+      float e = myFunction(f);
+   }
+
+   [shader("pixel")]
+   void main() {
+      float f = 3;
+      float e = myFunction(f);
+   }
+
+When compiled as a shader library for shader model version 6.4, Clang will emit the following error by default:
+
+.. code-block:: console
+
+   <>:2:9: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+
+With relaxed diagnostic mode this error will become a warning.
+
+With strict diagnostic mode Clang will also emit errors for availability issues in code that is not used by any of the entry points:
+
+.. code-block:: console
+
+   <>:2:9: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+   <>:6:9: error: 'WaveMultiPrefixSum' is only available on Shader Model 6.5 or newer
+
+Note that ``myFunction`` is reachable from both pixel and compute shader entry points and is therefore scanned twice - once for each context. The diagnostic is emitted only for the compute shader context.
diff --git a/clang/docs/HLSL/HLSLDocs.rst b/clang/docs/HLSL/HLSLDocs.rst
index 97b2425f013b3..1e50a66d984b5 100644
--- a/clang/docs/HLSL/HLSLDocs.rst
+++ b/clang/docs/HLSL/HLSLDocs.rst
@@ -16,3 +16,4 @@ HLSL Design and Implementation
    ResourceTypes
    EntryFunctions
    FunctionCalls
+   AvailabilityDiagnostics

>From c34079c9455515fd1eb4feaa7613a57e88b7209d Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160 at users.noreply.github.com>
Date: Sun, 19 May 2024 11:39:46 -0500
Subject: [PATCH 25/44] [DOCS] ORCv2.rst Typo (#89482)

---
 llvm/docs/ORCv2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/ORCv2.rst b/llvm/docs/ORCv2.rst
index 910ef5b9f3d02..333977a0aaa66 100644
--- a/llvm/docs/ORCv2.rst
+++ b/llvm/docs/ORCv2.rst
@@ -780,7 +780,7 @@ constructs a new ThreadSafeContext value from a std::unique_ptr<LLVMContext>:
     // separate context.
     for (const auto &IRPath : IRPaths) {
       auto Ctx = std::make_unique<LLVMContext>();
-      auto M = std::make_unique<LLVMContext>("M", *Ctx);
+      auto M = std::make_unique<Module>("M", *Ctx);
       CompileLayer.add(MainJD, ThreadSafeModule(std::move(M), std::move(Ctx)));
     }
 

>From 3f33c4c14e79e68007cf1460e4a0e606eb199da5 Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas at microsoft.com>
Date: Sun, 19 May 2024 10:46:12 -0700
Subject: [PATCH 26/44] [Clang][HLSL] Add environment parameter to availability
 attribute (#89809)

Add `environment` parameter to Clang availability attribute. The allowed
values for this parameter are a subset of values allowed in the
`llvm::Triple` environment component. If the `environment` parameter is
present, the declared availability attribute applies only to targets
with the same platform and environment.

This new parameter will be initially used for annotating HLSL functions
for the `shadermodel` platform because in HLSL built-in function
availability can depend not just on the shader model version (mapped to
`llvm::Triple::OSType`) but also on the target shader stage (mapped to
`llvm::Triple::EnvironmentType`). See example in #89802 and
microsoft/hlsl-specs#204 for more details.

The environment parameter is currently supported only for HLSL.

Fixes #89802
---
 clang/include/clang/Basic/Attr.td             |  33 ++++-
 clang/include/clang/Basic/AttrDocs.td         |   5 +
 .../clang/Basic/DiagnosticParseKinds.td       |   2 +
 .../clang/Basic/DiagnosticSemaKinds.td        |  17 ++-
 clang/include/clang/Parse/Parser.h            |   3 +
 clang/include/clang/Sema/ParsedAttr.h         |  42 ++++--
 clang/include/clang/Sema/Sema.h               |  15 +-
 clang/lib/AST/DeclBase.cpp                    |  28 +++-
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  15 +-
 clang/lib/Index/CommentToXML.cpp              |   6 +
 clang/lib/Parse/ParseDecl.cpp                 |  20 ++-
 clang/lib/Sema/SemaAPINotes.cpp               |   3 +-
 clang/lib/Sema/SemaAvailability.cpp           | 128 +++++++++++++-----
 clang/lib/Sema/SemaDecl.cpp                   |   2 +-
 clang/lib/Sema/SemaDeclAttr.cpp               |  48 +++++--
 clang/test/Parser/attr-availability.c         |   2 +
 clang/test/Sema/attr-availability-ios.c       |   1 +
 .../attr-availability-compute.hlsl            |  73 ++++++++++
 .../attr-availability-errors.hlsl             |  11 ++
 .../Availability/attr-availability-mesh.hlsl  |  73 ++++++++++
 .../Availability/attr-availability-pixel.hlsl |  63 +++++++++
 clang/test/SemaHLSL/AvailabilityMarkup.hlsl   |  25 ----
 .../SemaHLSL/WaveBuiltinAvailability.hlsl     |   4 +-
 23 files changed, 508 insertions(+), 111 deletions(-)
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
 delete mode 100644 clang/test/SemaHLSL/AvailabilityMarkup.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 38ee8356583be..7008bea483c87 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -999,7 +999,7 @@ def Availability : InheritableAttr {
               VersionArgument<"deprecated">, VersionArgument<"obsoleted">,
               BoolArgument<"unavailable">, StringArgument<"message">,
               BoolArgument<"strict">, StringArgument<"replacement">,
-              IntArgument<"priority">];
+              IntArgument<"priority">, IdentifierArgument<"environment">];
   let AdditionalMembers =
 [{static llvm::StringRef getPrettyPlatformName(llvm::StringRef Platform) {
     return llvm::StringSwitch<llvm::StringRef>(Platform)
@@ -1019,7 +1019,7 @@ def Availability : InheritableAttr {
              .Case("xros", "visionOS")
              .Case("xros_app_extension", "visionOS (App Extension)")
              .Case("swift", "Swift")
-             .Case("shadermodel", "HLSL ShaderModel")
+             .Case("shadermodel", "Shader Model")
              .Case("ohos", "OpenHarmony OS")
              .Default(llvm::StringRef());
 }
@@ -1059,7 +1059,34 @@ static llvm::StringRef canonicalizePlatformName(llvm::StringRef Platform) {
              .Case("visionos_app_extension", "xros_app_extension")
              .Case("ShaderModel", "shadermodel")
              .Default(Platform);
-} }];
+}
+static llvm::StringRef getPrettyEnviromentName(llvm::StringRef Environment) {
+    return llvm::StringSwitch<llvm::StringRef>(Environment)
+             .Case("pixel", "pixel shader")
+             .Case("vertex", "vertex shader")
+             .Case("geometry", "geometry shader")
+             .Case("hull", "hull shader")
+             .Case("domain", "domain shader")
+             .Case("compute", "compute shader")
+             .Case("mesh", "mesh shader")
+             .Case("amplification", "amplification shader")
+             .Case("library", "shader library")
+             .Default(Environment);
+}
+static llvm::Triple::EnvironmentType getEnvironmentType(llvm::StringRef Environment) {
+    return llvm::StringSwitch<llvm::Triple::EnvironmentType>(Environment)
+             .Case("pixel", llvm::Triple::Pixel)
+             .Case("vertex", llvm::Triple::Vertex)
+             .Case("geometry", llvm::Triple::Geometry)
+             .Case("hull", llvm::Triple::Hull)
+             .Case("domain", llvm::Triple::Domain)
+             .Case("compute", llvm::Triple::Compute)
+             .Case("mesh", llvm::Triple::Mesh)
+             .Case("amplification", llvm::Triple::Amplification)
+             .Case("library", llvm::Triple::Library)
+             .Default(llvm::Triple::UnknownEnvironment);
+}
+}];
   let HasCustomParsing = 1;
   let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Named]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index b48aaf65558ac..54197d588eb45 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1593,6 +1593,11 @@ replacement=\ *string-literal*
   a warning about use of a deprecated declaration. The Fix-It will replace
   the deprecated declaration with the new declaration specified.
 
+environment=\ *identifier*
+  Target environment in which this declaration is available. If present,
+  the availability attribute applies only to targets with the same platform
+  and environment. The parameter is currently supported only in HLSL.
+
 Multiple availability attributes can be placed on a declaration, which may
 correspond to different platforms. For most platforms, the availability
 attribute with the platform corresponding to the target platform will be used;
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 8316845844cb2..46656fc66044d 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1112,6 +1112,8 @@ def err_zero_version : Error<
   "version number must have non-zero major, minor, or sub-minor version">;
 def err_availability_expected_platform : Error<
   "expected a platform name, e.g., 'macos'">;
+def err_availability_expected_environment : Error<
+  "expected an environment name, e.g., 'compute'">;
 
 // objc_bridge_related attribute
 def err_objcbridge_related_expected_related_class : Error<
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 09b1874f9fddd..e3b4186f1b06f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3837,6 +3837,9 @@ def note_cannot_use_trivial_abi_reason : Note<
 // Availability attribute
 def warn_availability_unknown_platform : Warning<
   "unknown platform %0 in availability macro">, InGroup<Availability>;
+def warn_availability_unknown_environment : Warning<
+  "unknown environment %0 in availability macro">, InGroup<Availability>;
+
 def warn_availability_version_ordering : Warning<
   "feature cannot be %select{introduced|deprecated|obsoleted}0 in %1 version "
   "%2 before it was %select{introduced|deprecated|obsoleted}3 in version %4; "
@@ -3867,13 +3870,21 @@ def note_protocol_method : Note<
 def warn_availability_fuchsia_unavailable_minor : Warning<
   "Fuchsia API Level prohibits specifying a minor or sub-minor version">,
   InGroup<Availability>;
+def err_availability_unexpected_parameter: Error<
+  "unexpected parameter '%0' in availability attribute, not permitted in %select{HLSL|C/C++}1">;
 
 def warn_unguarded_availability :
-  Warning<"%0 is only available on %1 %2 or newer">,
+  Warning<"%0 is only available %select{|in %4 environment }3on %1 %2 or newer">,
+  InGroup<UnguardedAvailability>, DefaultIgnore;
+def warn_unguarded_availability_unavailable :
+  Warning<"%0 is unavailable">,
   InGroup<UnguardedAvailability>, DefaultIgnore;
 def warn_unguarded_availability_new :
   Warning<warn_unguarded_availability.Summary>,
   InGroup<UnguardedAvailabilityNew>;
+def warn_unguarded_availability_unavailable_new :
+  Warning<warn_unguarded_availability_unavailable.Summary>,
+  InGroup<UnguardedAvailabilityNew>;
 def note_decl_unguarded_availability_silence : Note<
   "annotate %select{%1|anonymous %1}0 with an availability attribute to silence this warning">;
 def note_unguarded_available_silence : Note<
@@ -5870,8 +5881,8 @@ def note_availability_specified_here : Note<
   "%0 has been explicitly marked "
   "%select{unavailable|deleted|deprecated}1 here">;
 def note_partial_availability_specified_here : Note<
-  "%0 has been marked as being introduced in %1 %2 here, "
-  "but the deployment target is %1 %3">;
+  "%0 has been marked as being introduced in %1 %2 %select{|in %5 environment }4here, "
+  "but the deployment target is %1 %3%select{| %6 environment }4">;
 def note_implicitly_deleted : Note<
   "explicitly defaulted function was implicitly deleted here">;
 def warn_not_enough_argument : Warning<
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 1e796e828b10a..5f04664141d29 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -153,6 +153,9 @@ class Parser : public CodeCompletionHandler {
   /// Identifier for "replacement".
   IdentifierInfo *Ident_replacement;
 
+  /// Identifier for "environment".
+  IdentifierInfo *Ident_environment;
+
   /// Identifiers used by the 'external_source_symbol' attribute.
   IdentifierInfo *Ident_language, *Ident_defined_in,
       *Ident_generated_declaration, *Ident_USR;
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 8368d9ce61466..22cbd0d90ee43 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -40,6 +40,7 @@ class LangOptions;
 class Sema;
 class Stmt;
 class TargetInfo;
+struct IdentifierLoc;
 
 /// Represents information about a change in availability for
 /// an entity, which is part of the encoding of the 'availability'
@@ -68,12 +69,14 @@ struct AvailabilityData {
   AvailabilityChange Changes[NumAvailabilitySlots];
   SourceLocation StrictLoc;
   const Expr *Replacement;
+  const IdentifierLoc *EnvironmentLoc;
 
   AvailabilityData(const AvailabilityChange &Introduced,
                    const AvailabilityChange &Deprecated,
-                   const AvailabilityChange &Obsoleted,
-                   SourceLocation Strict, const Expr *ReplaceExpr)
-    : StrictLoc(Strict), Replacement(ReplaceExpr) {
+                   const AvailabilityChange &Obsoleted, SourceLocation Strict,
+                   const Expr *ReplaceExpr, const IdentifierLoc *EnvironmentLoc)
+      : StrictLoc(Strict), Replacement(ReplaceExpr),
+        EnvironmentLoc(EnvironmentLoc) {
     Changes[IntroducedSlot] = Introduced;
     Changes[DeprecatedSlot] = Deprecated;
     Changes[ObsoletedSlot] = Obsoleted;
@@ -234,7 +237,7 @@ class ParsedAttr final
              const AvailabilityChange &deprecated,
              const AvailabilityChange &obsoleted, SourceLocation unavailable,
              const Expr *messageExpr, Form formUsed, SourceLocation strict,
-             const Expr *replacementExpr)
+             const Expr *replacementExpr, const IdentifierLoc *environmentLoc)
       : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
         NumArgs(1), Invalid(false), UsedAsTypeAttr(false), IsAvailability(true),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
@@ -243,8 +246,9 @@ class ParsedAttr final
         Info(ParsedAttrInfo::get(*this)) {
     ArgsUnion PVal(Parm);
     memcpy(getArgsBuffer(), &PVal, sizeof(ArgsUnion));
-    new (getAvailabilityData()) detail::AvailabilityData(
-        introduced, deprecated, obsoleted, strict, replacementExpr);
+    new (getAvailabilityData())
+        detail::AvailabilityData(introduced, deprecated, obsoleted, strict,
+                                 replacementExpr, environmentLoc);
   }
 
   /// Constructor for objc_bridge_related attributes.
@@ -445,6 +449,12 @@ class ParsedAttr final
     return getAvailabilityData()->Replacement;
   }
 
+  const IdentifierLoc *getEnvironment() const {
+    assert(getParsedKind() == AT_Availability &&
+           "Not an availability attribute");
+    return getAvailabilityData()->EnvironmentLoc;
+  }
+
   const ParsedType &getMatchingCType() const {
     assert(getParsedKind() == AT_TypeTagForDatatype &&
            "Not a type_tag_for_datatype attribute");
@@ -759,11 +769,13 @@ class AttributePool {
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
                      ParsedAttr::Form form, SourceLocation strict,
-                     const Expr *ReplacementExpr) {
+                     const Expr *ReplacementExpr,
+                     IdentifierLoc *EnvironmentLoc) {
     void *memory = allocate(AttributeFactory::AvailabilityAllocSize);
-    return add(new (memory) ParsedAttr(
-        attrName, attrRange, scopeName, scopeLoc, Param, introduced, deprecated,
-        obsoleted, unavailable, MessageExpr, form, strict, ReplacementExpr));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
+                                       Param, introduced, deprecated, obsoleted,
+                                       unavailable, MessageExpr, form, strict,
+                                       ReplacementExpr, EnvironmentLoc));
   }
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
@@ -994,10 +1006,12 @@ class ParsedAttributes : public ParsedAttributesView {
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
                      ParsedAttr::Form form, SourceLocation strict,
-                     const Expr *ReplacementExpr) {
-    ParsedAttr *attr = pool.create(
-        attrName, attrRange, scopeName, scopeLoc, Param, introduced, deprecated,
-        obsoleted, unavailable, MessageExpr, form, strict, ReplacementExpr);
+                     const Expr *ReplacementExpr,
+                     IdentifierLoc *EnvironmentLoc) {
+    ParsedAttr *attr =
+        pool.create(attrName, attrRange, scopeName, scopeLoc, Param, introduced,
+                    deprecated, obsoleted, unavailable, MessageExpr, form,
+                    strict, ReplacementExpr, EnvironmentLoc);
     addAtEnd(attr);
     return attr;
   }
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b16a304960d3f..6c89d275215de 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -39,6 +39,7 @@
 #include "clang/Basic/Cuda.h"
 #include "clang/Basic/DarwinSDKInfo.h"
 #include "clang/Basic/ExpressionTraits.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/OpenCLOptions.h"
 #include "clang/Basic/PragmaKinds.h"
@@ -3580,13 +3581,13 @@ class Sema final : public SemaBase {
   bool CheckAttrTarget(const ParsedAttr &CurrAttr);
   bool CheckAttrNoArgs(const ParsedAttr &CurrAttr);
 
-  AvailabilityAttr *
-  mergeAvailabilityAttr(NamedDecl *D, const AttributeCommonInfo &CI,
-                        IdentifierInfo *Platform, bool Implicit,
-                        VersionTuple Introduced, VersionTuple Deprecated,
-                        VersionTuple Obsoleted, bool IsUnavailable,
-                        StringRef Message, bool IsStrict, StringRef Replacement,
-                        AvailabilityMergeKind AMK, int Priority);
+  AvailabilityAttr *mergeAvailabilityAttr(
+      NamedDecl *D, const AttributeCommonInfo &CI, IdentifierInfo *Platform,
+      bool Implicit, VersionTuple Introduced, VersionTuple Deprecated,
+      VersionTuple Obsoleted, bool IsUnavailable, StringRef Message,
+      bool IsStrict, StringRef Replacement, AvailabilityMergeKind AMK,
+      int Priority, IdentifierInfo *IIEnvironment);
+
   TypeVisibilityAttr *
   mergeTypeVisibilityAttr(Decl *D, const AttributeCommonInfo &CI,
                           TypeVisibilityAttr::VisibilityType Vis);
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 03e1055251c24..65d5eeb6354eb 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -666,12 +666,28 @@ static AvailabilityResult CheckAvailability(ASTContext &Context,
   // Make sure that this declaration has already been introduced.
   if (!A->getIntroduced().empty() &&
       EnclosingVersion < A->getIntroduced()) {
-    if (Message) {
-      Message->clear();
-      llvm::raw_string_ostream Out(*Message);
-      VersionTuple VTI(A->getIntroduced());
-      Out << "introduced in " << PrettyPlatformName << ' '
-          << VTI << HintMessage;
+    IdentifierInfo *IIEnv = A->getEnvironment();
+    StringRef TargetEnv =
+        Context.getTargetInfo().getTriple().getEnvironmentName();
+    StringRef EnvName = AvailabilityAttr::getPrettyEnviromentName(TargetEnv);
+    // Matching environment or no environment on attribute
+    if (!IIEnv || (!TargetEnv.empty() && IIEnv->getName() == TargetEnv)) {
+      if (Message) {
+        Message->clear();
+        llvm::raw_string_ostream Out(*Message);
+        VersionTuple VTI(A->getIntroduced());
+        Out << "introduced in " << PrettyPlatformName << " " << VTI << " "
+            << EnvName << HintMessage;
+      }
+    }
+    // Non-matching environment or no environment on target
+    else {
+      if (Message) {
+        Message->clear();
+        llvm::raw_string_ostream Out(*Message);
+        Out << "not available on " << PrettyPlatformName << " " << EnvName
+            << HintMessage;
+      }
     }
 
     return A->getStrict() ? AR_Unavailable : AR_NotYetIntroduced;
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 3390f0962f67d..bc72e8a00e0d5 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -18,14 +18,21 @@ namespace hlsl {
 
 #define _HLSL_BUILTIN_ALIAS(builtin)                                           \
   __attribute__((clang_builtin_alias(builtin)))
-#define _HLSL_AVAILABILITY(environment, version)                               \
-  __attribute__((availability(environment, introduced = version)))
+#define _HLSL_AVAILABILITY(platform, version)                                  \
+  __attribute__((availability(platform, introduced = version)))
+#define _HLSL_AVAILABILITY_STAGE(platform, version, stage)                     \
+  __attribute__((                                                              \
+      availability(platform, introduced = version, environment = stage)))
 
 #ifdef __HLSL_ENABLE_16_BIT
-#define _HLSL_16BIT_AVAILABILITY(environment, version)                         \
-  __attribute__((availability(environment, introduced = version)))
+#define _HLSL_16BIT_AVAILABILITY(platform, version)                            \
+  __attribute__((availability(platform, introduced = version)))
+#define _HLSL_16BIT_AVAILABILITY_STAGE(platform, version, stage)               \
+  __attribute__((                                                              \
+      availability(platform, introduced = version, environment = stage)))
 #else
 #define _HLSL_16BIT_AVAILABILITY(environment, version)
+#define _HLSL_16BIT_AVAILABILITY_STAGE(environment, version, stage)
 #endif
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Index/CommentToXML.cpp b/clang/lib/Index/CommentToXML.cpp
index 295f3f228ff79..3372fbba43831 100644
--- a/clang/lib/Index/CommentToXML.cpp
+++ b/clang/lib/Index/CommentToXML.cpp
@@ -12,6 +12,7 @@
 #include "clang/AST/Comment.h"
 #include "clang/AST/CommentVisitor.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
 #include "clang/Index/USRGeneration.h"
@@ -1052,6 +1053,11 @@ void CommentASTToXMLConverter::visitFullComment(const FullComment *C) {
       }
       if (AA->getUnavailable())
         Result << "<Unavailable/>";
+
+      IdentifierInfo *Environment = AA->getEnvironment();
+      if (Environment) {
+        Result << "<Environment>" << Environment->getName() << "</Environment>";
+      }
       Result << "</Availability>";
     }
   }
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 2ce8fa98089f6..445d3fd66e387 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -1256,6 +1256,7 @@ void Parser::ParseAvailabilityAttribute(
   enum { Introduced, Deprecated, Obsoleted, Unknown };
   AvailabilityChange Changes[Unknown];
   ExprResult MessageExpr, ReplacementExpr;
+  IdentifierLoc *EnvironmentLoc = nullptr;
 
   // Opening '('.
   BalancedDelimiterTracker T(*this, tok::l_paren);
@@ -1303,6 +1304,7 @@ void Parser::ParseAvailabilityAttribute(
     Ident_message = PP.getIdentifierInfo("message");
     Ident_strict = PP.getIdentifierInfo("strict");
     Ident_replacement = PP.getIdentifierInfo("replacement");
+    Ident_environment = PP.getIdentifierInfo("environment");
   }
 
   // Parse the optional "strict", the optional "replacement" and the set of
@@ -1350,6 +1352,13 @@ void Parser::ParseAvailabilityAttribute(
       continue;
     }
 
+    if (Keyword == Ident_environment) {
+      if (EnvironmentLoc != nullptr) {
+        Diag(KeywordLoc, diag::err_availability_redundant)
+            << Keyword << SourceRange(EnvironmentLoc->Loc);
+      }
+    }
+
     if (Tok.isNot(tok::equal)) {
       Diag(Tok, diag::err_expected_after) << Keyword << tok::equal;
       SkipUntil(tok::r_paren, StopAtSemi);
@@ -1371,6 +1380,15 @@ void Parser::ParseAvailabilityAttribute(
         continue;
       }
     }
+    if (Keyword == Ident_environment) {
+      if (Tok.isNot(tok::identifier)) {
+        Diag(Tok, diag::err_availability_expected_environment);
+        SkipUntil(tok::r_paren, StopAtSemi);
+        return;
+      }
+      EnvironmentLoc = ParseIdentifierLoc();
+      continue;
+    }
 
     // Special handling of 'NA' only when applied to introduced or
     // deprecated.
@@ -1452,7 +1470,7 @@ void Parser::ParseAvailabilityAttribute(
                SourceRange(AvailabilityLoc, T.getCloseLocation()), ScopeName,
                ScopeLoc, Platform, Changes[Introduced], Changes[Deprecated],
                Changes[Obsoleted], UnavailableLoc, MessageExpr.get(), Form,
-               StrictLoc, ReplacementExpr.get());
+               StrictLoc, ReplacementExpr.get(), EnvironmentLoc);
 }
 
 /// Parse the contents of the "external_source_symbol" attribute.
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 443bf162044ff..c80b08e361cfa 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -269,7 +269,8 @@ static void ProcessAPINotes(Sema &S, Decl *D,
               ASTAllocateString(S.Context, Info.UnavailableMsg),
               /*Strict=*/false,
               /*Replacement=*/StringRef(),
-              /*Priority=*/Sema::AP_Explicit);
+              /*Priority=*/Sema::AP_Explicit,
+              /*Environment=*/nullptr);
         },
         [](const Decl *D) {
           return llvm::find_if(D->attrs(), [](const Attr *next) -> bool {
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index 5ebc25317bf37..663b6f35b869d 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -14,20 +14,37 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/DelayedDiagnostic.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaObjC.h"
+#include "llvm/ADT/StringRef.h"
 #include <optional>
 
 using namespace clang;
 using namespace sema;
 
+/// Returns true if \p AA carries no environment, if the target triple's
+/// environment is unknown, or if the attribute's environment type matches it.
+static bool hasMatchingEnvironmentOrNone(const ASTContext &Context,
+                                         const AvailabilityAttr *AA) {
+  const auto TargetEnv = Context.getTargetInfo().getTriple().getEnvironment();
+  IdentifierInfo *AttrEnv = AA->getEnvironment();
+  // Nothing to compare against on one side or the other: treat as a match.
+  if (AttrEnv == nullptr || TargetEnv == llvm::Triple::UnknownEnvironment)
+    return true;
+  return TargetEnv == AvailabilityAttr::getEnvironmentType(AttrEnv->getName());
+}
+
 static const AvailabilityAttr *getAttrForPlatform(ASTContext &Context,
                                                   const Decl *D) {
+  AvailabilityAttr const *PartialMatch = nullptr;
   // Check each AvailabilityAttr to find the one for this platform.
+  // For multiple attributes with the same platform try to find one for this
+  // environment.
   for (const auto *A : D->attrs()) {
     if (const auto *Avail = dyn_cast<AvailabilityAttr>(A)) {
       // FIXME: this is copied from CheckAvailability. We should try to
@@ -46,11 +63,15 @@ static const AvailabilityAttr *getAttrForPlatform(ASTContext &Context,
       StringRef TargetPlatform = Context.getTargetInfo().getPlatformName();
 
       // Match the platform name.
-      if (RealizedPlatform == TargetPlatform)
-        return Avail;
+      if (RealizedPlatform == TargetPlatform) {
+        // Find the best matching attribute for this environment
+        if (hasMatchingEnvironmentOrNone(Context, Avail))
+          return Avail;
+        PartialMatch = Avail;
+      }
     }
   }
-  return nullptr;
+  return PartialMatch;
 }
 
 /// The diagnostic we should emit for \c D, and the declaration that
@@ -118,10 +139,9 @@ ShouldDiagnoseAvailabilityOfDecl(Sema &S, const NamedDecl *D,
 /// whether we should emit a diagnostic for \c K and \c DeclVersion in
 /// the context of \c Ctx. For example, we should emit an unavailable diagnostic
 /// in a deprecated context, but not the other way around.
-static bool
-ShouldDiagnoseAvailabilityInContext(Sema &S, AvailabilityResult K,
-                                    VersionTuple DeclVersion, Decl *Ctx,
-                                    const NamedDecl *OffendingDecl) {
+static bool ShouldDiagnoseAvailabilityInContext(
+    Sema &S, AvailabilityResult K, VersionTuple DeclVersion,
+    const IdentifierInfo *DeclEnv, Decl *Ctx, const NamedDecl *OffendingDecl) {
   assert(K != AR_Available && "Expected an unavailable declaration here!");
 
   // If this was defined using CF_OPTIONS, etc. then ignore the diagnostic.
@@ -140,7 +160,8 @@ ShouldDiagnoseAvailabilityInContext(Sema &S, AvailabilityResult K,
   auto CheckContext = [&](const Decl *C) {
     if (K == AR_NotYetIntroduced) {
       if (const AvailabilityAttr *AA = getAttrForPlatform(S.Context, C))
-        if (AA->getIntroduced() >= DeclVersion)
+        if (AA->getIntroduced() >= DeclVersion &&
+            AA->getEnvironment() == DeclEnv)
           return true;
     } else if (K == AR_Deprecated) {
       if (C->isDeprecated())
@@ -344,10 +365,14 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
   unsigned available_here_select_kind;
 
   VersionTuple DeclVersion;
-  if (const AvailabilityAttr *AA = getAttrForPlatform(S.Context, OffendingDecl))
+  const AvailabilityAttr *AA = getAttrForPlatform(S.Context, OffendingDecl);
+  const IdentifierInfo *IIEnv = nullptr;
+  if (AA) {
     DeclVersion = AA->getIntroduced();
+    IIEnv = AA->getEnvironment();
+  }
 
-  if (!ShouldDiagnoseAvailabilityInContext(S, K, DeclVersion, Ctx,
+  if (!ShouldDiagnoseAvailabilityInContext(S, K, DeclVersion, IIEnv, Ctx,
                                            OffendingDecl))
     return;
 
@@ -355,8 +380,7 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
 
   // The declaration can have multiple availability attributes, we are looking
   // at one of them.
-  const AvailabilityAttr *A = getAttrForPlatform(S.Context, OffendingDecl);
-  if (A && A->isInherited()) {
+  if (AA && AA->isInherited()) {
     for (const Decl *Redecl = OffendingDecl->getMostRecentDecl(); Redecl;
          Redecl = Redecl->getPreviousDecl()) {
       const AvailabilityAttr *AForRedecl =
@@ -376,26 +400,43 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
     // not specified for deployment targets >= to iOS 11 or equivalent or
     // for declarations that were introduced in iOS 11 (macOS 10.13, ...) or
     // later.
-    const AvailabilityAttr *AA =
-        getAttrForPlatform(S.getASTContext(), OffendingDecl);
+    assert(AA != nullptr && "expecting valid availability attribute");
     VersionTuple Introduced = AA->getIntroduced();
+    bool EnvironmentMatchesOrNone =
+        hasMatchingEnvironmentOrNone(S.getASTContext(), AA);
+
+    const TargetInfo &TI = S.getASTContext().getTargetInfo();
+    std::string PlatformName(
+        AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
+    llvm::StringRef TargetEnvironment(AvailabilityAttr::getPrettyEnviromentName(
+        TI.getTriple().getEnvironmentName()));
+    llvm::StringRef AttrEnvironment =
+        AA->getEnvironment() ? AvailabilityAttr::getPrettyEnviromentName(
+                                   AA->getEnvironment()->getName())
+                             : "";
+    bool UseEnvironment =
+        (!AttrEnvironment.empty() && !TargetEnvironment.empty());
 
     bool UseNewWarning = shouldDiagnoseAvailabilityByDefault(
         S.Context, S.Context.getTargetInfo().getPlatformMinVersion(),
         Introduced);
-    unsigned Warning = UseNewWarning ? diag::warn_unguarded_availability_new
-                                     : diag::warn_unguarded_availability;
 
-    std::string PlatformName(AvailabilityAttr::getPrettyPlatformName(
-        S.getASTContext().getTargetInfo().getPlatformName()));
+    unsigned DiagKind =
+        EnvironmentMatchesOrNone
+            ? (UseNewWarning ? diag::warn_unguarded_availability_new
+                             : diag::warn_unguarded_availability)
+            : (UseNewWarning ? diag::warn_unguarded_availability_unavailable_new
+                             : diag::warn_unguarded_availability_unavailable);
 
-    S.Diag(Loc, Warning) << OffendingDecl << PlatformName
-                         << Introduced.getAsString();
+    S.Diag(Loc, DiagKind) << OffendingDecl << PlatformName
+                          << Introduced.getAsString() << UseEnvironment
+                          << TargetEnvironment;
 
     S.Diag(OffendingDecl->getLocation(),
            diag::note_partial_availability_specified_here)
         << OffendingDecl << PlatformName << Introduced.getAsString()
-        << S.Context.getTargetInfo().getPlatformMinVersion().getAsString();
+        << S.Context.getTargetInfo().getPlatformMinVersion().getAsString()
+        << UseEnvironment << AttrEnvironment << TargetEnvironment;
 
     if (const auto *Enclosing = findEnclosingDeclToAnnotate(Ctx)) {
       if (const auto *TD = dyn_cast<TagDecl>(Enclosing))
@@ -772,14 +813,17 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
 
     const AvailabilityAttr *AA =
       getAttrForPlatform(SemaRef.getASTContext(), OffendingDecl);
+    bool EnvironmentMatchesOrNone =
+        hasMatchingEnvironmentOrNone(SemaRef.getASTContext(), AA);
     VersionTuple Introduced = AA->getIntroduced();
 
-    if (AvailabilityStack.back() >= Introduced)
+    if (EnvironmentMatchesOrNone && AvailabilityStack.back() >= Introduced)
       return;
 
     // If the context of this function is less available than D, we should not
     // emit a diagnostic.
-    if (!ShouldDiagnoseAvailabilityInContext(SemaRef, Result, Introduced, Ctx,
+    if (!ShouldDiagnoseAvailabilityInContext(SemaRef, Result, Introduced,
+                                             AA->getEnvironment(), Ctx,
                                              OffendingDecl))
       return;
 
@@ -787,25 +831,39 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
     // not specified for deployment targets >= to iOS 11 or equivalent or
     // for declarations that were introduced in iOS 11 (macOS 10.13, ...) or
     // later.
-    unsigned DiagKind =
-        shouldDiagnoseAvailabilityByDefault(
-            SemaRef.Context,
-            SemaRef.Context.getTargetInfo().getPlatformMinVersion(), Introduced)
-            ? diag::warn_unguarded_availability_new
-            : diag::warn_unguarded_availability;
+    bool UseNewDiagKind = shouldDiagnoseAvailabilityByDefault(
+        SemaRef.Context,
+        SemaRef.Context.getTargetInfo().getPlatformMinVersion(), Introduced);
+
+    const TargetInfo &TI = SemaRef.getASTContext().getTargetInfo();
+    std::string PlatformName(
+        AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
+    llvm::StringRef TargetEnvironment(AvailabilityAttr::getPrettyEnviromentName(
+        TI.getTriple().getEnvironmentName()));
+    llvm::StringRef AttrEnvironment =
+        AA->getEnvironment() ? AvailabilityAttr::getPrettyEnviromentName(
+                                   AA->getEnvironment()->getName())
+                             : "";
+    bool UseEnvironment =
+        (!AttrEnvironment.empty() && !TargetEnvironment.empty());
 
-    std::string PlatformName(AvailabilityAttr::getPrettyPlatformName(
-        SemaRef.getASTContext().getTargetInfo().getPlatformName()));
+    unsigned DiagKind =
+        EnvironmentMatchesOrNone
+            ? (UseNewDiagKind ? diag::warn_unguarded_availability_new
+                              : diag::warn_unguarded_availability)
+            : (UseNewDiagKind
+                   ? diag::warn_unguarded_availability_unavailable_new
+                   : diag::warn_unguarded_availability_unavailable);
 
     SemaRef.Diag(Range.getBegin(), DiagKind)
-        << Range << D << PlatformName << Introduced.getAsString();
+        << Range << D << PlatformName << Introduced.getAsString()
+        << UseEnvironment << TargetEnvironment;
 
     SemaRef.Diag(OffendingDecl->getLocation(),
                  diag::note_partial_availability_specified_here)
         << OffendingDecl << PlatformName << Introduced.getAsString()
-        << SemaRef.Context.getTargetInfo()
-               .getPlatformMinVersion()
-               .getAsString();
+        << SemaRef.Context.getTargetInfo().getPlatformMinVersion().getAsString()
+        << UseEnvironment << AttrEnvironment << TargetEnvironment;
 
     auto FixitDiag =
         SemaRef.Diag(Range.getBegin(), diag::note_unguarded_available_silence)
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index f2b9202255cd4..557fe10619c35 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2879,7 +2879,7 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
         D, *AA, AA->getPlatform(), AA->isImplicit(), AA->getIntroduced(),
         AA->getDeprecated(), AA->getObsoleted(), AA->getUnavailable(),
         AA->getMessage(), AA->getStrict(), AA->getReplacement(), AMK,
-        AA->getPriority());
+        AA->getPriority(), AA->getEnvironment());
   else if (const auto *VA = dyn_cast<VisibilityAttr>(Attr))
     NewAttr = S.mergeVisibilityAttr(D, *VA, VA->getVisibility());
   else if (const auto *VA = dyn_cast<TypeVisibilityAttr>(Attr))
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 30776ff537fb5..ca5938083917f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -26,6 +26,7 @@
 #include "clang/Basic/Cuda.h"
 #include "clang/Basic/DarwinSDKInfo.h"
 #include "clang/Basic/HLSLRuntime.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
@@ -52,6 +53,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
 #include <optional>
 
 using namespace clang;
@@ -2495,7 +2497,7 @@ AvailabilityAttr *Sema::mergeAvailabilityAttr(
     bool Implicit, VersionTuple Introduced, VersionTuple Deprecated,
     VersionTuple Obsoleted, bool IsUnavailable, StringRef Message,
     bool IsStrict, StringRef Replacement, AvailabilityMergeKind AMK,
-    int Priority) {
+    int Priority, IdentifierInfo *Environment) {
   VersionTuple MergedIntroduced = Introduced;
   VersionTuple MergedDeprecated = Deprecated;
   VersionTuple MergedObsoleted = Obsoleted;
@@ -2529,6 +2531,12 @@ AvailabilityAttr *Sema::mergeAvailabilityAttr(
         continue;
       }
 
+      IdentifierInfo *OldEnvironment = OldAA->getEnvironment();
+      if (OldEnvironment != Environment) {
+        ++i;
+        continue;
+      }
+
       // If there is an existing availability attribute for this platform that
       // has a lower priority use the existing one and discard the new
       // attribute.
@@ -2647,7 +2655,7 @@ AvailabilityAttr *Sema::mergeAvailabilityAttr(
       !OverrideOrImpl) {
     auto *Avail = ::new (Context) AvailabilityAttr(
         Context, CI, Platform, Introduced, Deprecated, Obsoleted, IsUnavailable,
-        Message, IsStrict, Replacement, Priority);
+        Message, IsStrict, Replacement, Priority, Environment);
     Avail->setImplicit(Implicit);
     return Avail;
   }
@@ -2706,13 +2714,34 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     }
   }
 
+  if (S.getLangOpts().HLSL && IsStrict)
+    S.Diag(AL.getStrictLoc(), diag::err_availability_unexpected_parameter)
+        << "strict" << /* HLSL */ 0;
+
   int PriorityModifier = AL.isPragmaClangAttribute()
                              ? Sema::AP_PragmaClangAttribute
                              : Sema::AP_Explicit;
+
+  const IdentifierLoc *EnvironmentLoc = AL.getEnvironment();
+  IdentifierInfo *IIEnvironment = nullptr;
+  if (EnvironmentLoc) {
+    if (S.getLangOpts().HLSL) {
+      IIEnvironment = EnvironmentLoc->Ident;
+      if (AvailabilityAttr::getEnvironmentType(
+              EnvironmentLoc->Ident->getName()) ==
+          llvm::Triple::EnvironmentType::UnknownEnvironment)
+        S.Diag(EnvironmentLoc->Loc, diag::warn_availability_unknown_environment)
+            << EnvironmentLoc->Ident;
+    } else {
+      S.Diag(EnvironmentLoc->Loc, diag::err_availability_unexpected_parameter)
+          << "environment" << /* C/C++ */ 1;
+    }
+  }
+
   AvailabilityAttr *NewAttr = S.mergeAvailabilityAttr(
       ND, AL, II, false /*Implicit*/, Introduced.Version, Deprecated.Version,
       Obsoleted.Version, IsUnavailable, Str, IsStrict, Replacement,
-      Sema::AMK_None, PriorityModifier);
+      Sema::AMK_None, PriorityModifier, IIEnvironment);
   if (NewAttr)
     D->addAttr(NewAttr);
 
@@ -2768,8 +2797,8 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       AvailabilityAttr *NewAttr = S.mergeAvailabilityAttr(
           ND, AL, NewII, true /*Implicit*/, NewIntroduced, NewDeprecated,
           NewObsoleted, IsUnavailable, Str, IsStrict, Replacement,
-          Sema::AMK_None,
-          PriorityModifier + Sema::AP_InferredFromOtherPlatform);
+          Sema::AMK_None, PriorityModifier + Sema::AP_InferredFromOtherPlatform,
+          IIEnvironment);
       if (NewAttr)
         D->addAttr(NewAttr);
     }
@@ -2810,8 +2839,8 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       AvailabilityAttr *NewAttr = S.mergeAvailabilityAttr(
           ND, AL, NewII, true /*Implicit*/, NewIntroduced, NewDeprecated,
           NewObsoleted, IsUnavailable, Str, IsStrict, Replacement,
-          Sema::AMK_None,
-          PriorityModifier + Sema::AP_InferredFromOtherPlatform);
+          Sema::AMK_None, PriorityModifier + Sema::AP_InferredFromOtherPlatform,
+          IIEnvironment);
       if (NewAttr)
         D->addAttr(NewAttr);
     }
@@ -2844,7 +2873,7 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
           MinMacCatalystVersion(Deprecated.Version),
           MinMacCatalystVersion(Obsoleted.Version), IsUnavailable, Str,
           IsStrict, Replacement, Sema::AMK_None,
-          PriorityModifier + Sema::AP_InferredFromOtherPlatform);
+          PriorityModifier + Sema::AP_InferredFromOtherPlatform, IIEnvironment);
       if (NewAttr)
         D->addAttr(NewAttr);
     } else if (II->getName() == "macos" && GetSDKInfo() &&
@@ -2887,7 +2916,8 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
               VersionOrEmptyVersion(NewObsoleted), /*IsUnavailable=*/false, Str,
               IsStrict, Replacement, Sema::AMK_None,
               PriorityModifier + Sema::AP_InferredFromOtherPlatform +
-                  Sema::AP_InferredFromOtherPlatform);
+                  Sema::AP_InferredFromOtherPlatform,
+              IIEnvironment);
           if (NewAttr)
             D->addAttr(NewAttr);
         }
diff --git a/clang/test/Parser/attr-availability.c b/clang/test/Parser/attr-availability.c
index aab0f2f3a852a..9d84d9c1df363 100644
--- a/clang/test/Parser/attr-availability.c
+++ b/clang/test/Parser/attr-availability.c
@@ -30,6 +30,8 @@ void f11(void) __attribute__((availability(macosx,message=u"b"))); // expected-w
 
 void f12(void) __attribute__((availability(macosx,message="a" u"b"))); // expected-warning {{encoding prefix 'u' on an unevaluated string literal has no effect}}
 
+void f13(void) __attribute__((availability(shadermodel, introduced = 6.0, environment=pixel))); // expected-error {{unexpected parameter 'environment' in availability attribute, not permitted in C/C++}}
+
 enum E{
     gorf __attribute__((availability(macosx,introduced=8.5, message = 10.0))), // expected-error {{expected string literal for optional message in 'availability' attribute}}
     garf __attribute__((availability(macosx,introduced=8.5, message))), // expected-error {{expected '=' after 'message'}}
diff --git a/clang/test/Sema/attr-availability-ios.c b/clang/test/Sema/attr-availability-ios.c
index b97b7e688cc61..b001e70b5ff5c 100644
--- a/clang/test/Sema/attr-availability-ios.c
+++ b/clang/test/Sema/attr-availability-ios.c
@@ -9,6 +9,7 @@ void f4(int) __attribute__((availability(macosx,introduced=10.1,deprecated=10.3,
 void f5(int) __attribute__((availability(ios,introduced=2.0))) __attribute__((availability(ios,deprecated=3.0))); // expected-note {{'f5' has been explicitly marked deprecated here}}
 void f6(int) __attribute__((availability(ios,deprecated=3.0))); // expected-note {{'f6' has been explicitly marked deprecated here}}
 void f6(int) __attribute__((availability(iOS,introduced=2.0)));
+void f7(int) __attribute__((availability(ios,introduced=2.0, environment=e))); // expected-error {{unexpected parameter 'environment' in availability attribute, not permitted in C/C++}}
 
 void test(void) {
   f0(0); // expected-warning{{'f0' is deprecated: first deprecated in iOS 2.1}}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
new file mode 100644
index 0000000000000..8fa696ea11649
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-compute -fsyntax-only -verify %s
+
+// Platform shader model, no environment parameter
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f1(); // #f1
+
+__attribute__((availability(shadermodel, introduced = 5.1)))
+unsigned f2(); // #f2
+
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f3();
+
+// Platform shader model, environment parameter restricting earlier version,
+// available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f4(); // #f4
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f5();
+
+// Platform shader model, environment parameter restricting earlier version,
+// never available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = mesh)))
+unsigned f6();  // #f6
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f7(); // #f7
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f8();
+
+[numthreads(4,1,1)]
+int main() {
+    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
+    unsigned A = f1(); // #f1_call
+
+    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
+    unsigned B = f2(); // #f2_call
+
+    unsigned C = f3();
+
+    // expected-warning@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f4 {{'f4' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f4_call {{enclose 'f4' in a __builtin_available check to silence this warning}}
+    unsigned D = f4(); // #f4_call
+
+    unsigned E = f5();
+
+    // expected-warning@#f6_call {{'f6' is only available in compute shader environment on Shader Model 6.0 or newer}}
+    // expected-note@#f6 {{'f6' has been marked as being introduced in Shader Model 6.0 in compute shader environment here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f6_call {{enclose 'f6' in a __builtin_available check to silence this warning}}
+    unsigned F = f6(); // #f6_call
+
+    // expected-warning@#f7_call {{'f7' is unavailable}}
+    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 compute shader environment}}
+    // expected-note@#f7_call {{enclose 'f7' in a __builtin_available check to silence this warning}}
+    unsigned G = f7(); // #f7_call
+
+    unsigned H = f8();
+
+    return 0;
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl
new file mode 100644
index 0000000000000..2682eb5fbb5c2
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.5-library -fsyntax-only -verify %s
+
+
+void f1(void) __attribute__((availability(shadermodel, introduced = 6.0, environment="pixel"))); // expected-error {{expected an environment name, e.g., 'compute'}}
+
+void f2(void) __attribute__((availability(shadermodel, introduced = 6.0, environment=pixel, environment=compute))); // expected-error {{redundant 'environment' availability change; only the last specified change will be used}}
+
+void f3(void) __attribute__((availability(shadermodel, strict, introduced = 6.0, environment = mesh))); // expected-error {{unexpected parameter 'strict' in availability attribute, not permitted in HLSL}}
+
+int main() {
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
new file mode 100644
index 0000000000000..40a7ddbb1de98
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-mesh -fsyntax-only -verify %s
+
+// Platform shader model, no environment parameter
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f1(); // #f1
+
+__attribute__((availability(shadermodel, introduced = 5.1)))
+unsigned f2(); // #f2
+
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f3();
+
+// Platform shader model, environment parameter restricting earlier version,
+// available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f4(); // #f4
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f5(); // #f5
+
+// Platform shader model, environment parameter restricting earlier version,
+// never available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = mesh)))
+unsigned f6();  // #f6
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f7(); // #f7
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f8(); // #f8
+
+[numthreads(4,1,1)]
+int main() {
+    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
+    unsigned A = f1(); // #f1_call
+
+    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
+    unsigned B = f2(); // #f2_call
+
+    unsigned C = f3();
+
+    // expected-warning@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f4 {{'f4' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f4_call {{enclose 'f4' in a __builtin_available check to silence this warning}}
+    unsigned D = f4(); // #f4_call
+
+    unsigned E = f5(); // #f5_call
+
+    unsigned F = f6(); // #f6_call
+
+    // expected-warning@#f7_call {{'f7' is only available in mesh shader environment on Shader Model 6.0 or newer}}
+    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 mesh shader environment}}
+    // expected-note@#f7_call {{enclose 'f7' in a __builtin_available check to silence this warning}}
+    unsigned G = f7(); // #f7_call
+
+    // expected-warning@#f8_call {{'f8' is only available in mesh shader environment on Shader Model 6.0 or newer}}
+    // expected-note@#f8 {{'f8' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 mesh shader environment}}
+    // expected-note@#f8_call {{enclose 'f8' in a __builtin_available check to silence this warning}}
+    unsigned H = f8(); // #f8_call
+
+    return 0;
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
new file mode 100644
index 0000000000000..59d09a9cd276f
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-pixel -fsyntax-only -verify %s
+
+// Platform shader model, no environment parameter
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f1(); // #f1
+
+__attribute__((availability(shadermodel, introduced = 5.1)))
+unsigned f2(); // #f2
+
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f3();
+
+// Platform shader model, environment parameter restricting earlier version,
+// available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f4(); // #f4
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f5();
+
+// Platform shader model, environment parameter restricting earlier version,
+// never available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = mesh)))
+unsigned f6();  // #f6
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f7(); // #f7
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f8();
+
+int main() {
+    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
+    unsigned A = f1(); // #f1_call
+
+    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
+    unsigned B = f2(); // #f2_call
+
+    unsigned C = f3();
+
+    unsigned D = f4(); // #f4_call
+
+    unsigned E = f5();
+
+    unsigned F = f6(); // #f6_call
+
+    unsigned G = f7(); // #f7_call
+
+    unsigned H = f8();
+
+    return 0;
+}
diff --git a/clang/test/SemaHLSL/AvailabilityMarkup.hlsl b/clang/test/SemaHLSL/AvailabilityMarkup.hlsl
deleted file mode 100644
index b883957af0871..0000000000000
--- a/clang/test/SemaHLSL/AvailabilityMarkup.hlsl
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-library -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.0)))
-unsigned fn6_0(); // #fn6_0
-
-__attribute__((availability(shadermodel, introduced = 5.1)))
-unsigned fn5_1(); // #fn5_1
-
-__attribute__((availability(shadermodel, introduced = 5.0)))
-unsigned fn5_0();
-
-void fn() {
-    // expected-warning@#fn6_0_site {{'fn6_0' is only available on HLSL ShaderModel 6.0 or newer}}
-    // expected-note@#fn6_0 {{'fn6_0' has been marked as being introduced in HLSL ShaderModel 6.0 here, but the deployment target is HLSL ShaderModel 5.0}}
-    // expected-note@#fn6_0_site {{enclose 'fn6_0' in a __builtin_available check to silence this warning}}
-    unsigned A = fn6_0(); // #fn6_0_site
-
-    // expected-warning@#fn5_1_site {{'fn5_1' is only available on HLSL ShaderModel 5.1 or newer}}
-    // expected-note@#fn5_1 {{'fn5_1' has been marked as being introduced in HLSL ShaderModel 5.1 here, but the deployment target is HLSL ShaderModel 5.0}}
-    // expected-note@#fn5_1_site {{enclose 'fn5_1' in a __builtin_available check to silence this warning}}
-    unsigned B = fn5_1(); // #fn5_1_site
-
-    unsigned C = fn5_0();
-}
-
diff --git a/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl b/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
index 0e45edc6a4c86..185b79be37be5 100644
--- a/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
+++ b/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
@@ -2,8 +2,8 @@
 // WaveActiveCountBits is unavailable before ShaderModel 6.0.
 
 unsigned foo(bool b) {
-    // expected-warning@#site {{'WaveActiveCountBits' is only available on HLSL ShaderModel 6.0 or newer}}
-    // expected-note@hlsl/hlsl_intrinsics.h:* {{'WaveActiveCountBits' has been marked as being introduced in HLSL ShaderModel 6.0 here, but the deployment target is HLSL ShaderModel 5.0}}
+    // expected-warning@#site {{'WaveActiveCountBits' is only available on Shader Model 6.0 or newer}}
+    // expected-note@hlsl/hlsl_intrinsics.h:* {{'WaveActiveCountBits' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
     // expected-note@#site {{enclose 'WaveActiveCountBits' in a __builtin_available check to silence this warning}}
     return hlsl::WaveActiveCountBits(b); // #site
 }

>From 0cd2bf3521a52f255c2b0d466f2f48f15d4a89a9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 19 May 2024 20:56:21 +0200
Subject: [PATCH 27/44] ValueTracking: Correct undef handling for constant FP
 vectors (#92557)

Treat undef as unknown, and poison as ignorable.
---
 llvm/lib/Analysis/ValueTracking.cpp           |   2 +-
 .../AMDGPU/amdgpu-codegenprepare-fdiv.ll      | 130 +++++++++---------
 llvm/test/Transforms/Attributor/nofpclass.ll  |   2 +-
 llvm/test/Transforms/InstCombine/and-fcmp.ll  |  27 +++-
 llvm/test/Transforms/InstCombine/or-fcmp.ll   |  49 ++++++-
 5 files changed, 135 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index e8c5f9b3dc25d..2d1486d252c3e 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -4751,7 +4751,7 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
         Known = KnownFPClass();
         return;
       }
-      if (isa<UndefValue>(Elt))
+      if (isa<PoisonValue>(Elt))
         continue;
       auto *CElt = dyn_cast<ConstantFP>(Elt);
       if (!CElt) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 6bda962d1b9ca..b69afa3ab1f3d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -2151,7 +2151,7 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 ; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
@@ -2222,9 +2222,9 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 ; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP19]])
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]])
@@ -2281,7 +2281,7 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 ; DAZ-NEXT:    [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0
 ; DAZ-NEXT:    [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1
 ; DAZ-NEXT:    [[TMP19:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]])
-; DAZ-NEXT:    [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0
 ; DAZ-NEXT:    [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1
 ; DAZ-NEXT:    [[TMP23:%.*]] = fmul contract float [[TMP21]], [[TMP19]]
@@ -2313,7 +2313,7 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 
   ; Matches the rsq instruction accuracy
   %sqrt.md.1ulp.undef = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
-  %md.1ulp.undef = fdiv contract <2 x float> <float 1.0, float undef>, %sqrt.md.1ulp.undef, !fpmath !2
+  %md.1ulp.undef = fdiv contract <2 x float> <float 1.0, float poison>, %sqrt.md.1ulp.undef, !fpmath !2
   store volatile <2 x float> %md.1ulp.undef, ptr addrspace(1) %out, align 4
 
   ; Test mismatched metadata/flags between the sqrt and fdiv
@@ -3121,7 +3121,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
@@ -3170,9 +3170,9 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]])
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
 ; IEEE-BADFREXP-NEXT:    [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]]
 ; IEEE-BADFREXP-NEXT:    [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]])
@@ -3217,7 +3217,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; DAZ-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
 ; DAZ-NEXT:    [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1
 ; DAZ-NEXT:    [[TMP32:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]])
-; DAZ-NEXT:    [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP33]], 0
 ; DAZ-NEXT:    [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP33]], 1
 ; DAZ-NEXT:    [[TMP36:%.*]] = fmul contract float [[TMP34]], [[TMP32]]
@@ -3230,7 +3230,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3272,7 +3272,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
@@ -3321,9 +3321,9 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]])
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
 ; IEEE-BADFREXP-NEXT:    [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]]
 ; IEEE-BADFREXP-NEXT:    [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]])
@@ -3361,7 +3361,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; DAZ-NEXT:    [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0
 ; DAZ-NEXT:    [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1
 ; DAZ-NEXT:    [[TMP25:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP23]])
-; DAZ-NEXT:    [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
 ; DAZ-NEXT:    [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1
 ; DAZ-NEXT:    [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]]
@@ -3374,7 +3374,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg)
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3382,7 +3382,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float>
 ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(
 ; IEEE-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath [[META2:![0-9]+]]
-; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; IEEE-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
 ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(
@@ -3399,11 +3399,11 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float>
 ; DAZ-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1
 ; DAZ-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2
 ; DAZ-NEXT:    [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3
-; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract afn <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom
+  %partial.rsq = fdiv contract afn <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom
   ret <4 x float> %partial.rsq
 }
 
@@ -3411,7 +3411,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(<4 x fl
 ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(
 ; IEEE-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath [[META2]]
-; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; IEEE-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
 ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(
@@ -3428,11 +3428,11 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(<4 x fl
 ; DAZ-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1
 ; DAZ-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2
 ; DAZ-NEXT:    [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3
-; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom
   ret <4 x float> %partial.rsq
 }
 
@@ -3471,7 +3471,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
@@ -3517,9 +3517,9 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]])
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]])
@@ -3553,7 +3553,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; DAZ-NEXT:    [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
 ; DAZ-NEXT:    [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1
 ; DAZ-NEXT:    [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]])
-; DAZ-NEXT:    [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0
 ; DAZ-NEXT:    [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1
 ; DAZ-NEXT:    [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]]
@@ -3566,7 +3566,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg)
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3607,7 +3607,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = sub i32 0, [[TMP30]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]])
-; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]]
+; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float poison, [[TMP33]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2
@@ -3650,7 +3650,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = sub i32 0, [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]])
-; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]]
+; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float poison, [[TMP33]]
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2
@@ -3681,7 +3681,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; DAZ-NEXT:    [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]])
 ; DAZ-NEXT:    [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]]
 ; DAZ-NEXT:    [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]])
-; DAZ-NEXT:    [[TMP22:%.*]] = fmul arcp contract float undef, [[TMP21]]
+; DAZ-NEXT:    [[TMP22:%.*]] = fmul arcp contract float poison, [[TMP21]]
 ; DAZ-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i64 0
 ; DAZ-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP18]], i64 1
 ; DAZ-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP20]], i64 2
@@ -3689,7 +3689,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3697,7 +3697,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(<4 x fl
 ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(
 ; IEEE-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath [[META2]]
-; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; IEEE-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
 ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(
@@ -3714,11 +3714,11 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(<4 x fl
 ; DAZ-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1
 ; DAZ-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2
 ; DAZ-NEXT:    [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3
-; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom
+  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom
   ret <4 x float> %partial.rsq
 }
 
@@ -3755,7 +3755,7 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = sub i32 0, [[TMP27]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP26]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]])
-; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float undef, [[TMP30]]
+; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float poison, [[TMP30]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP17]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP24]], i64 2
@@ -3794,7 +3794,7 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = sub i32 0, [[TMP27]]
 ; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP26]])
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]])
-; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float undef, [[TMP30]]
+; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float poison, [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP17]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP24]], i64 2
@@ -3813,24 +3813,24 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; DAZ-NEXT:    [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]])
 ; DAZ-NEXT:    [[TMP9:%.*]] = fmul arcp float 4.000000e+00, [[TMP8]]
 ; DAZ-NEXT:    [[TMP10:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]])
-; DAZ-NEXT:    [[TMP11:%.*]] = fmul arcp float undef, [[TMP10]]
+; DAZ-NEXT:    [[TMP11:%.*]] = fmul arcp float poison, [[TMP10]]
 ; DAZ-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
 ; DAZ-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP7]], i64 1
 ; DAZ-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP9]], i64 2
 ; DAZ-NEXT:    [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 3
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RCP]]
 ;
-  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %arg, !fpmath !2
+  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %arg, !fpmath !2
   ret <4 x float> %partial.rcp
 }
 
 define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp_correct(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp_correct(
 ; CHECK-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[PARTIAL_RCP:%.*]] = fdiv arcp <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[ARG]]
+; CHECK-NEXT:    [[PARTIAL_RCP:%.*]] = fdiv arcp <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[ARG]]
 ; CHECK-NEXT:    ret <4 x float> [[PARTIAL_RCP]]
 ;
-  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %arg
+  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %arg
   ret <4 x float> %partial.rcp
 }
 
@@ -3841,7 +3841,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-GOODFREXP-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00)
 ; IEEE-GOODFREXP-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00)
 ; IEEE-GOODFREXP-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00)
-; IEEE-GOODFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2
@@ -3857,21 +3857,21 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-GOODFREXP-NEXT:    [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-GOODFREXP-NEXT:    [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-GOODFREXP-NEXT:    [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0
-; IEEE-GOODFREXP-NEXT:    [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP22:%.*]] = sub i32 0, [[TMP27]]
-; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]])
-; IEEE-GOODFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP22]])
-; IEEE-GOODFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
+; IEEE-GOODFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1
+; IEEE-GOODFREXP-NEXT:    [[TMP22:%.*]] = sub i32 0, [[TMP50]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP51:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP22]])
+; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP51]]
-; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP50]]
+; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]])
+; IEEE-GOODFREXP-NEXT:    [[TMP52:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
+; IEEE-GOODFREXP-NEXT:    [[TMP53:%.*]] = extractvalue { float, i32 } [[TMP52]], 0
+; IEEE-GOODFREXP-NEXT:    [[TMP54:%.*]] = extractvalue { float, i32 } [[TMP52]], 1
+; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP53]], [[TMP28]]
+; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = sub i32 [[TMP54]], [[TMP31]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
@@ -3894,7 +3894,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-BADFREXP-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00)
 ; IEEE-BADFREXP-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00)
 ; IEEE-BADFREXP-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00)
-; IEEE-BADFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
 ; IEEE-BADFREXP-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2
@@ -3910,20 +3910,20 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-BADFREXP-NEXT:    [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
 ; IEEE-BADFREXP-NEXT:    [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
 ; IEEE-BADFREXP-NEXT:    [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-BADFREXP-NEXT:    [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-BADFREXP-NEXT:    [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0
+; IEEE-BADFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
+; IEEE-BADFREXP-NEXT:    [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]])
 ; IEEE-BADFREXP-NEXT:    [[TMP22:%.*]] = sub i32 0, [[TMP21]]
-; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]])
-; IEEE-BADFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP22]])
-; IEEE-BADFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
-; IEEE-BADFREXP-NEXT:    [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]])
 ; IEEE-BADFREXP-NEXT:    [[TMP50:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP22]])
+; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
-; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP50]]
+; IEEE-BADFREXP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]])
+; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]])
+; IEEE-BADFREXP-NEXT:    [[TMP51:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
+; IEEE-BADFREXP-NEXT:    [[TMP52:%.*]] = extractvalue { float, i32 } [[TMP51]], 0
+; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
+; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP52]], [[TMP28]]
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]]
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]])
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
@@ -3947,7 +3947,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; DAZ-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00)
 ; DAZ-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00)
 ; DAZ-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00)
-; DAZ-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef)
+; DAZ-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float poison)
 ; DAZ-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
 ; DAZ-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1
 ; DAZ-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2
@@ -3963,7 +3963,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; DAZ-NEXT:    [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0
 ; DAZ-NEXT:    [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1
 ; DAZ-NEXT:    [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP16]])
-; DAZ-NEXT:    [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0
 ; DAZ-NEXT:    [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1
 ; DAZ-NEXT:    [[TMP22:%.*]] = fmul contract float [[TMP20]], [[TMP18]]
@@ -3985,8 +3985,8 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
-  %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> <float 4.0, float 2.0, float 8.0, float undef>), !fpmath !2
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float undef, float 2.0>, %sqrt, !fpmath !2
+  %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> <float 4.0, float 2.0, float 8.0, float poison>), !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float poison, float 2.0>, %sqrt, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll
index 5945fc5e7b0bf..b38f9bae50ccc 100644
--- a/llvm/test/Transforms/Attributor/nofpclass.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass.ll
@@ -114,7 +114,7 @@ define <2 x double> @returned_strange_constant_vector_elt() {
 
 ; Test a vector element that's undef
 define <3 x double> @returned_undef_constant_vector_elt() {
-; CHECK-LABEL: define nofpclass(nan inf sub norm) <3 x double> @returned_undef_constant_vector_elt() {
+; CHECK-LABEL: define <3 x double> @returned_undef_constant_vector_elt() {
 ; CHECK-NEXT:    call void @unknown()
 ; CHECK-NEXT:    ret <3 x double> <double -0.000000e+00, double 0.000000e+00, double undef>
 ;
diff --git a/llvm/test/Transforms/InstCombine/and-fcmp.ll b/llvm/test/Transforms/InstCombine/and-fcmp.ll
index f1ae2e74ac2e4..c163802fcc935 100644
--- a/llvm/test/Transforms/InstCombine/and-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/and-fcmp.ll
@@ -39,7 +39,9 @@ define i1 @PR1738_logical_noundef(double %x, double noundef %y) {
 
 define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @PR1738_vec_undef(
-; CHECK-NEXT:    [[OR:%.*]] = fcmp ord <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ord <2 x double> [[X:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ord <2 x double> [[Y:%.*]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[OR:%.*]] = and <2 x i1> [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    ret <2 x i1> [[OR]]
 ;
   %cmp1 = fcmp ord <2 x double> %x, <double 0.0, double undef>
@@ -48,6 +50,17 @@ define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %or
 }
 
+define <2 x i1> @PR1738_vec_poison(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @PR1738_vec_poison(
+; CHECK-NEXT:    [[OR:%.*]] = fcmp ord <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[OR]]
+;
+  %cmp1 = fcmp ord <2 x double> %x, <double 0.0, double poison>
+  %cmp2 = fcmp ord <2 x double> %y, <double poison, double 0.0>
+  %or = and <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %or
+}
+
 define i1 @PR41069(i1 %z, float %c, float %d) {
 ; CHECK-LABEL: @PR41069(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord float [[D:%.*]], [[C:%.*]]
@@ -111,8 +124,10 @@ define i1 @PR41069_commute_logical(i1 %z, float %c, float %d) {
 define <2 x i1> @PR41069_vec(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
 ; CHECK-LABEL: @PR41069_vec(
 ; CHECK-NEXT:    [[ORD1:%.*]] = fcmp ord <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord <2 x double> [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[TMP1]], [[ORD1]]
+; CHECK-NEXT:    [[ORD2:%.*]] = fcmp ord <2 x double> [[C:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i1> [[ORD1]], [[ORD2]]
+; CHECK-NEXT:    [[ORD3:%.*]] = fcmp ord <2 x double> [[D:%.*]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[AND]], [[ORD3]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %ord1 = fcmp ord <2 x double> %a, %b
@@ -126,8 +141,10 @@ define <2 x i1> @PR41069_vec(<2 x double> %a, <2 x double> %b, <2 x double> %c,
 define <2 x i1> @PR41069_vec_commute(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
 ; CHECK-LABEL: @PR41069_vec_commute(
 ; CHECK-NEXT:    [[ORD1:%.*]] = fcmp ord <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord <2 x double> [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[TMP1]], [[ORD1]]
+; CHECK-NEXT:    [[ORD2:%.*]] = fcmp ord <2 x double> [[C:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i1> [[ORD1]], [[ORD2]]
+; CHECK-NEXT:    [[ORD3:%.*]] = fcmp ord <2 x double> [[D:%.*]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[ORD3]], [[AND]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %ord1 = fcmp ord <2 x double> %a, %b
diff --git a/llvm/test/Transforms/InstCombine/or-fcmp.ll b/llvm/test/Transforms/InstCombine/or-fcmp.ll
index ffd927672b413..285b2d958abd8 100644
--- a/llvm/test/Transforms/InstCombine/or-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/or-fcmp.ll
@@ -28,7 +28,9 @@ define i1 @PR1738_logical(double %x, double %y) {
 
 define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @PR1738_vec_undef(
-; CHECK-NEXT:    [[OR:%.*]] = fcmp uno <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp uno <2 x double> [[X:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp uno <2 x double> [[Y:%.*]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i1> [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    ret <2 x i1> [[OR]]
 ;
   %cmp1 = fcmp uno <2 x double> %x, <double 0.0, double undef>
@@ -37,6 +39,17 @@ define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %or
 }
 
+define <2 x i1> @PR1738_vec_poison(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @PR1738_vec_poison(
+; CHECK-NEXT:    [[OR:%.*]] = fcmp uno <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[OR]]
+;
+  %cmp1 = fcmp uno <2 x double> %x, <double 0.0, double poison>
+  %cmp2 = fcmp uno <2 x double> %y, <double poison, double 0.0>
+  %or = or <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %or
+}
+
 define i1 @PR41069(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: @PR41069(
 ; CHECK-NEXT:    [[UNO1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
@@ -105,26 +118,56 @@ define i1 @PR41069_commute_logical(double %a, double %b, double %c, double %d) {
 
 define <2 x i1> @PR41069_vec(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
 ; CHECK-LABEL: @PR41069_vec(
+; CHECK-NEXT:    [[UNO1:%.*]] = fcmp uno <2 x float> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i1> [[UNO1]], [[Z:%.*]]
+; CHECK-NEXT:    [[UNO2:%.*]] = fcmp uno <2 x float> [[D:%.*]], <float 0.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[OR]], [[UNO2]]
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %uno1 = fcmp uno <2 x float> %c, zeroinitializer
+  %or = or <2 x i1> %uno1, %z
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %r = or <2 x i1> %or, %uno2
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @PR41069_vec_poison(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @PR41069_vec_poison(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp uno <2 x float> [[D:%.*]], [[C:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[TMP1]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %uno1 = fcmp uno <2 x float> %c, zeroinitializer
   %or = or <2 x i1> %uno1, %z
-  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float poison>
   %r = or <2 x i1> %or, %uno2
   ret <2 x i1> %r
 }
 
 define <2 x i1> @PR41069_vec_commute(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
 ; CHECK-LABEL: @PR41069_vec_commute(
+; CHECK-NEXT:    [[UNO1:%.*]] = fcmp uno <2 x float> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i1> [[UNO1]], [[Z:%.*]]
+; CHECK-NEXT:    [[UNO2:%.*]] = fcmp uno <2 x float> [[D:%.*]], <float 0.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[UNO2]], [[OR]]
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %uno1 = fcmp uno <2 x float> %c, zeroinitializer
+  %or = or <2 x i1> %uno1, %z
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %r = or <2 x i1> %uno2, %or
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @PR41069_vec_commute_poison(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @PR41069_vec_commute_poison(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp uno <2 x float> [[D:%.*]], [[C:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[TMP1]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %uno1 = fcmp uno <2 x float> %c, zeroinitializer
   %or = or <2 x i1> %uno1, %z
-  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float poison>
   %r = or <2 x i1> %uno2, %or
   ret <2 x i1> %r
 }

>From 878642954f5178c55b337afe2bff4e6a92a67a5b Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Sun, 19 May 2024 13:23:04 -0700
Subject: [PATCH 28/44] [BOLT] Fix preserved offset in fixDoubleJumps (#92485)

---
 bolt/lib/Passes/BinaryPasses.cpp       | 14 +++++++++-----
 bolt/test/X86/bb-with-two-tail-calls.s |  8 ++++----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 867f977cebca7..298ba29ff5b3f 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -674,7 +674,8 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
   for (BinaryBasicBlock &BB : Function) {
     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
-                             const MCSymbol *SuccSym) {
+                             const MCSymbol *SuccSym,
+                             std::optional<uint32_t> Offset) {
       // Ignore infinite loop jumps or fallthrough tail jumps.
       if (Pred == Succ || Succ == &BB)
         return false;
@@ -715,9 +716,11 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
           Pred->removeSuccessor(&BB);
           Pred->eraseInstruction(Pred->findInstruction(Branch));
           Pred->addTailCallInstruction(SuccSym);
-          MCInst *TailCall = Pred->getLastNonPseudoInstr();
-          assert(TailCall);
-          MIB->setOffset(*TailCall, BB.getOffset());
+          if (Offset) {
+            MCInst *TailCall = Pred->getLastNonPseudoInstr();
+            assert(TailCall);
+            MIB->setOffset(*TailCall, *Offset);
+          }
         } else {
           return false;
         }
@@ -760,7 +763,8 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
       if (Pred->getSuccessor() == &BB ||
           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
           Pred->getConditionalSuccessor(false) == &BB)
-        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
+        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
+            MarkInvalid)
           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                        BB.isEntryPoint());
     }
diff --git a/bolt/test/X86/bb-with-two-tail-calls.s b/bolt/test/X86/bb-with-two-tail-calls.s
index bb2b0cd4cc23a..b6703e352ff4b 100644
--- a/bolt/test/X86/bb-with-two-tail-calls.s
+++ b/bolt/test/X86/bb-with-two-tail-calls.s
@@ -1,8 +1,6 @@
 # This reproduces a bug with dynostats when trying to compute branch stats
 # at a block with two tails calls (one conditional and one unconditional).
 
-# REQUIRES: system-linux
-
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
 # RUN:   %s -o %t.o
 # RUN: link_fdata %s %t.o %t.fdata
@@ -13,7 +11,7 @@
 # CHECK-NOT: Assertion `BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"' failed.
 # Two tail calls in the same basic block after SCTC:
 # CHECK:         {{.*}}:   ja      {{.*}} # TAILCALL # Offset: 7 # CTCTakenCount: 4
-# CHECK-NEXT:    {{.*}}:   jmp     {{.*}} # TAILCALL # Offset: 12
+# CHECK-NEXT:    {{.*}}:   jmp     {{.*}} # TAILCALL # Offset: 13
 
   .globl _start
 _start:
@@ -23,7 +21,9 @@ a:  ja b
 x:  ret
 # FDATA: 1 _start #a# 1 _start #b# 2 4
 b:  jmp e
-c:  jmp f
+c:
+    .nops 1
+    jmp f
 
   .globl e
 e:

>From fb2c6597e39e9e1a775525ea0236b2f89e46acff Mon Sep 17 00:00:00 2001
From: Leon Clark <PeddleSpam at users.noreply.github.com>
Date: Sun, 19 May 2024 21:45:24 +0100
Subject: [PATCH 29/44] [AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16
 (#88512)

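For i8 and i16, lower ctlz_zero_undef by any-extending the operand to i32
and shifting it left (SHL) by the number of extension bits, instead of
zero-extending it and subtracting the extra leading zeros from the result.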

Related to [77615](https://github.com/llvm/llvm-project/pull/77615).

---------

Co-authored-by: Leon Clark <leoclark at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  22 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  44 +++-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |   2 +
 .../GlobalISel/legalize-ctlz-zero-undef.mir   |  47 ++--
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 232 +++++++-----------
 5 files changed, 169 insertions(+), 178 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d35a022ad6806..980e58510ceb7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3117,20 +3117,30 @@ static bool isCttzOpc(unsigned Opc) {
 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                                SelectionDAG &DAG) const {
   auto SL = SDLoc(Op);
+  auto Opc = Op.getOpcode();
   auto Arg = Op.getOperand(0u);
   auto ResultVT = Op.getValueType();
 
   if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
     return {};
 
-  assert(isCtlzOpc(Op.getOpcode()));
+  assert(isCtlzOpc(Opc));
   assert(ResultVT == Arg.getValueType());
 
-  auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
-  auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
-  auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
-  NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
-  NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
+  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
+  SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
+  SDValue NewOp;
+
+  if (Opc == ISD::CTLZ_ZERO_UNDEF) {
+    NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
+    NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
+    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+  } else {
+    NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+    NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
+  }
+
   return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bd7bf78c4c0bd..15a4b6796880f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1270,13 +1270,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .custom();
 
   // The 64-bit versions produce 32-bit results, but only on the SALU.
-  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
-    .legalFor({{S32, S32}, {S32, S64}})
-    .clampScalar(0, S32, S32)
-    .clampScalar(1, S32, S64)
-    .scalarize(0)
-    .widenScalarToNextPow2(0, 32)
-    .widenScalarToNextPow2(1, 32);
+  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+      .legalFor({{S32, S32}, {S32, S64}})
+      .customIf(scalarNarrowerThan(1, 32))
+      .clampScalar(0, S32, S32)
+      .clampScalar(1, S32, S64)
+      .scalarize(0)
+      .widenScalarToNextPow2(0, 32)
+      .widenScalarToNextPow2(1, 32);
+
+  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
+      .legalFor({{S32, S32}, {S32, S64}})
+      .clampScalar(0, S32, S32)
+      .clampScalar(1, S32, S64)
+      .scalarize(0)
+      .widenScalarToNextPow2(0, 32)
+      .widenScalarToNextPow2(1, 32);
 
   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
   // RegBankSelect.
@@ -2128,6 +2137,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_CTLZ:
   case TargetOpcode::G_CTTZ:
     return legalizeCTLZ_CTTZ(MI, MRI, B);
+  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
     return legalizeFPTruncRound(MI, B);
   case TargetOpcode::G_STACKSAVE:
@@ -4145,6 +4156,25 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
+                                                  MachineRegisterInfo &MRI,
+                                                  MachineIRBuilder &B) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  LLT SrcTy = MRI.getType(Src);
+  TypeSize NumBits = SrcTy.getSizeInBits();
+
+  assert(NumBits < 32u);
+
+  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
+  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
+  auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt);
+  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
+  B.buildTrunc(Dst, Ctlz);
+  MI.eraseFromParent();
+  return true;
+}
+
 // Check that this is a G_XOR x, -1
 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
   if (MI.getOpcode() != TargetOpcode::G_XOR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8..4b1d821dadc21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
   bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const;
+  bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
+                               MachineIRBuilder &B) const;
 
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       const ArgDescriptor *Arg,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index fed277d7d10d0..7748b481cf5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -81,14 +81,12 @@ body: |
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s16) = G_TRUNC %0
     %2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -149,18 +147,15 @@ body: |
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
-    ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -179,14 +174,12 @@ body: |
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s7) = G_TRUNC %0
     %2:_(s7) = G_CTLZ_ZERO_UNDEF %1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 54adde38d6d22..d94a27e8c0200 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -322,9 +322,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s2, 0xff
-; SI-NEXT:    s_flbit_i32_b32 s2, s2
-; SI-NEXT:    s_sub_i32 s4, s2, 24
+; SI-NEXT:    s_lshl_b32 s2, s2, 24
+; SI-NEXT:    s_flbit_i32_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -335,9 +334,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s2, s2, 24
 ; VI-NEXT:    s_flbit_i32_b32 s2, s2
-; VI-NEXT:    s_sub_i32 s2, s2, 24
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -357,13 +355,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, 0.0,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT T0.W, T0.X,
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT:    -24(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:     LSHL * T1.W, PS, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
 ; EG-NEXT:     LSHL T0.X, PV.W, PS,
 ; EG-NEXT:     LSHL * T0.W, literal.x, PS,
@@ -379,9 +377,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s0, s4, 24
 ; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -399,9 +396,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s2, 0xffff
-; SI-NEXT:    s_flbit_i32_b32 s2, s2
-; SI-NEXT:    s_add_i32 s4, s2, -16
+; SI-NEXT:    s_lshl_b32 s2, s2, 16
+; SI-NEXT:    s_flbit_i32_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -434,13 +430,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, 0.0,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT T0.W, T0.X,
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT:    -16(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:     LSHL * T1.W, PS, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
 ; EG-NEXT:     LSHL T0.X, PV.W, PS,
 ; EG-NEXT:     LSHL * T0.W, literal.x, PS,
@@ -456,9 +452,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s0, s4, 16
 ; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 16
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -598,8 +593,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
@@ -613,8 +608,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -626,7 +621,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -635,10 +630,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
-; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    -24(nan), 3(4.203895e-45)
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
@@ -659,8 +655,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 24, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -693,8 +688,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -729,7 +724,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -738,10 +733,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
-; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    -16(nan), 3(4.203895e-45)
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
@@ -764,8 +760,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1110,8 +1105,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1124,8 +1119,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; VI-NEXT:    v_ffbh_u32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -1144,13 +1139,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT T0.W, T0.X,
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT:    -24(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:     LSHL * T1.W, PS, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
 ; EG-NEXT:     LSHL T0.X, PV.W, PS,
 ; EG-NEXT:     LSHL * T0.W, literal.x, PS,
@@ -1172,8 +1167,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1709,12 +1703,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2193,9 +2186,8 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
   ret i7 %ctlz
@@ -2286,9 +2278,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0x3ffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s0, s4, 14
 ; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 14
 ; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 0x3ffff
 ; GFX9-GISEL-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
@@ -2326,9 +2317,8 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 14, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
   ret i18 %ctlz
@@ -2365,12 +2355,10 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 14, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 14, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
   ret <2 x i18> %ctlz
@@ -2380,16 +2368,12 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
-; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, -16, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v2i16:
@@ -2410,12 +2394,10 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
   ret <2 x i16> %ctlz
@@ -2425,20 +2407,15 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_ffbh_u32_e32 v3, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
-; SI-NEXT:    v_or_b32_e32 v2, 0x100000, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v2, 0x200000, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v3, v0, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2462,14 +2439,11 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
   ret <3 x i16> %ctlz
@@ -2479,24 +2453,18 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; SI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2524,18 +2492,13 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v3, 16, v3
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, s4, 16, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
   ret <4 x i16> %ctlz
@@ -2545,27 +2508,24 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v2i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xffffe800, v0
-; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v2i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v1, 0xe800, v1
-; VI-NEXT:    v_subrev_u16_e32 v0, 24, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_ctlz_zero_undef_v2i8:
@@ -2576,10 +2536,8 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
   ret <2 x i8> %ctlz
@@ -2621,12 +2579,10 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 25, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
   ret <2 x i7> %ctlz

>From ad625a407622ba5817ef58e30357139a40cf929e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 19 May 2024 14:51:13 -0700
Subject: [PATCH 30/44] [TableGen] Avoid std::string copy. NFC

Fix #92702
---
 llvm/utils/TableGen/ARMTargetDefEmitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index 491011643bbfb..b79458529623f 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -170,7 +170,7 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
      << "/// The set of all architectures\n"
      << "static constexpr std::array<const ArchInfo *, " << CppSpellings.size()
      << "> ArchInfos = {\n";
-  for (auto CppSpelling : CppSpellings)
+  for (StringRef CppSpelling : CppSpellings)
     OS << "  &" << CppSpelling << ",\n";
   OS << "};\n";
 

>From 7892d434741ba0ac755e00ae96ca7cdcfaf82d35 Mon Sep 17 00:00:00 2001
From: Ryuichi Watanabe <ryucrosskey at gmail.com>
Date: Mon, 20 May 2024 07:01:47 +0900
Subject: [PATCH 31/44] Update llvm-bugs.yml (#77243)

---
 .github/workflows/llvm-bugs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml
index f592dd6ccd903..c392078fa4525 100644
--- a/.github/workflows/llvm-bugs.yml
+++ b/.github/workflows/llvm-bugs.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.repository == 'llvm/llvm-project'
     steps:
-      - uses: actions/setup-node@v3
+      - uses: actions/setup-node@v4
         with:
           node-version: 18
           check-latest: true

>From b603237b6c067e82a7c6b73adb7e18c8edfb40dd Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Sun, 19 May 2024 15:02:44 -0700
Subject: [PATCH 32/44] [llvm] Use operator==(StringRef, StringRef) (NFC)
 (#92705)

---
 llvm/lib/Option/OptTable.cpp                   |  2 +-
 llvm/lib/ProfileData/InstrProfCorrelator.cpp   | 10 ++++------
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp    |  6 +++---
 llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp |  2 +-
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp
index b8b6b90c253f2..3eceb0fbdfc47 100644
--- a/llvm/lib/Option/OptTable.cpp
+++ b/llvm/lib/Option/OptTable.cpp
@@ -197,7 +197,7 @@ OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
 
     std::vector<std::string> Result;
     for (StringRef Val : Candidates)
-      if (Val.starts_with(Arg) && Arg.compare(Val))
+      if (Val.starts_with(Arg) && Arg != Val)
         Result.push_back(std::string(Val));
     return Result;
   }
diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
index cf80a58f43bd9..44e2aeb00d8cc 100644
--- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp
+++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
@@ -350,16 +350,14 @@ void DwarfInstrProfCorrelator<IntPtrT>::correlateProfileDataImpl(
         continue;
       }
       StringRef AnnotationName = *AnnotationNameOrErr;
-      if (AnnotationName.compare(
-              InstrProfCorrelator::FunctionNameAttributeName) == 0) {
+      if (AnnotationName == InstrProfCorrelator::FunctionNameAttributeName) {
         if (auto EC =
                 AnnotationFormValue->getAsCString().moveInto(FunctionName))
           consumeError(std::move(EC));
-      } else if (AnnotationName.compare(
-                     InstrProfCorrelator::CFGHashAttributeName) == 0) {
+      } else if (AnnotationName == InstrProfCorrelator::CFGHashAttributeName) {
         CFGHash = AnnotationFormValue->getAsUnsignedConstant();
-      } else if (AnnotationName.compare(
-                     InstrProfCorrelator::NumCountersAttributeName) == 0) {
+      } else if (AnnotationName ==
+                 InstrProfCorrelator::NumCountersAttributeName) {
         NumCounters = AnnotationFormValue->getAsUnsignedConstant();
       }
     }
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 727e4e584c053..f4daab7d06eb5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -171,9 +171,9 @@ getArgAccessQual(const Function &F, unsigned ArgIdx) {
   if (!ArgAttribute)
     return SPIRV::AccessQualifier::ReadWrite;
 
-  if (ArgAttribute->getString().compare("read_only") == 0)
+  if (ArgAttribute->getString() == "read_only")
     return SPIRV::AccessQualifier::ReadOnly;
-  if (ArgAttribute->getString().compare("write_only") == 0)
+  if (ArgAttribute->getString() == "write_only")
     return SPIRV::AccessQualifier::WriteOnly;
   return SPIRV::AccessQualifier::ReadWrite;
 }
@@ -181,7 +181,7 @@ getArgAccessQual(const Function &F, unsigned ArgIdx) {
 static std::vector<SPIRV::Decoration::Decoration>
 getKernelArgTypeQual(const Function &F, unsigned ArgIdx) {
   MDString *ArgAttribute = getOCLKernelArgTypeQual(F, ArgIdx);
-  if (ArgAttribute && ArgAttribute->getString().compare("volatile") == 0)
+  if (ArgAttribute && ArgAttribute->getString() == "volatile")
     return {SPIRV::Decoration::Volatile};
   return {};
 }
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 62b4a9278954c..6623106109316 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1802,7 +1802,7 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
                                            bool &ParseError, SMLoc &End) {
   // A named operator should be either lower or upper case, but not a mix...
   // except in MASM, which uses full case-insensitivity.
-  if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
+  if (Name != Name.lower() && Name != Name.upper() &&
       !getParser().isParsingMasm())
     return false;
   if (Name.equals_insensitive("not")) {

>From 2d5e488c98225108aebfe4aa4acfe6ec1f234a37 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano at gmail.com>
Date: Sun, 19 May 2024 15:09:03 -0700
Subject: [PATCH 33/44] [clang-format][NFC] Clean up SortIncludesTest.cpp

Wherever applicable, replace EXPECT_EQ with verifyFormat and std::string
with StringRef. Also, change a raw string literal to a regular one.
---
 clang/unittests/Format/SortIncludesTest.cpp | 1942 +++++++++----------
 1 file changed, 970 insertions(+), 972 deletions(-)

diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp
index 824fa0078cd03..52ba19627182b 100644
--- a/clang/unittests/Format/SortIncludesTest.cpp
+++ b/clang/unittests/Format/SortIncludesTest.cpp
@@ -53,35 +53,35 @@ class SortIncludesTest : public test::FormatTestBase {
 };
 
 TEST_F(SortIncludesTest, BasicSorting) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\""));
-
-  EXPECT_EQ("// comment\n"
-            "#include <a>\n"
-            "#include <b>",
-            sort("// comment\n"
-                 "#include <b>\n"
-                 "#include <a>",
-                 {tooling::Range(25, 1)}));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\""));
+
+  verifyFormat("// comment\n"
+               "#include <a>\n"
+               "#include <b>",
+               sort("// comment\n"
+                    "#include <b>\n"
+                    "#include <a>",
+                    {tooling::Range(25, 1)}));
 }
 
 TEST_F(SortIncludesTest, TrailingComments) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\" /* long\n"
-            "                  * long\n"
-            "                  * comment*/\n"
-            "#include \"c.h\"\n"
-            "#include \"d.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\" /* long\n"
-                 "                  * long\n"
-                 "                  * comment*/\n"
-                 "#include \"d.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\" /* long\n"
+               "                  * long\n"
+               "                  * comment*/\n"
+               "#include \"c.h\"\n"
+               "#include \"d.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\" /* long\n"
+                    "                  * long\n"
+                    "                  * comment*/\n"
+                    "#include \"d.h\""));
 }
 
 TEST_F(SortIncludesTest, SortedIncludesUsingSortPriorityAttribute) {
@@ -100,531 +100,531 @@ TEST_F(SortIncludesTest, SortedIncludesUsingSortPriorityAttribute) {
       {"<path", 9, 11, false},
       {"^<[^/].*\\.h>", 8, 10, false},
       {"^\".*\\.h\"", 10, 12, false}};
-  EXPECT_EQ("#include <sys/param.h>\n"
-            "#include <sys/types.h>\n"
-            "#include <sys/ioctl.h>\n"
-            "#include <sys/socket.h>\n"
-            "#include <sys/stat.h>\n"
-            "#include <sys/wait.h>\n"
-            "\n"
-            "#include <net/if.h>\n"
-            "#include <net/if_dl.h>\n"
-            "#include <net/route.h>\n"
-            "#include <netinet/in.h>\n"
-            "#include <protocols/rwhod.h>\n"
-            "\n"
-            "#include <assert.h>\n"
-            "#include <errno.h>\n"
-            "#include <inttypes.h>\n"
-            "#include <stdio.h>\n"
-            "#include <stdlib.h>\n"
-            "\n"
-            "#include <paths.h>\n"
-            "\n"
-            "#include \"pathnames.h\"",
-            sort("#include <sys/param.h>\n"
-                 "#include <sys/types.h>\n"
-                 "#include <sys/ioctl.h>\n"
-                 "#include <net/if_dl.h>\n"
-                 "#include <net/route.h>\n"
-                 "#include <netinet/in.h>\n"
-                 "#include <sys/socket.h>\n"
-                 "#include <sys/stat.h>\n"
-                 "#include <sys/wait.h>\n"
-                 "#include <net/if.h>\n"
-                 "#include <protocols/rwhod.h>\n"
-                 "#include <assert.h>\n"
-                 "#include <paths.h>\n"
-                 "#include \"pathnames.h\"\n"
-                 "#include <errno.h>\n"
-                 "#include <inttypes.h>\n"
-                 "#include <stdio.h>\n"
-                 "#include <stdlib.h>"));
+  verifyFormat("#include <sys/param.h>\n"
+               "#include <sys/types.h>\n"
+               "#include <sys/ioctl.h>\n"
+               "#include <sys/socket.h>\n"
+               "#include <sys/stat.h>\n"
+               "#include <sys/wait.h>\n"
+               "\n"
+               "#include <net/if.h>\n"
+               "#include <net/if_dl.h>\n"
+               "#include <net/route.h>\n"
+               "#include <netinet/in.h>\n"
+               "#include <protocols/rwhod.h>\n"
+               "\n"
+               "#include <assert.h>\n"
+               "#include <errno.h>\n"
+               "#include <inttypes.h>\n"
+               "#include <stdio.h>\n"
+               "#include <stdlib.h>\n"
+               "\n"
+               "#include <paths.h>\n"
+               "\n"
+               "#include \"pathnames.h\"",
+               sort("#include <sys/param.h>\n"
+                    "#include <sys/types.h>\n"
+                    "#include <sys/ioctl.h>\n"
+                    "#include <net/if_dl.h>\n"
+                    "#include <net/route.h>\n"
+                    "#include <netinet/in.h>\n"
+                    "#include <sys/socket.h>\n"
+                    "#include <sys/stat.h>\n"
+                    "#include <sys/wait.h>\n"
+                    "#include <net/if.h>\n"
+                    "#include <protocols/rwhod.h>\n"
+                    "#include <assert.h>\n"
+                    "#include <paths.h>\n"
+                    "#include \"pathnames.h\"\n"
+                    "#include <errno.h>\n"
+                    "#include <inttypes.h>\n"
+                    "#include <stdio.h>\n"
+                    "#include <stdlib.h>"));
 }
 TEST_F(SortIncludesTest, SortPriorityNotDefined) {
   FmtStyle = getLLVMStyle();
-  EXPECT_EQ("#include \"FormatTestUtils.h\"\n"
-            "#include \"clang/Format/Format.h\"\n"
-            "#include \"llvm/ADT/None.h\"\n"
-            "#include \"llvm/Support/Debug.h\"\n"
-            "#include \"gtest/gtest.h\"",
-            sort("#include \"clang/Format/Format.h\"\n"
-                 "#include \"llvm/ADT/None.h\"\n"
-                 "#include \"FormatTestUtils.h\"\n"
-                 "#include \"gtest/gtest.h\"\n"
-                 "#include \"llvm/Support/Debug.h\""));
+  verifyFormat("#include \"FormatTestUtils.h\"\n"
+               "#include \"clang/Format/Format.h\"\n"
+               "#include \"llvm/ADT/None.h\"\n"
+               "#include \"llvm/Support/Debug.h\"\n"
+               "#include \"gtest/gtest.h\"",
+               sort("#include \"clang/Format/Format.h\"\n"
+                    "#include \"llvm/ADT/None.h\"\n"
+                    "#include \"FormatTestUtils.h\"\n"
+                    "#include \"gtest/gtest.h\"\n"
+                    "#include \"llvm/Support/Debug.h\""));
 }
 
 TEST_F(SortIncludesTest, NoReplacementsForValidIncludes) {
   // Identical #includes have led to a failure with an unstable sort.
-  std::string Code = "#include <a>\n"
-                     "#include <b>\n"
-                     "#include <c>\n"
-                     "#include <d>\n"
-                     "#include <e>\n"
-                     "#include <f>\n";
+  StringRef Code = "#include <a>\n"
+                   "#include <b>\n"
+                   "#include <c>\n"
+                   "#include <d>\n"
+                   "#include <e>\n"
+                   "#include <f>\n";
   EXPECT_TRUE(sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a.cc").empty());
 }
 
 TEST_F(SortIncludesTest, MainFileHeader) {
-  std::string Code = "#include <string>\n"
-                     "\n"
-                     "#include \"a/extra_action.proto.h\"\n";
+  StringRef Code = "#include <string>\n"
+                   "\n"
+                   "#include \"a/extra_action.proto.h\"\n";
   FmtStyle = getGoogleStyle(FormatStyle::LK_Cpp);
   EXPECT_TRUE(
       sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a/extra_action.cc")
           .empty());
 
-  EXPECT_EQ("#include \"foo.bar.h\"\n"
-            "\n"
-            "#include \"a.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"foo.bar.h\"",
-                 "foo.bar.cc"));
+  verifyFormat("#include \"foo.bar.h\"\n"
+               "\n"
+               "#include \"a.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"foo.bar.h\"",
+                    "foo.bar.cc"));
 }
 
 TEST_F(SortIncludesTest, SortedIncludesInMultipleBlocksAreMerged) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "\n"
+                    "#include \"b.h\""));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, SupportClangFormatOff) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "// clang-format off\n"
-            "#include <b>\n"
-            "#include <a>\n"
-            "#include <c>\n"
-            "// clang-format on",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "// clang-format off\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "// clang-format on"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "// clang-format off\n"
+               "#include <b>\n"
+               "#include <a>\n"
+               "#include <c>\n"
+               "// clang-format on",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "// clang-format off\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "// clang-format on"));
 
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "// clang-format off\r\n"
-                     "#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "// clang-format on\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "// clang-format off\r\n"
-                         "#include \"d.h\"\r\n"
-                         "#include \"b.h\"\r\n"
-                         "// clang-format on\r\n"
-                         "\r\n"
-                         "#include \"e.h\"\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"c.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 1));
+  StringRef Code = "// clang-format off\r\n"
+                   "#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "// clang-format on\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "// clang-format off\r\n"
+                       "#include \"d.h\"\r\n"
+                       "#include \"b.h\"\r\n"
+                       "// clang-format on\r\n"
+                       "\r\n"
+                       "#include \"e.h\"\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"c.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 1));
 }
 
 TEST_F(SortIncludesTest, SupportClangFormatOffCStyle) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "/* clang-format off */\n"
-            "#include <b>\n"
-            "#include <a>\n"
-            "#include <c>\n"
-            "/* clang-format on */",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format off */\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format on */"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "/* clang-format off */\n"
+               "#include <b>\n"
+               "#include <a>\n"
+               "#include <c>\n"
+               "/* clang-format on */",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format off */\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format on */"));
 
   // Not really turning it off
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "/* clang-format offically */\n"
-            "#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "/* clang-format onwards */",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format offically */\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format onwards */",
-                 "input.h", 2));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "/* clang-format offically */\n"
+               "#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "/* clang-format onwards */",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format offically */\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format onwards */",
+                    "input.h", 2));
 }
 
 TEST_F(SortIncludesTest, IncludeSortingCanBeDisabled) {
   FmtStyle.SortIncludes = FormatStyle::SI_Never;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"b.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "input.h", 0));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"b.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, MixIncludeAndImport) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#import \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#import \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#import \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#import \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, FixTrailingComments) {
-  EXPECT_EQ("#include \"a.h\"  // comment\n"
-            "#include \"bb.h\" // comment\n"
-            "#include \"ccc.h\"",
-            sort("#include \"a.h\" // comment\n"
-                 "#include \"ccc.h\"\n"
-                 "#include \"bb.h\" // comment"));
+  verifyFormat("#include \"a.h\"  // comment\n"
+               "#include \"bb.h\" // comment\n"
+               "#include \"ccc.h\"",
+               sort("#include \"a.h\" // comment\n"
+                    "#include \"ccc.h\"\n"
+                    "#include \"bb.h\" // comment"));
 }
 
 TEST_F(SortIncludesTest, LeadingWhitespace) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort(" #include \"a.h\"\n"
-                 "  #include \"c.h\"\n"
-                 "   #include \"b.h\""));
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("# include \"a.h\"\n"
-                 "#  include \"c.h\"\n"
-                 "#   include \"b.h\""));
-  EXPECT_EQ("#include \"a.h\"", sort("#include \"a.h\"\n"
-                                     " #include \"a.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort(" #include \"a.h\"\n"
+                    "  #include \"c.h\"\n"
+                    "   #include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("# include \"a.h\"\n"
+                    "#  include \"c.h\"\n"
+                    "#   include \"b.h\""));
+  verifyFormat("#include \"a.h\"", sort("#include \"a.h\"\n"
+                                        " #include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, TrailingWhitespace) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\" \n"
-                 "#include \"c.h\"  \n"
-                 "#include \"b.h\"   "));
-  EXPECT_EQ("#include \"a.h\"", sort("#include \"a.h\"\n"
-                                     "#include \"a.h\" "));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\" \n"
+                    "#include \"c.h\"  \n"
+                    "#include \"b.h\"   "));
+  verifyFormat("#include \"a.h\"", sort("#include \"a.h\"\n"
+                                        "#include \"a.h\" "));
 }
 
 TEST_F(SortIncludesTest, GreaterInComment) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\" // >\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\" // >"));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\" // >\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\" // >"));
 }
 
 TEST_F(SortIncludesTest, SortsLocallyInEachBlock) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "\n"
-            "#include \"b.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "#include \"b.h\"",
-                 "input.h", 0));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "\n"
+               "#include \"b.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "#include \"b.h\"",
+                    "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, SortsAllBlocksWhenMerging) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, CommentsAlwaysSeparateGroups) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "// comment\n"
-            "#include \"b.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"a.h\"\n"
-                 "// comment\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "// comment\n"
+               "#include \"b.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"a.h\"\n"
+                    "// comment\n"
+                    "#include \"b.h\""));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "// comment\n"
-            "#include \"b.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"a.h\"\n"
-                 "// comment\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "// comment\n"
+               "#include \"b.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"a.h\"\n"
+                    "// comment\n"
+                    "#include \"b.h\""));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "// comment\n"
-            "#include \"b.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"a.h\"\n"
-                 "// comment\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "// comment\n"
+               "#include \"b.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"a.h\"\n"
+                    "// comment\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, HandlesAngledIncludesAsSeparateBlocks) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "#include <array>\n"
-            "#include <b.h>\n"
-            "#include <d.h>\n"
-            "#include <vector>",
-            sort("#include <vector>\n"
-                 "#include <d.h>\n"
-                 "#include <array>\n"
-                 "#include <b.h>\n"
-                 "#include \"c.h\"\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "#include <array>\n"
+               "#include <b.h>\n"
+               "#include <d.h>\n"
+               "#include <vector>",
+               sort("#include <vector>\n"
+                    "#include <d.h>\n"
+                    "#include <array>\n"
+                    "#include <b.h>\n"
+                    "#include \"c.h\"\n"
+                    "#include \"a.h\""));
 
   FmtStyle = getGoogleStyle(FormatStyle::LK_Cpp);
-  EXPECT_EQ("#include <b.h>\n"
-            "#include <d.h>\n"
-            "\n"
-            "#include <array>\n"
-            "#include <vector>\n"
-            "\n"
-            "#include \"a.h\"\n"
-            "#include \"c.h\"",
-            sort("#include <vector>\n"
-                 "#include <d.h>\n"
-                 "#include <array>\n"
-                 "#include <b.h>\n"
-                 "#include \"c.h\"\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include <b.h>\n"
+               "#include <d.h>\n"
+               "\n"
+               "#include <array>\n"
+               "#include <vector>\n"
+               "\n"
+               "#include \"a.h\"\n"
+               "#include \"c.h\"",
+               sort("#include <vector>\n"
+                    "#include <d.h>\n"
+                    "#include <array>\n"
+                    "#include <b.h>\n"
+                    "#include \"c.h\"\n"
+                    "#include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, RegroupsAngledIncludesInSeparateBlocks) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "\n"
-            "#include <b.h>\n"
-            "#include <d.h>",
-            sort("#include <d.h>\n"
-                 "#include <b.h>\n"
-                 "#include \"c.h\"\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "\n"
+               "#include <b.h>\n"
+               "#include <d.h>",
+               sort("#include <d.h>\n"
+                    "#include <b.h>\n"
+                    "#include \"c.h\"\n"
+                    "#include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, HandlesMultilineIncludes) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \\\n"
-                 "\"c.h\"\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \\\n"
+                    "\"c.h\"\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, HandlesTrailingCommentsWithAngleBrackets) {
   // Regression test from the discussion at https://reviews.llvm.org/D121370.
-  EXPECT_EQ("#include <cstdint>\n"
-            "\n"
-            "#include \"util/bar.h\"\n"
-            "#include \"util/foo/foo.h\" // foo<T>",
-            sort("#include <cstdint>\n"
-                 "\n"
-                 "#include \"util/bar.h\"\n"
-                 "#include \"util/foo/foo.h\" // foo<T>",
-                 /*FileName=*/"input.cc",
-                 /*ExpectedNumRanges=*/0));
+  verifyFormat("#include <cstdint>\n"
+               "\n"
+               "#include \"util/bar.h\"\n"
+               "#include \"util/foo/foo.h\" // foo<T>",
+               sort("#include <cstdint>\n"
+                    "\n"
+                    "#include \"util/bar.h\"\n"
+                    "#include \"util/foo/foo.h\" // foo<T>",
+                    /*FileName=*/"input.cc",
+                    /*ExpectedNumRanges=*/0));
 }
 
 TEST_F(SortIncludesTest, LeavesMainHeaderFirst) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.cc"));
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.cc"));
-  EXPECT_EQ("#include \"llvm/input.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/input.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "input.mm"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.cc"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.cc"));
+  verifyFormat("#include \"llvm/input.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/input.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "input.mm"));
 
   // Don't allow prefixes.
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/not_a.h\"",
-            sort("#include \"llvm/not_a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.cc"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/not_a.h\"",
+               sort("#include \"llvm/not_a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.cc"));
 
   // Don't do this for _main and other suffixes.
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_main.cc"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_main.cc"));
 
   // Don't do this in headers.
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.h"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.h"));
 
   // Only do this in the first #include block.
-  EXPECT_EQ("#include <a>\n"
-            "\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include <a>\n"
-                 "\n"
-                 "#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.cc"));
+  verifyFormat("#include <a>\n"
+               "\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include <a>\n"
+                    "\n"
+                    "#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.cc"));
 
   // Only recognize the first #include with a matching basename as main include.
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"b.h\"\n"
-                 "#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"llvm/a.h\"",
-                 "a.cc"));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"b.h\"\n"
+                    "#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"llvm/a.h\"",
+                    "a.cc"));
 }
 
 TEST_F(SortIncludesTest, LeavesMainHeaderFirstInAdditionalExtensions) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?|(Impl)?$";
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.xxx"));
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "aImpl.hpp"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.xxx"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "aImpl.hpp"));
 
   // .cpp extension is considered "main" by default
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "aImpl.cpp"));
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.cpp"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "aImpl.cpp"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.cpp"));
 
   // Allow additional filenames / extensions
   Style.IncludeIsMainSourceRegex = "(Impl\\.hpp)|(\\.xxx)$";
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.xxx"));
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "aImpl.hpp"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.xxx"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "aImpl.hpp"));
 }
 
 TEST_F(SortIncludesTest, RecognizeMainHeaderInAllGroups) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
 
-  EXPECT_EQ("#include \"c.h\"\n"
-            "#include \"a.h\"\n"
-            "#include \"b.h\"",
-            sort("#include \"b.h\"\n"
-                 "\n"
-                 "#include \"a.h\"\n"
-                 "#include \"c.h\"",
-                 "c.cc"));
+  verifyFormat("#include \"c.h\"\n"
+               "#include \"a.h\"\n"
+               "#include \"b.h\"",
+               sort("#include \"b.h\"\n"
+                    "\n"
+                    "#include \"a.h\"\n"
+                    "#include \"c.h\"",
+                    "c.cc"));
 }
 
 TEST_F(SortIncludesTest, MainHeaderIsSeparatedWhenRegroupping) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
 
-  EXPECT_EQ("#include \"a.h\"\n"
-            "\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"b.h\"\n"
-                 "\n"
-                 "#include \"a.h\"\n"
-                 "#include \"c.h\"",
-                 "a.cc"));
+  verifyFormat("#include \"a.h\"\n"
+               "\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"b.h\"\n"
+                    "\n"
+                    "#include \"a.h\"\n"
+                    "#include \"c.h\"",
+                    "a.cc"));
 }
 
 TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveSorting) {
@@ -632,17 +632,17 @@ TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveSorting) {
 
   FmtStyle.SortIncludes = FormatStyle::SI_CaseInsensitive;
 
-  EXPECT_EQ("#include \"A/B.h\"\n"
-            "#include \"A/b.h\"\n"
-            "#include \"a/b.h\"\n"
-            "#include \"B/A.h\"\n"
-            "#include \"B/a.h\"",
-            sort("#include \"B/a.h\"\n"
-                 "#include \"B/A.h\"\n"
-                 "#include \"A/B.h\"\n"
-                 "#include \"a/b.h\"\n"
-                 "#include \"A/b.h\"",
-                 "a.h"));
+  verifyFormat("#include \"A/B.h\"\n"
+               "#include \"A/b.h\"\n"
+               "#include \"a/b.h\"\n"
+               "#include \"B/A.h\"\n"
+               "#include \"B/a.h\"",
+               sort("#include \"B/a.h\"\n"
+                    "#include \"B/A.h\"\n"
+                    "#include \"A/B.h\"\n"
+                    "#include \"a/b.h\"\n"
+                    "#include \"A/b.h\"",
+                    "a.h"));
 
   Style.IncludeBlocks = clang::tooling::IncludeStyle::IBS_Regroup;
   Style.IncludeCategories = {
@@ -657,17 +657,17 @@ TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveSorting) {
                            "#include \"Vlib.h\"\n"
                            "#include \"AST.h\"";
 
-  EXPECT_EQ("#include \"AST.h\"\n"
-            "#include \"qt.h\"\n"
-            "#include \"Vlib.h\"\n"
-            "#include \"vlib.h\"\n"
-            "\n"
-            "#include <Qtwhatever.h>\n"
-            "#include <qtwhatever.h>\n"
-            "\n"
-            "#include <Algorithm>\n"
-            "#include <algorithm>",
-            sort(UnsortedCode));
+  verifyFormat("#include \"AST.h\"\n"
+               "#include \"qt.h\"\n"
+               "#include \"Vlib.h\"\n"
+               "#include \"vlib.h\"\n"
+               "\n"
+               "#include <Qtwhatever.h>\n"
+               "#include <qtwhatever.h>\n"
+               "\n"
+               "#include <Algorithm>\n"
+               "#include <algorithm>",
+               sort(UnsortedCode));
 }
 
 TEST_F(SortIncludesTest, SupportCaseInsensitiveMatching) {
@@ -676,21 +676,21 @@ TEST_F(SortIncludesTest, SupportCaseInsensitiveMatching) {
 
   // Ensure both main header detection and grouping work in a case insensitive
   // manner.
-  EXPECT_EQ("#include \"llvm/A.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"LLVM/z.h\"\n"
-            "#include \"llvm/X.h\"\n"
-            "#include \"GTest/GTest.h\"\n"
-            "#include \"gmock/gmock.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"b.h\"\n"
-                 "#include \"GTest/GTest.h\"\n"
-                 "#include \"llvm/A.h\"\n"
-                 "#include \"gmock/gmock.h\"\n"
-                 "#include \"llvm/X.h\"\n"
-                 "#include \"LLVM/z.h\"",
-                 "a_TEST.cc"));
+  verifyFormat("#include \"llvm/A.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"LLVM/z.h\"\n"
+               "#include \"llvm/X.h\"\n"
+               "#include \"GTest/GTest.h\"\n"
+               "#include \"gmock/gmock.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"b.h\"\n"
+                    "#include \"GTest/GTest.h\"\n"
+                    "#include \"llvm/A.h\"\n"
+                    "#include \"gmock/gmock.h\"\n"
+                    "#include \"llvm/X.h\"\n"
+                    "#include \"LLVM/z.h\"",
+                    "a_TEST.cc"));
 }
 
 TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveMachting) {
@@ -711,57 +711,57 @@ TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveMachting) {
                            "#include <qtwhatever.h>\n"
                            "#include <QtGlobal>";
 
-  EXPECT_EQ("#include \"qa.h\"\n"
-            "#include \"qt.h\"\n"
-            "\n"
-            "#include <qtwhatever.h>\n"
-            "#include <windows.h>\n"
-            "\n"
-            "#include <QLabel>\n"
-            "#include <QWidget>\n"
-            "#include <QtGlobal>\n"
-            "#include <queue>\n"
-            "\n"
-            "#include <algorithm>",
-            sort(UnsortedCode));
+  verifyFormat("#include \"qa.h\"\n"
+               "#include \"qt.h\"\n"
+               "\n"
+               "#include <qtwhatever.h>\n"
+               "#include <windows.h>\n"
+               "\n"
+               "#include <QLabel>\n"
+               "#include <QWidget>\n"
+               "#include <QtGlobal>\n"
+               "#include <queue>\n"
+               "\n"
+               "#include <algorithm>",
+               sort(UnsortedCode));
 
   Style.IncludeCategories[2].RegexIsCaseSensitive = true;
   Style.IncludeCategories[3].RegexIsCaseSensitive = true;
-  EXPECT_EQ("#include \"qa.h\"\n"
-            "#include \"qt.h\"\n"
-            "\n"
-            "#include <qtwhatever.h>\n"
-            "#include <windows.h>\n"
-            "\n"
-            "#include <QLabel>\n"
-            "#include <QWidget>\n"
-            "\n"
-            "#include <QtGlobal>\n"
-            "\n"
-            "#include <algorithm>\n"
-            "#include <queue>",
-            sort(UnsortedCode));
+  verifyFormat("#include \"qa.h\"\n"
+               "#include \"qt.h\"\n"
+               "\n"
+               "#include <qtwhatever.h>\n"
+               "#include <windows.h>\n"
+               "\n"
+               "#include <QLabel>\n"
+               "#include <QWidget>\n"
+               "\n"
+               "#include <QtGlobal>\n"
+               "\n"
+               "#include <algorithm>\n"
+               "#include <queue>",
+               sort(UnsortedCode));
 }
 
 TEST_F(SortIncludesTest, NegativePriorities) {
   Style.IncludeCategories = {{".*important_os_header.*", -1, 0, false},
                              {".*", 1, 0, false}};
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "#include \"c_main.h\"\n"
-            "#include \"a_other.h\"",
-            sort("#include \"c_main.h\"\n"
-                 "#include \"a_other.h\"\n"
-                 "#include \"important_os_header.h\"",
-                 "c_main.cc"));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "#include \"c_main.h\"\n"
+               "#include \"a_other.h\"",
+               sort("#include \"c_main.h\"\n"
+                    "#include \"a_other.h\"\n"
+                    "#include \"important_os_header.h\"",
+                    "c_main.cc"));
 
   // check stable when re-run
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "#include \"c_main.h\"\n"
-            "#include \"a_other.h\"",
-            sort("#include \"important_os_header.h\"\n"
-                 "#include \"c_main.h\"\n"
-                 "#include \"a_other.h\"",
-                 "c_main.cc", 0));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "#include \"c_main.h\"\n"
+               "#include \"a_other.h\"",
+               sort("#include \"important_os_header.h\"\n"
+                    "#include \"c_main.h\"\n"
+                    "#include \"a_other.h\"",
+                    "c_main.cc", 0));
 }
 
 TEST_F(SortIncludesTest, PriorityGroupsAreSeparatedWhenRegroupping) {
@@ -769,34 +769,34 @@ TEST_F(SortIncludesTest, PriorityGroupsAreSeparatedWhenRegroupping) {
                              {".*", 1, 0, false}};
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
 
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "\n"
-            "#include \"c_main.h\"\n"
-            "\n"
-            "#include \"a_other.h\"",
-            sort("#include \"c_main.h\"\n"
-                 "#include \"a_other.h\"\n"
-                 "#include \"important_os_header.h\"",
-                 "c_main.cc"));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "\n"
+               "#include \"c_main.h\"\n"
+               "\n"
+               "#include \"a_other.h\"",
+               sort("#include \"c_main.h\"\n"
+                    "#include \"a_other.h\"\n"
+                    "#include \"important_os_header.h\"",
+                    "c_main.cc"));
 
   // check stable when re-run
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "\n"
-            "#include \"c_main.h\"\n"
-            "\n"
-            "#include \"a_other.h\"",
-            sort("#include \"important_os_header.h\"\n"
-                 "\n"
-                 "#include \"c_main.h\"\n"
-                 "\n"
-                 "#include \"a_other.h\"",
-                 "c_main.cc", 0));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "\n"
+               "#include \"c_main.h\"\n"
+               "\n"
+               "#include \"a_other.h\"",
+               sort("#include \"important_os_header.h\"\n"
+                    "\n"
+                    "#include \"c_main.h\"\n"
+                    "\n"
+                    "#include \"a_other.h\"",
+                    "c_main.cc", 0));
 }
 
 TEST_F(SortIncludesTest, CalculatesCorrectCursorPosition) {
-  std::string Code = "#include <ccc>\n"    // Start of line: 0
-                     "#include <bbbbbb>\n" // Start of line: 15
-                     "#include <a>\n";     // Start of line: 33
+  StringRef Code = "#include <ccc>\n"    // Start of line: 0
+                   "#include <bbbbbb>\n" // Start of line: 15
+                   "#include <a>\n";     // Start of line: 33
   EXPECT_EQ(31u, newCursor(Code, 0));
   EXPECT_EQ(13u, newCursor(Code, 15));
   EXPECT_EQ(0u, newCursor(Code, 33));
@@ -808,14 +808,14 @@ TEST_F(SortIncludesTest, CalculatesCorrectCursorPosition) {
 
 TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionWithRegrouping) {
   Style.IncludeBlocks = Style.IBS_Regroup;
-  std::string Code = "#include \"b\"\n"      // Start of line: 0
-                     "\n"                    // Start of line: 13
-                     "#include \"aa\"\n"     // Start of line: 14
-                     "int i;";               // Start of line: 28
-  std::string Expected = "#include \"aa\"\n" // Start of line: 0
-                         "#include \"b\"\n"  // Start of line: 14
-                         "int i;";           // Start of line: 27
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"b\"\n"      // Start of line: 0
+                   "\n"                    // Start of line: 13
+                   "#include \"aa\"\n"     // Start of line: 14
+                   "int i;";               // Start of line: 28
+  StringRef Expected = "#include \"aa\"\n" // Start of line: 0
+                       "#include \"b\"\n"  // Start of line: 14
+                       "int i;";           // Start of line: 27
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(12u, newCursor(Code, 26)); // Closing quote of "aa"
   EXPECT_EQ(26u, newCursor(Code, 27)); // Newline after "aa"
   EXPECT_EQ(27u, newCursor(Code, 28)); // Start of last line
@@ -827,14 +827,14 @@ TEST_F(SortIncludesTest,
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {
       {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
-  std::string Code = "#include \"a\"\r\n" // Start of line: 0
-                     "\r\n"               // Start of line: 14
-                     "#include \"b\"\r\n" // Start of line: 16
-                     "\r\n"               // Start of line: 30
-                     "#include \"c\"\r\n" // Start of line: 32
-                     "\r\n"               // Start of line: 46
-                     "int i;";            // Start of line: 48
-  verifyNoChange(Code);
+  StringRef Code = "#include \"a\"\r\n" // Start of line: 0
+                   "\r\n"               // Start of line: 14
+                   "#include \"b\"\r\n" // Start of line: 16
+                   "\r\n"               // Start of line: 30
+                   "#include \"c\"\r\n" // Start of line: 32
+                   "\r\n"               // Start of line: 46
+                   "int i;";            // Start of line: 48
+  verifyFormat(Code);
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(14u, newCursor(Code, 14));
   EXPECT_EQ(16u, newCursor(Code, 16));
@@ -850,19 +850,19 @@ TEST_F(
   Style.IncludeBlocks = Style.IBS_Regroup;
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {{".*", 0, 0, false}};
-  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
-                     "\r\n"                   // Start of line: 14
-                     "#include \"b\"\r\n"     // Start of line: 16
-                     "\r\n"                   // Start of line: 30
-                     "#include \"c\"\r\n"     // Start of line: 32
-                     "\r\n"                   // Start of line: 46
-                     "int i;";                // Start of line: 48
-  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
-                         "#include \"b\"\r\n" // Start of line: 14
-                         "#include \"c\"\r\n" // Start of line: 28
-                         "\r\n"               // Start of line: 42
-                         "int i;";            // Start of line: 44
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"a\"\r\n"     // Start of line: 0
+                   "\r\n"                   // Start of line: 14
+                   "#include \"b\"\r\n"     // Start of line: 16
+                   "\r\n"                   // Start of line: 30
+                   "#include \"c\"\r\n"     // Start of line: 32
+                   "\r\n"                   // Start of line: 46
+                   "int i;";                // Start of line: 48
+  StringRef Expected = "#include \"a\"\r\n" // Start of line: 0
+                       "#include \"b\"\r\n" // Start of line: 14
+                       "#include \"c\"\r\n" // Start of line: 28
+                       "\r\n"               // Start of line: 42
+                       "int i;";            // Start of line: 44
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(
       14u,
@@ -885,19 +885,19 @@ TEST_F(
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {
       {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
-  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
-                     "#include \"b\"\r\n"     // Start of line: 14
-                     "#include \"c\"\r\n"     // Start of line: 28
-                     "\r\n"                   // Start of line: 42
-                     "int i;";                // Start of line: 44
-  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
-                         "\r\n"               // Start of line: 14
-                         "#include \"b\"\r\n" // Start of line: 16
-                         "\r\n"               // Start of line: 30
-                         "#include \"c\"\r\n" // Start of line: 32
-                         "\r\n"               // Start of line: 46
-                         "int i;";            // Start of line: 48
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"a\"\r\n"     // Start of line: 0
+                   "#include \"b\"\r\n"     // Start of line: 14
+                   "#include \"c\"\r\n"     // Start of line: 28
+                   "\r\n"                   // Start of line: 42
+                   "int i;";                // Start of line: 44
+  StringRef Expected = "#include \"a\"\r\n" // Start of line: 0
+                       "\r\n"               // Start of line: 14
+                       "#include \"b\"\r\n" // Start of line: 16
+                       "\r\n"               // Start of line: 30
+                       "#include \"c\"\r\n" // Start of line: 32
+                       "\r\n"               // Start of line: 46
+                       "int i;";            // Start of line: 48
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(15u, newCursor(Code, 16));
   EXPECT_EQ(30u, newCursor(Code, 32));
@@ -912,21 +912,21 @@ TEST_F(
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {
       {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
-  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
-                     "\r\n"                   // Start of line: 14
-                     "#include \"c\"\r\n"     // Start of line: 16
-                     "\r\n"                   // Start of line: 30
-                     "#include \"b\"\r\n"     // Start of line: 32
-                     "\r\n"                   // Start of line: 46
-                     "int i;";                // Start of line: 48
-  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
-                         "\r\n"               // Start of line: 14
-                         "#include \"b\"\r\n" // Start of line: 16
-                         "\r\n"               // Start of line: 30
-                         "#include \"c\"\r\n" // Start of line: 32
-                         "\r\n"               // Start of line: 46
-                         "int i;";            // Start of line: 48
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"a\"\r\n"     // Start of line: 0
+                   "\r\n"                   // Start of line: 14
+                   "#include \"c\"\r\n"     // Start of line: 16
+                   "\r\n"                   // Start of line: 30
+                   "#include \"b\"\r\n"     // Start of line: 32
+                   "\r\n"                   // Start of line: 46
+                   "int i;";                // Start of line: 48
+  StringRef Expected = "#include \"a\"\r\n" // Start of line: 0
+                       "\r\n"               // Start of line: 14
+                       "#include \"b\"\r\n" // Start of line: 16
+                       "\r\n"               // Start of line: 30
+                       "#include \"c\"\r\n" // Start of line: 32
+                       "\r\n"               // Start of line: 46
+                       "int i;";            // Start of line: 48
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(14u, newCursor(Code, 14));
   EXPECT_EQ(30u, newCursor(Code, 32));
@@ -938,88 +938,88 @@ TEST_F(
 #endif
 
 TEST_F(SortIncludesTest, DeduplicateIncludes) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <c>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <c>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "#include <c>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "#include <c>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "#include <c>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "#include <c>"));
 }
 
 TEST_F(SortIncludesTest, SortAndDeduplicateIncludes) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <c>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <c>\n"
+                    "#include <b>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <c>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <c>\n"
+                    "#include <b>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <c>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <c>\n"
+                    "#include <b>"));
 }
 
 TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionAfterDeduplicate) {
-  std::string Code = "#include <b>\n"      // Start of line: 0
-                     "#include <a>\n"      // Start of line: 13
-                     "#include <b>\n"      // Start of line: 26
-                     "#include <b>\n"      // Start of line: 39
-                     "#include <c>\n"      // Start of line: 52
-                     "#include <b>\n";     // Start of line: 65
-  std::string Expected = "#include <a>\n"  // Start of line: 0
-                         "#include <b>\n"  // Start of line: 13
-                         "#include <c>\n"; // Start of line: 26
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include <b>\n"      // Start of line: 0
+                   "#include <a>\n"      // Start of line: 13
+                   "#include <b>\n"      // Start of line: 26
+                   "#include <b>\n"      // Start of line: 39
+                   "#include <c>\n"      // Start of line: 52
+                   "#include <b>\n";     // Start of line: 65
+  StringRef Expected = "#include <a>\n"  // Start of line: 0
+                       "#include <b>\n"  // Start of line: 13
+                       "#include <c>\n"; // Start of line: 26
+  verifyFormat(Expected, sort(Code));
   // Cursor on 'i' in "#include <a>".
   EXPECT_EQ(1u, newCursor(Code, 14));
   // Cursor on 'b' in "#include <b>".
@@ -1033,26 +1033,26 @@ TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionAfterDeduplicate) {
 }
 
 TEST_F(SortIncludesTest, DeduplicateLocallyInEachBlock) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <c>\n"
-                 "#include <b>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <c>\n"
+                    "#include <b>\n"
+                    "#include <b>"));
 }
 
 TEST_F(SortIncludesTest, ValidAffactedRangesAfterDeduplicatingIncludes) {
-  std::string Code = "#include <a>\n"
-                     "#include <b>\n"
-                     "#include <a>\n"
-                     "#include <a>\n"
-                     "\n"
-                     "   int     x ;";
+  StringRef Code = "#include <a>\n"
+                   "#include <b>\n"
+                   "#include <a>\n"
+                   "#include <a>\n"
+                   "\n"
+                   "   int     x ;";
   std::vector<tooling::Range> Ranges = {tooling::Range(0, 52)};
   auto Replaces = sortIncludes(FmtStyle, Code, Ranges, "input.cpp");
   Ranges = tooling::calculateRangesAfterReplacements(Replaces, Ranges);
@@ -1062,80 +1062,78 @@ TEST_F(SortIncludesTest, ValidAffactedRangesAfterDeduplicatingIncludes) {
 }
 
 TEST_F(SortIncludesTest, DoNotSortLikelyXml) {
-  EXPECT_EQ("<!--;\n"
-            "#include <b>\n"
-            "#include <a>\n"
-            "-->",
-            sort("<!--;\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "-->",
-                 "input.h", 0));
+  verifyFormat("<!--;\n"
+               "#include <b>\n"
+               "#include <a>\n"
+               "-->",
+               sort("<!--;\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "-->",
+                    "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, DoNotOutputReplacementsForSortedBlocksWithRegrouping) {
   Style.IncludeBlocks = Style.IBS_Regroup;
-  std::string Code = R"(
-#include "b.h"
-
-#include <a.h>
-)";
-  EXPECT_EQ(Code, sort(Code, "input.h", 0));
+  StringRef Code = "#include \"b.h\"\n"
+                   "\n"
+                   "#include <a.h>";
+  verifyFormat(Code, sort(Code, "input.h", 0));
 }
 
 TEST_F(SortIncludesTest,
        DoNotOutputReplacementsForSortedBlocksWithRegroupingWindows) {
   Style.IncludeBlocks = Style.IBS_Regroup;
-  std::string Code = "#include \"b.h\"\r\n"
-                     "\r\n"
-                     "#include <a.h>\r\n";
-  EXPECT_EQ(Code, sort(Code, "input.h", 0));
+  StringRef Code = "#include \"b.h\"\r\n"
+                   "\r\n"
+                   "#include <a.h>\r\n";
+  verifyFormat(Code, sort(Code, "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, MainIncludeChar) {
-  std::string Code = "#include <a>\n"
-                     "#include \"quote/input.h\"\n"
-                     "#include <angle-bracket/input.h>\n";
+  StringRef Code = "#include <a>\n"
+                   "#include \"quote/input.h\"\n"
+                   "#include <angle-bracket/input.h>\n";
 
   // Default behavior
-  EXPECT_EQ("#include \"quote/input.h\"\n"
-            "#include <a>\n"
-            "#include <angle-bracket/input.h>\n",
-            sort(Code, "input.cc", 1));
+  verifyFormat("#include \"quote/input.h\"\n"
+               "#include <a>\n"
+               "#include <angle-bracket/input.h>\n",
+               sort(Code, "input.cc", 1));
 
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Quote;
-  EXPECT_EQ("#include \"quote/input.h\"\n"
-            "#include <a>\n"
-            "#include <angle-bracket/input.h>\n",
-            sort(Code, "input.cc", 1));
+  verifyFormat("#include \"quote/input.h\"\n"
+               "#include <a>\n"
+               "#include <angle-bracket/input.h>\n",
+               sort(Code, "input.cc", 1));
 
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_AngleBracket;
-  EXPECT_EQ("#include <angle-bracket/input.h>\n"
-            "#include \"quote/input.h\"\n"
-            "#include <a>\n",
-            sort(Code, "input.cc", 1));
+  verifyFormat("#include <angle-bracket/input.h>\n"
+               "#include \"quote/input.h\"\n"
+               "#include <a>\n",
+               sort(Code, "input.cc", 1));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharAnyPickQuote) {
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Any;
-  EXPECT_EQ("#include \"input.h\"\n"
-            "#include <a>\n"
-            "#include <b>\n",
-            sort("#include <a>\n"
-                 "#include \"input.h\"\n"
-                 "#include <b>\n",
-                 "input.cc", 1));
+  verifyFormat("#include \"input.h\"\n"
+               "#include <a>\n"
+               "#include <b>\n",
+               sort("#include <a>\n"
+                    "#include \"input.h\"\n"
+                    "#include <b>\n",
+                    "input.cc", 1));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharAnyPickAngleBracket) {
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Any;
-  EXPECT_EQ("#include <input.h>\n"
-            "#include <a>\n"
-            "#include <b>\n",
-            sort("#include <a>\n"
-                 "#include <input.h>\n"
-                 "#include <b>\n",
-                 "input.cc", 1));
+  verifyFormat("#include <input.h>\n"
+               "#include <a>\n"
+               "#include <b>\n",
+               sort("#include <a>\n"
+                    "#include <input.h>\n"
+                    "#include <b>\n",
+                    "input.cc", 1));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharQuoteAndRegroup) {
@@ -1144,28 +1142,28 @@ TEST_F(SortIncludesTest, MainIncludeCharQuoteAndRegroup) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Quote;
 
-  EXPECT_EQ("#include \"lib-b/input.h\"\n"
-            "\n"
-            "#include <lib-a/h-1.h>\n"
-            "#include <lib-a/h-3.h>\n"
-            "#include <lib-a/input.h>\n"
-            "\n"
-            "#include <lib-b/h-1.h>\n"
-            "#include <lib-b/h-3.h>\n"
-            "\n"
-            "#include <lib-c/h-1.h>\n"
-            "#include <lib-c/h-2.h>\n"
-            "#include <lib-c/h-3.h>\n",
-            sort("#include <lib-c/h-1.h>\n"
-                 "#include <lib-c/h-2.h>\n"
-                 "#include <lib-c/h-3.h>\n"
-                 "#include <lib-b/h-1.h>\n"
-                 "#include \"lib-b/input.h\"\n"
-                 "#include <lib-b/h-3.h>\n"
-                 "#include <lib-a/h-1.h>\n"
-                 "#include <lib-a/input.h>\n"
-                 "#include <lib-a/h-3.h>\n",
-                 "input.cc"));
+  verifyFormat("#include \"lib-b/input.h\"\n"
+               "\n"
+               "#include <lib-a/h-1.h>\n"
+               "#include <lib-a/h-3.h>\n"
+               "#include <lib-a/input.h>\n"
+               "\n"
+               "#include <lib-b/h-1.h>\n"
+               "#include <lib-b/h-3.h>\n"
+               "\n"
+               "#include <lib-c/h-1.h>\n"
+               "#include <lib-c/h-2.h>\n"
+               "#include <lib-c/h-3.h>\n",
+               sort("#include <lib-c/h-1.h>\n"
+                    "#include <lib-c/h-2.h>\n"
+                    "#include <lib-c/h-3.h>\n"
+                    "#include <lib-b/h-1.h>\n"
+                    "#include \"lib-b/input.h\"\n"
+                    "#include <lib-b/h-3.h>\n"
+                    "#include <lib-a/h-1.h>\n"
+                    "#include <lib-a/input.h>\n"
+                    "#include <lib-a/h-3.h>\n",
+                    "input.cc"));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharAngleBracketAndRegroup) {
@@ -1174,60 +1172,60 @@ TEST_F(SortIncludesTest, MainIncludeCharAngleBracketAndRegroup) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_AngleBracket;
 
-  EXPECT_EQ("#include <lib-a/input.h>\n"
-            "\n"
-            "#include <lib-a/h-1.h>\n"
-            "#include <lib-a/h-3.h>\n"
-            "\n"
-            "#include \"lib-b/input.h\"\n"
-            "#include <lib-b/h-1.h>\n"
-            "#include <lib-b/h-3.h>\n"
-            "\n"
-            "#include <lib-c/h-1.h>\n"
-            "#include <lib-c/h-2.h>\n"
-            "#include <lib-c/h-3.h>\n",
-            sort("#include <lib-c/h-1.h>\n"
-                 "#include <lib-c/h-2.h>\n"
-                 "#include <lib-c/h-3.h>\n"
-                 "#include <lib-b/h-1.h>\n"
-                 "#include \"lib-b/input.h\"\n"
-                 "#include <lib-b/h-3.h>\n"
-                 "#include <lib-a/h-1.h>\n"
-                 "#include <lib-a/input.h>\n"
-                 "#include <lib-a/h-3.h>\n",
-                 "input.cc"));
+  verifyFormat("#include <lib-a/input.h>\n"
+               "\n"
+               "#include <lib-a/h-1.h>\n"
+               "#include <lib-a/h-3.h>\n"
+               "\n"
+               "#include \"lib-b/input.h\"\n"
+               "#include <lib-b/h-1.h>\n"
+               "#include <lib-b/h-3.h>\n"
+               "\n"
+               "#include <lib-c/h-1.h>\n"
+               "#include <lib-c/h-2.h>\n"
+               "#include <lib-c/h-3.h>\n",
+               sort("#include <lib-c/h-1.h>\n"
+                    "#include <lib-c/h-2.h>\n"
+                    "#include <lib-c/h-3.h>\n"
+                    "#include <lib-b/h-1.h>\n"
+                    "#include \"lib-b/input.h\"\n"
+                    "#include <lib-b/h-3.h>\n"
+                    "#include <lib-a/h-1.h>\n"
+                    "#include <lib-a/input.h>\n"
+                    "#include <lib-a/h-3.h>\n",
+                    "input.cc"));
 }
 
 TEST_F(SortIncludesTest, DoNotRegroupGroupsInGoogleObjCStyle) {
   FmtStyle = getGoogleStyle(FormatStyle::LK_ObjC);
 
-  EXPECT_EQ("#include <a.h>\n"
-            "#include <b.h>\n"
-            "#include \"a.h\"",
-            sort("#include <b.h>\n"
-                 "#include <a.h>\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include <a.h>\n"
+               "#include <b.h>\n"
+               "#include \"a.h\"",
+               sort("#include <b.h>\n"
+                    "#include <a.h>\n"
+                    "#include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, DoNotTreatPrecompiledHeadersAsFirstBlock) {
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "#pragma hdrstop\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "#include \"b.h\"\r\n"
-                         "#include \"d.h\"\r\n"
-                         "#pragma hdrstop\r\n"
-                         "\r\n"
-                         "#include \"e.h\"\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"c.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 2));
+  StringRef Code = "#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "#pragma hdrstop\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "#include \"b.h\"\r\n"
+                       "#include \"d.h\"\r\n"
+                       "#pragma hdrstop\r\n"
+                       "\r\n"
+                       "#include \"e.h\"\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"c.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 2));
 
   Code = "#include \"d.h\"\n"
          "#include \"b.h\"\n"
@@ -1245,59 +1243,59 @@ TEST_F(SortIncludesTest, DoNotTreatPrecompiledHeadersAsFirstBlock) {
              "#include \"a.h\"\n"
              "#include \"c.h\"\n";
 
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 2));
+  verifyFormat(Expected, sort(Code, "e.cpp", 2));
 }
 
 TEST_F(SortIncludesTest, skipUTF8ByteOrderMarkMerge) {
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "\xEF\xBB\xBF#include \"e.h\"\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"b.h\"\r\n"
-                         "#include \"c.h\"\r\n"
-                         "#include \"d.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 1));
+  StringRef Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "\xEF\xBB\xBF#include \"e.h\"\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"b.h\"\r\n"
+                       "#include \"c.h\"\r\n"
+                       "#include \"d.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 1));
 }
 
 TEST_F(SortIncludesTest, skipUTF8ByteOrderMarkPreserve) {
   Style.IncludeBlocks = Style.IBS_Preserve;
-  std::string Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "\xEF\xBB\xBF#include \"b.h\"\r\n"
-                         "#include \"d.h\"\r\n"
-                         "\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"c.h\"\r\n"
-                         "#include \"e.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 2));
+  StringRef Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "\xEF\xBB\xBF#include \"b.h\"\r\n"
+                       "#include \"d.h\"\r\n"
+                       "\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"c.h\"\r\n"
+                       "#include \"e.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 2));
 }
 
 TEST_F(SortIncludesTest, MergeLines) {
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "#include \"c.h\"\r\n"
-                     "#include \"b\\\r\n"
-                     ".h\"\r\n"
-                     "#include \"a.h\"\r\n";
+  StringRef Code = "#include \"c.h\"\r\n"
+                   "#include \"b\\\r\n"
+                   ".h\"\r\n"
+                   "#include \"a.h\"\r\n";
 
-  std::string Expected = "#include \"a.h\"\r\n"
-                         "#include \"b\\\r\n"
-                         ".h\"\r\n"
-                         "#include \"c.h\"\r\n";
+  StringRef Expected = "#include \"a.h\"\r\n"
+                       "#include \"b\\\r\n"
+                       ".h\"\r\n"
+                       "#include \"c.h\"\r\n";
 
-  EXPECT_EQ(Expected, sort(Code, "a.cpp", 1));
+  verifyFormat(Expected, sort(Code, "a.cpp", 1));
 }
 
 TEST_F(SortIncludesTest, DisableFormatDisablesIncludeSorting) {
@@ -1305,154 +1303,154 @@ TEST_F(SortIncludesTest, DisableFormatDisablesIncludeSorting) {
                      "#include <b.h>\n";
   StringRef Unsorted = "#include <b.h>\n"
                        "#include <a.h>\n";
-  EXPECT_EQ(Sorted, sort(Unsorted));
+  verifyFormat(Sorted, sort(Unsorted));
   FmtStyle.DisableFormat = true;
-  EXPECT_EQ(Unsorted, sort(Unsorted, "input.cpp", 0));
+  verifyFormat(Unsorted, sort(Unsorted, "input.cpp", 0));
 }
 
 TEST_F(SortIncludesTest, DisableRawStringLiteralSorting) {
 
-  EXPECT_EQ("const char *t = R\"(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")\";",
-            sort("const char *t = R\"(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")\";",
-                 "test.cxx", 0));
-  EXPECT_EQ("const char *t = R\"x(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")x\";",
-            sort("const char *t = R\"x(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")x\";",
-                 "test.cxx", 0));
-  EXPECT_EQ("const char *t = R\"xyz(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")xyz\";",
-            sort("const char *t = R\"xyz(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")xyz\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("#include <a.h>\n"
-            "#include <b.h>\n"
-            "const char *t = R\"(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")\";\n"
-            "#include <c.h>\n"
-            "#include <d.h>\n"
-            "const char *t = R\"x(\n"
-            "#include <f.h>\n"
-            "#include <e.h>\n"
-            ")x\";\n"
-            "#include <g.h>\n"
-            "#include <h.h>\n"
-            "const char *t = R\"xyz(\n"
-            "#include <j.h>\n"
-            "#include <i.h>\n"
-            ")xyz\";\n"
-            "#include <k.h>\n"
-            "#include <l.h>",
-            sort("#include <b.h>\n"
-                 "#include <a.h>\n"
-                 "const char *t = R\"(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")\";\n"
-                 "#include <d.h>\n"
-                 "#include <c.h>\n"
-                 "const char *t = R\"x(\n"
-                 "#include <f.h>\n"
-                 "#include <e.h>\n"
-                 ")x\";\n"
-                 "#include <h.h>\n"
-                 "#include <g.h>\n"
-                 "const char *t = R\"xyz(\n"
-                 "#include <j.h>\n"
-                 "#include <i.h>\n"
-                 ")xyz\";\n"
-                 "#include <l.h>\n"
-                 "#include <k.h>",
-                 "test.cc", 4));
-
-  EXPECT_EQ("const char *t = R\"AMZ029amz(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AMZ029amz\";",
-            sort("const char *t = R\"AMZ029amz(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AMZ029amz\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"-AMZ029amz(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")-AMZ029amz\";",
-            sort("const char *t = R\"-AMZ029amz(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")-AMZ029amz\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AMZ029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AMZ029amz-\";",
-            sort("const char *t = R\"AMZ029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AMZ029amz-\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AM|029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AM|029amz-\";",
-            sort("const char *t = R\"AM|029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AM|029amz-\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AM[029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AM[029amz-\";",
-            sort("const char *t = R\"AM[029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AM[029amz-\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AM]029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AM]029amz-\";",
-            sort("const char *t = R\"AM]029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AM]029amz-\";",
-                 "test.cxx", 0));
+  verifyFormat("const char *t = R\"(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")\";",
+               sort("const char *t = R\"(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")\";",
+                    "test.cxx", 0));
+  verifyFormat("const char *t = R\"x(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")x\";",
+               sort("const char *t = R\"x(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")x\";",
+                    "test.cxx", 0));
+  verifyFormat("const char *t = R\"xyz(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")xyz\";",
+               sort("const char *t = R\"xyz(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")xyz\";",
+                    "test.cxx", 0));
+
+  verifyFormat("#include <a.h>\n"
+               "#include <b.h>\n"
+               "const char *t = R\"(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")\";\n"
+               "#include <c.h>\n"
+               "#include <d.h>\n"
+               "const char *t = R\"x(\n"
+               "#include <f.h>\n"
+               "#include <e.h>\n"
+               ")x\";\n"
+               "#include <g.h>\n"
+               "#include <h.h>\n"
+               "const char *t = R\"xyz(\n"
+               "#include <j.h>\n"
+               "#include <i.h>\n"
+               ")xyz\";\n"
+               "#include <k.h>\n"
+               "#include <l.h>",
+               sort("#include <b.h>\n"
+                    "#include <a.h>\n"
+                    "const char *t = R\"(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")\";\n"
+                    "#include <d.h>\n"
+                    "#include <c.h>\n"
+                    "const char *t = R\"x(\n"
+                    "#include <f.h>\n"
+                    "#include <e.h>\n"
+                    ")x\";\n"
+                    "#include <h.h>\n"
+                    "#include <g.h>\n"
+                    "const char *t = R\"xyz(\n"
+                    "#include <j.h>\n"
+                    "#include <i.h>\n"
+                    ")xyz\";\n"
+                    "#include <l.h>\n"
+                    "#include <k.h>",
+                    "test.cc", 4));
+
+  verifyFormat("const char *t = R\"AMZ029amz(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AMZ029amz\";",
+               sort("const char *t = R\"AMZ029amz(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AMZ029amz\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"-AMZ029amz(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")-AMZ029amz\";",
+               sort("const char *t = R\"-AMZ029amz(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")-AMZ029amz\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AMZ029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AMZ029amz-\";",
+               sort("const char *t = R\"AMZ029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AMZ029amz-\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AM|029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AM|029amz-\";",
+               sort("const char *t = R\"AM|029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AM|029amz-\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AM[029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AM[029amz-\";",
+               sort("const char *t = R\"AM[029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AM[029amz-\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AM]029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AM]029amz-\";",
+               sort("const char *t = R\"AM]029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AM]029amz-\";",
+                    "test.cxx", 0));
 
 #define X "AMZ029amz{}+!%*=_:;',.<>|/?#~-$"
 
-  EXPECT_EQ("const char *t = R\"" X "(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")" X "\";",
-            sort("const char *t = R\"" X "(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")" X "\";",
-                 "test.cxx", 0));
+  verifyFormat("const char *t = R\"" X "(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")" X "\";",
+               sort("const char *t = R\"" X "(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")" X "\";",
+                    "test.cxx", 0));
 
 #undef X
 }

>From 2f52bbeb6f6f3b7abef19cb5297773d95aa0b434 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Sun, 19 May 2024 15:20:46 -0700
Subject: [PATCH 34/44] [mlir] Use operator==(StringRef, StringRef) (NFC)
 (#92706)

---
 .../SparseTensor/IR/Detail/LvlTypeParser.cpp       | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
index 39f5cf1a75082..bb6c65a6f6ca0 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
@@ -37,7 +37,7 @@ FailureOr<uint64_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
   uint64_t properties = 0;
   SmallVector<unsigned> structured;
 
-  if (base.compare("structured") == 0) {
+  if (base == "structured") {
     ParseResult res = parser.parseCommaSeparatedList(
         mlir::OpAsmParser::Delimiter::OptionalSquare,
         [&]() -> ParseResult { return parseStructured(parser, &structured); },
@@ -60,18 +60,18 @@ FailureOr<uint64_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
   FAILURE_IF_FAILED(res)
 
   // Set the base bit for properties.
-  if (base.compare("dense") == 0) {
+  if (base == "dense") {
     properties |= static_cast<uint64_t>(LevelFormat::Dense);
-  } else if (base.compare("batch") == 0) {
+  } else if (base == "batch") {
     properties |= static_cast<uint64_t>(LevelFormat::Batch);
-  } else if (base.compare("compressed") == 0) {
+  } else if (base == "compressed") {
     properties |= static_cast<uint64_t>(LevelFormat::Compressed);
-  } else if (base.compare("structured") == 0) {
+  } else if (base == "structured") {
     properties |= static_cast<uint64_t>(LevelFormat::NOutOfM);
     properties |= nToBits(structured[0]) | mToBits(structured[1]);
-  } else if (base.compare("loose_compressed") == 0) {
+  } else if (base == "loose_compressed") {
     properties |= static_cast<uint64_t>(LevelFormat::LooseCompressed);
-  } else if (base.compare("singleton") == 0) {
+  } else if (base == "singleton") {
     properties |= static_cast<uint64_t>(LevelFormat::Singleton);
   } else {
     parser.emitError(loc, "unknown level format: ") << base;

>From 5d3f296733b66281a53dd451a983e69ae0bb482f Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl at google.com>
Date: Sun, 19 May 2024 16:33:17 -0700
Subject: [PATCH 35/44] [CallPromotionUtils]Implement conditional indirect call
 promotion with vtable-based comparison (#81378)

* Given the code sequence
   ```
   bb:
     %vtable = load ptr, ptr %d, !prof !8
     %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
     %1 = load ptr, ptr %vfn
     %call = tail call i32 %1(ptr %d), !prof !9
  ```
   The transformation looks like

   ```
   bb:
    %vtable = load ptr, ptr %d, align 8
    %vfn = getelementptr inbounds i8, ptr %vtable, i64 8  <-- Inst 1
    %func-addr = load ptr, ptr %vfn, align 8  <-- Inst 2
    # compare loaded pointers with address point of vtables
    %1 = icmp eq ptr %vtable, getelementptr inbounds (i8, ptr @_ZTV<VTable>, i32 16)
    br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect, !prof !18

  if.true.direct_targ:                              ; preds = %bb
    %2 = tail call i32 @<direct-call>(ptr nonnull %d)
    br label %if.end.icp

  if.false.orig_indirect:                           ; preds = %bb
    %call = tail call i32 %func-addr(ptr nonnull %d)
    br label %if.end.icp

  if.end.icp:                                       ; preds = %if.false.orig_indirect, %if.true.direct_targ
    %4 = phi i32 [ %call, %if.false.orig_indirect ], [ %2, %if.true.direct_targ ]

   ```
It's intentional that `Inst 1` and `Inst 2` remain in `bb` (not in
`if.false.orig_indirect`). A follow-up patch will implement code to sink
them (something like how `instcombine` would
[sink](https://github.com/llvm/llvm-project/blob/2fcfc9754a16805b81e541dc8222a8b5cf17a121/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp#L4293)
instructions along with [debug
intrinsics](https://github.com/llvm/llvm-project/blob/2fcfc9754a16805b81e541dc8222a8b5cf17a121/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp#L4356-L4368)
if possible).

* The parent patch is https://github.com/llvm/llvm-project/pull/81181
---
 .../Transforms/Utils/CallPromotionUtils.h     | 33 +++++--
 .../Transforms/Utils/CallPromotionUtils.cpp   | 32 ++++++-
 .../Utils/CallPromotionUtilsTest.cpp          | 88 +++++++++++++++++++
 3 files changed, 143 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
index fcb384ec36133..385831f457038 100644
--- a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
@@ -15,9 +15,12 @@
 #define LLVM_TRANSFORMS_UTILS_CALLPROMOTIONUTILS_H
 
 namespace llvm {
+template <typename T> class ArrayRef;
+class Constant;
 class CallBase;
 class CastInst;
 class Function;
+class Instruction;
 class MDNode;
 class Value;
 
@@ -41,7 +44,9 @@ bool isLegalToPromote(const CallBase &CB, Function *Callee,
 CallBase &promoteCall(CallBase &CB, Function *Callee,
                       CastInst **RetBitCast = nullptr);
 
-/// Promote the given indirect call site to conditionally call \p Callee.
+/// Promote the given indirect call site to conditionally call \p Callee. The
+/// promoted direct call instruction is predicated on `CB.getCalledOperand() ==
+/// Callee`.
 ///
 /// This function creates an if-then-else structure at the location of the call
 /// site. The original call site is moved into the "else" block. A clone of the
@@ -51,6 +56,22 @@ CallBase &promoteCall(CallBase &CB, Function *Callee,
 CallBase &promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
                                     MDNode *BranchWeights = nullptr);
 
+/// This is similar to `promoteCallWithIfThenElse` except that the condition to
+/// promote a virtual call is that \p VPtr is the same as any of \p
+/// AddressPoints.
+///
+/// This function is expected to be used on virtual calls (a subset of indirect
+/// calls). \p VPtr is the virtual table address stored in the objects, and
+/// \p AddressPoints contains vtable address points. A vtable address point is
+/// a location inside the vtable that's referenced by vpointer in C++ objects.
+///
+/// TODO: sink the address-calculation instructions of indirect callee to the
+/// indirect call fallback after transformation.
+CallBase &promoteCallWithVTableCmp(CallBase &CB, Instruction *VPtr,
+                                   Function *Callee,
+                                   ArrayRef<Constant *> AddressPoints,
+                                   MDNode *BranchWeights);
+
 /// Try to promote (devirtualize) a virtual call on an Alloca. Return true on
 /// success.
 ///
@@ -76,11 +97,11 @@ bool tryPromoteCall(CallBase &CB);
 
 /// Predicate and clone the given call site.
 ///
-/// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition compares the call site's called value to the given
-/// callee. The original call site is moved into the "else" block, and a clone
-/// of the call site is placed in the "then" block. The cloned instruction is
-/// returned.
+/// This function creates an if-then-else structure at the location of the
+/// call site. The "if" condition compares the call site's called value to
+/// the given callee. The original call site is moved into the "else" block,
+/// and a clone of the call site is placed in the "then" block. The cloned
+/// instruction is returned.
 CallBase &versionCallSite(CallBase &CB, Value *Callee, MDNode *BranchWeights);
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 9ca9aaf9ee9df..dda80d419999d 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -12,9 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -188,9 +190,9 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 /// Predicate and clone the given call site.
 ///
 /// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition is specified by `Cond`. The original call site is
-/// moved into the "else" block, and a clone of the call site is placed in the
-/// "then" block. The cloned instruction is returned.
+/// site. The "if" condition is specified by `Cond`.
+/// The original call site is moved into the "else" block, and a clone of the
+/// call site is placed in the "then" block. The cloned instruction is returned.
 ///
 /// For example, the call instruction below:
 ///
@@ -518,7 +520,8 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CB.getIterator());
+      auto *Cast =
+          CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CB.getIterator());
       CB.setArgOperand(ArgNo, Cast);
 
       // Remove any incompatible attributes for the argument.
@@ -568,6 +571,27 @@ CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
   return promoteCall(NewInst, Callee);
 }
 
+CallBase &llvm::promoteCallWithVTableCmp(CallBase &CB, Instruction *VPtr,
+                                         Function *Callee,
+                                         ArrayRef<Constant *> AddressPoints,
+                                         MDNode *BranchWeights) {
+  assert(!AddressPoints.empty() && "Caller should guarantee");
+  IRBuilder<> Builder(&CB);
+  SmallVector<Value *, 2> ICmps;
+  for (auto &AddressPoint : AddressPoints)
+    ICmps.push_back(Builder.CreateICmpEQ(VPtr, AddressPoint));
+
+  // TODO: Perform tree height reduction if the number of ICmps is high.
+  Value *Cond = Builder.CreateOr(ICmps);
+
+  // Version the indirect call site. If Cond is true, 'NewInst' will be
+  // executed, otherwise the original call site will be executed.
+  CallBase &NewInst = versionCallSiteWithCond(CB, Cond, BranchWeights);
+
+  // Promote 'NewInst' so that it directly calls the desired function.
+  return promoteCall(NewInst, Callee);
+}
+
 bool llvm::tryPromoteCall(CallBase &CB) {
   assert(!CB.getCalledFunction());
   Module *M = CB.getCaller()->getParent();
diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
index 0e9641c5846f3..2d457eb3b678a 100644
--- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
@@ -8,9 +8,12 @@
 
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
@@ -24,6 +27,21 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
   return Mod;
 }
 
+// Returns a constant representing the vtable's address point specified by the
+// offset.
+static Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
+                                             uint32_t AddressPointOffset) {
+  Module &M = *VTable->getParent();
+  LLVMContext &Context = M.getContext();
+  assert(AddressPointOffset <
+             M.getDataLayout().getTypeAllocSize(VTable->getValueType()) &&
+         "Out-of-bound access");
+
+  return ConstantExpr::getInBoundsGetElementPtr(
+      Type::getInt8Ty(Context), VTable,
+      llvm::ConstantInt::get(Type::getInt32Ty(Context), AddressPointOffset));
+}
+
 TEST(CallPromotionUtilsTest, TryPromoteCall) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C,
@@ -368,3 +386,73 @@ declare %struct2 @_ZN4Impl3RunEv(%class.Impl* %this)
   bool IsPromoted = tryPromoteCall(*CI);
   EXPECT_FALSE(IsPromoted);
 }
+
+TEST(CallPromotionUtilsTest, promoteCallWithVTableCmp) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C,
+                                      R"IR(
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV5Base1 = constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }, !type !0
+ at _ZTV8Derived1 = constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev], [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base25func2Ev] }, !type !0, !type !1, !type !2
+ at _ZTV8Derived2 = constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base35func3Ev], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base25func2Ev], [4 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }, !type !3, !type !4, !type !5, !type !6
+
+define i32 @testfunc(ptr %d) {
+entry:
+  %vtable = load ptr, ptr %d, !prof !7
+  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  %0 = load ptr, ptr %vfn
+  %call = tail call i32 %0(ptr %d), !prof !8
+  ret i32 %call
+}
+
+define i32 @_ZN5Base15func1Ev(ptr %this) {
+entry:
+  ret i32 2
+}
+
+declare i32 @_ZN5Base25func2Ev(ptr)
+declare i32 @_ZN5Base15func0Ev(ptr)
+declare void @_ZN5Base35func3Ev(ptr)
+
+!0 = !{i64 16, !"_ZTS5Base1"}
+!1 = !{i64 48, !"_ZTS5Base2"}
+!2 = !{i64 16, !"_ZTS8Derived1"}
+!3 = !{i64 64, !"_ZTS5Base1"}
+!4 = !{i64 40, !"_ZTS5Base2"}
+!5 = !{i64 16, !"_ZTS5Base3"}
+!6 = !{i64 16, !"_ZTS8Derived2"}
+!7 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 800, i64 5035968517245772950, i64 500, i64 3215870116411581797, i64 300}
+!8 = !{!"VP", i32 0, i64 1600, i64 6804820478065511155, i64 1600})IR");
+
+  Function *F = M->getFunction("testfunc");
+  CallInst *CI = dyn_cast<CallInst>(&*std::next(F->front().rbegin()));
+  ASSERT_TRUE(CI && CI->isIndirectCall());
+
+  // Create the constant and the branch weights
+  SmallVector<Constant *, 3> VTableAddressPoints;
+
+  for (auto &[VTableName, AddressPointOffset] : {std::pair{"_ZTV5Base1", 16},
+                                                 {"_ZTV8Derived1", 16},
+                                                 {"_ZTV8Derived2", 64}})
+    VTableAddressPoints.push_back(getVTableAddressPointOffset(
+        M->getGlobalVariable(VTableName), AddressPointOffset));
+
+  MDBuilder MDB(C);
+  MDNode *BranchWeights = MDB.createBranchWeights(1600, 0);
+
+  size_t OrigEntryBBSize = F->front().size();
+
+  LoadInst *VPtr = dyn_cast<LoadInst>(&*F->front().begin());
+
+  Function *Callee = M->getFunction("_ZN5Base15func1Ev");
+  // Tests that promoted direct call is returned.
+  CallBase &DirectCB = promoteCallWithVTableCmp(
+      *CI, VPtr, Callee, VTableAddressPoints, BranchWeights);
+  EXPECT_EQ(DirectCB.getCalledOperand(), Callee);
+
+  // Promotion inserts 3 icmp instructions and 2 or instructions, and removes
+  // 1 call instruction from the entry block.
+  EXPECT_EQ(F->front().size(), OrigEntryBBSize + 4);
+}

>From d102ee63e849cdaa586fd1aaae900c1399bf2b76 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Sun, 19 May 2024 16:51:07 -0700
Subject: [PATCH 36/44] [clang] Use operator==(StringRef, StringRef) (NFC)
 (#92708)

---
 clang-tools-extra/modularize/ModularizeUtilities.cpp | 6 ++----
 clang/lib/Driver/ToolChains/Clang.cpp                | 2 +-
 clang/utils/TableGen/ClangAttrEmitter.cpp            | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index 53e8a49d1a548..b202b3aae8f8a 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -435,11 +435,9 @@ static std::string replaceDotDot(StringRef Path) {
   llvm::sys::path::const_iterator B = llvm::sys::path::begin(Path),
     E = llvm::sys::path::end(Path);
   while (B != E) {
-    if (B->compare(".") == 0) {
-    }
-    else if (B->compare("..") == 0)
+    if (*B == "..")
       llvm::sys::path::remove_filename(Buffer);
-    else
+    else if (*B != ".")
       llvm::sys::path::append(Buffer, *B);
     ++B;
   }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c3e6d563f3bd2..6d2015b2cd156 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1522,7 +1522,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
       auto isPAuthLR = [](const char *member) {
         llvm::AArch64::ExtensionInfo pauthlr_extension =
             llvm::AArch64::getExtensionByID(llvm::AArch64::AEK_PAUTHLR);
-        return (pauthlr_extension.Feature.compare(member) == 0);
+        return pauthlr_extension.Feature == member;
       };
 
       if (std::any_of(CmdArgs.begin(), CmdArgs.end(), isPAuthLR))
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index aafbf1f40949a..ca7630adfbb7b 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -1845,7 +1845,7 @@ static LateAttrParseKind getLateAttrParseKind(const Record *Attr) {
     PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) +
                               "`should only have one super class");
 
-  if (SuperClasses[0]->getName().compare(LateAttrParseKindStr) != 0)
+  if (SuperClasses[0]->getName() != LateAttrParseKindStr)
     PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) +
                               "`should only have type `" +
                               llvm::Twine(LateAttrParseKindStr) +

>From 0bced10f290bb96d675874a89f1b6789a2384e30 Mon Sep 17 00:00:00 2001
From: Freddy Ye <freddy.ye at intel.com>
Date: Mon, 20 May 2024 08:53:21 +0800
Subject: [PATCH 37/44] [SDAG][X86] Extend SplitVecOp_VSETCC for STRICT_FSETCC.
 (#92509)

---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 19 +++++++--
 .../CodeGen/X86/vec-strict-cmp-512-skx.ll     | 40 +++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index cd858003cf03b..dca5a481fbd0e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3033,6 +3033,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
                        "operand!\n");
 
   case ISD::VP_SETCC:
+  case ISD::STRICT_FSETCC:
   case ISD::SETCC:             Res = SplitVecOp_VSETCC(N); break;
   case ISD::BITCAST:           Res = SplitVecOp_BITCAST(N); break;
   case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
@@ -3997,14 +3998,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
+  bool isStrict = N->getOpcode() == ISD::STRICT_FSETCC;
   assert(N->getValueType(0).isVector() &&
-         N->getOperand(0).getValueType().isVector() &&
+         N->getOperand(isStrict ? 1 : 0).getValueType().isVector() &&
          "Operand types must be vectors");
   // The result has a legal vector type, but the input needs splitting.
   SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
   SDLoc DL(N);
-  GetSplitVector(N->getOperand(0), Lo0, Hi0);
-  GetSplitVector(N->getOperand(1), Lo1, Hi1);
+  GetSplitVector(N->getOperand(isStrict ? 1 : 0), Lo0, Hi0);
+  GetSplitVector(N->getOperand(isStrict ? 2 : 1), Lo1, Hi1);
+
   auto PartEltCnt = Lo0.getValueType().getVectorElementCount();
 
   LLVMContext &Context = *DAG.getContext();
@@ -4014,6 +4017,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
   if (N->getOpcode() == ISD::SETCC) {
     LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
     HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
+  } else if (N->getOpcode() == ISD::STRICT_FSETCC) {
+    LoRes = DAG.getNode(ISD::STRICT_FSETCC, DL,
+                        DAG.getVTList(PartResVT, N->getValueType(1)),
+                        N->getOperand(0), Lo0, Lo1, N->getOperand(3));
+    HiRes = DAG.getNode(ISD::STRICT_FSETCC, DL,
+                        DAG.getVTList(PartResVT, N->getValueType(1)),
+                        N->getOperand(0), Hi0, Hi1, N->getOperand(3));
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                   LoRes.getValue(1), HiRes.getValue(1));
+    ReplaceValueWith(SDValue(N, 1), NewChain);
   } else {
     assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode");
     SDValue MaskLo, MaskHi, EVLLo, EVLHi;
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll
new file mode 100644
index 0000000000000..3028b74967378
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -mcpu=skx | FileCheck %s --check-prefixes=SKX
+
+;; Test no crash for AVX512 targets without prefer-vector-width=512.
+
+define <16 x i32> @test_v16f32_oeq_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
+; SKX-LABEL: test_v16f32_oeq_q:
+; SKX:       # %bb.0:
+; SKX-NEXT:    vcmpeqps %ymm7, %ymm5, %k1
+; SKX-NEXT:    vcmpeqps %ymm6, %ymm4, %k2
+; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k2}
+; SKX-NEXT:    vpblendmd %ymm1, %ymm3, %ymm1 {%k1}
+; SKX-NEXT:    retq
+  %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
+                                               <16 x float> %f1, <16 x float> %f2, metadata !"oeq",
+                                               metadata !"fpexcept.strict") #0
+  %res = select <16 x i1> %cond, <16 x i32> %a, <16 x i32> %b
+  ret <16 x i32> %res
+}
+
+define <8 x i32> @test_v8f64_oeq_q(<8 x i32> %a, <8 x i32> %b, <8 x double> %f1, <8 x double> %f2) #0 {
+; SKX-LABEL: test_v8f64_oeq_q:
+; SKX:       # %bb.0:
+; SKX-NEXT:    vcmpeqpd %ymm4, %ymm2, %k0
+; SKX-NEXT:    vcmpeqpd %ymm5, %ymm3, %k1
+; SKX-NEXT:    kshiftlb $4, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k1
+; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT:    retq
+  %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
+                                               <8 x double> %f1, <8 x double> %f2, metadata !"oeq",
+                                               metadata !"fpexcept.strict") #0
+  %res = select <8 x i1> %cond, <8 x i32> %a, <8 x i32> %b
+  ret <8 x i32> %res
+}
+
+declare <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float>, <16 x float>, metadata, metadata)
+declare <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double>, <8 x double>, metadata, metadata)
+
+attributes #0 = { nounwind strictfp "min-legal-vector-width"="0" }

>From 89d0937348ebd4b55f17d503910be9300aa44a13 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Sun, 19 May 2024 18:17:53 -0700
Subject: [PATCH 38/44] [llvm] Use StringRef::contains (NFC) (#92710)

---
 llvm/lib/IR/Mangler.cpp                 | 2 +-
 llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 2 +-
 llvm/lib/TextAPI/Utils.cpp              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp
index 72e2bc1f24ac9..019fe844e286c 100644
--- a/llvm/lib/IR/Mangler.cpp
+++ b/llvm/lib/IR/Mangler.cpp
@@ -292,7 +292,7 @@ void llvm::emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV,
 
 std::optional<std::string> llvm::getArm64ECMangledFunctionName(StringRef Name) {
   bool IsCppFn = Name[0] == '?';
-  if (IsCppFn && Name.find("$$h") != std::string::npos)
+  if (IsCppFn && Name.contains("$$h"))
     return std::nullopt;
   if (!IsCppFn && Name[0] == '#')
     return std::nullopt;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 32de8b9587b46..9fde26c900f51 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1886,7 +1886,7 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call,
   // Local sizes arguments: Sizes of block invoke arguments. Clang generates
   // local size operands as an array, so we need to unpack them.
   SmallVector<Register, 16> LocalSizes;
-  if (Call->Builtin->Name.find("_varargs") != StringRef::npos || IsSpirvOp) {
+  if (Call->Builtin->Name.contains("_varargs") || IsSpirvOp) {
     const unsigned LocalSizeArrayIdx = HasEvents ? 9 : 6;
     Register GepReg = Call->Arguments[LocalSizeArrayIdx];
     MachineInstr *GepMI = MRI->getUniqueVRegDef(GepReg);
diff --git a/llvm/lib/TextAPI/Utils.cpp b/llvm/lib/TextAPI/Utils.cpp
index 08f14f65177ed..01021e3a264dd 100644
--- a/llvm/lib/TextAPI/Utils.cpp
+++ b/llvm/lib/TextAPI/Utils.cpp
@@ -184,7 +184,7 @@ llvm::Expected<Regex> llvm::MachO::createRegexFromGlob(StringRef Glob) {
       break;
     }
     default:
-      if (RegexMetachars.find(C) != StringRef::npos)
+      if (RegexMetachars.contains(C))
         RegexString.push_back('\\');
       RegexString.push_back(C);
     }

>From fc0144a30cf20d6405411da141d11bfde143d3d2 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd at linux.alibaba.com>
Date: Mon, 20 May 2024 10:36:03 +0800
Subject: [PATCH 39/44] [Serialization] Read the initializer for interesting
 static variables before consuming it (#92353)

Close https://github.com/llvm/llvm-project/issues/91418

Since we load the variable's initializers lazily, it'd be problematic if
the initializers depend on each other. So here we try to load the
initializers of static variables to make sure they are passed to the code
generator in order. If we read anything interesting, we would consume
that before emitting the current declaration.
---
 clang/lib/Serialization/ASTReaderDecl.cpp    |  29 ++-
 clang/test/Modules/pr91418.cppm              |  65 +++++
 clang/test/OpenMP/nvptx_lambda_capturing.cpp | 246 +++++++++----------
 3 files changed, 214 insertions(+), 126 deletions(-)
 create mode 100644 clang/test/Modules/pr91418.cppm

diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 0c647086e304a..a6254b70560c3 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -4186,12 +4186,35 @@ void ASTReader::PassInterestingDeclsToConsumer() {
     GetDecl(ID);
   EagerlyDeserializedDecls.clear();
 
-  while (!PotentiallyInterestingDecls.empty()) {
-    Decl *D = PotentiallyInterestingDecls.front();
-    PotentiallyInterestingDecls.pop_front();
+  auto ConsumingPotentialInterestingDecls = [this]() {
+    while (!PotentiallyInterestingDecls.empty()) {
+      Decl *D = PotentiallyInterestingDecls.front();
+      PotentiallyInterestingDecls.pop_front();
+      if (isConsumerInterestedIn(D))
+        PassInterestingDeclToConsumer(D);
+    }
+  };
+  std::deque<Decl *> MaybeInterestingDecls =
+      std::move(PotentiallyInterestingDecls);
+  assert(PotentiallyInterestingDecls.empty());
+  while (!MaybeInterestingDecls.empty()) {
+    Decl *D = MaybeInterestingDecls.front();
+    MaybeInterestingDecls.pop_front();
+    // Since we load the variable's initializers lazily, it'd be problematic
+    // if the initializers depend on each other. So here we try to load the
+    // initializers of static variables to make sure they are passed to the code
+    // generator in order. If we read anything interesting, we would consume
+    // that before emitting the current declaration.
+    if (auto *VD = dyn_cast<VarDecl>(D);
+        VD && VD->isFileVarDecl() && !VD->isExternallyVisible())
+      VD->getInit();
+    ConsumingPotentialInterestingDecls();
     if (isConsumerInterestedIn(D))
       PassInterestingDeclToConsumer(D);
   }
+
+  // If the last call added new potentially interesting decls, consume them.
+  ConsumingPotentialInterestingDecls();
 }
 
 void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) {
diff --git a/clang/test/Modules/pr91418.cppm b/clang/test/Modules/pr91418.cppm
new file mode 100644
index 0000000000000..b507df162643b
--- /dev/null
+++ b/clang/test/Modules/pr91418.cppm
@@ -0,0 +1,65 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 -x c++-header %t/foo.h \
+// RUN:     -emit-pch -o %t/foo.pch
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/use.cpp -include-pch \
+// RUN:     %t/foo.pch -emit-llvm -o - | FileCheck %t/use.cpp
+
+//--- foo.h
+#ifndef FOO_H
+#define FOO_H
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+
+static __inline__ __m128 __attribute__((__always_inline__, __min_vector_width__(128)))
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return __extension__ (__m128){ __z, __y, __x, __w };
+}
+
+typedef __m128 VR;
+
+inline VR MakeVR( float X, float Y, float Z, float W )
+{
+ return _mm_setr_ps( X, Y, Z, W );
+}
+
+extern "C" float sqrtf(float);
+
+namespace VectorSinConstantsSSE
+{
+  float a = (16 * sqrtf(0.225f));
+  VR A = MakeVR(a, a, a, a);
+  static const float b = (16 * sqrtf(0.225f));
+  static const VR B = MakeVR(b, b, b, b);
+}
+
+#endif // FOO_H
+
+//--- use.cpp
+#include "foo.h"
+float use() {
+    return VectorSinConstantsSSE::A[0] + VectorSinConstantsSSE::A[1] +
+           VectorSinConstantsSSE::A[2] + VectorSinConstantsSSE::A[3] +
+           VectorSinConstantsSSE::B[0] + VectorSinConstantsSSE::B[1] +
+           VectorSinConstantsSSE::B[2] + VectorSinConstantsSSE::B[3];
+}
+
+// CHECK: define{{.*}}@__cxx_global_var_init(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSE1aE
+
+// CHECK: define{{.*}}@__cxx_global_var_init.1(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSE1AE
+
+// CHECK: define{{.*}}@__cxx_global_var_init.2(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSEL1BE
+
+// CHECK: define{{.*}}@__cxx_global_var_init.3(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSEL1bE
+
+// CHECK: @_GLOBAL__sub_I_use.cpp
+// CHECK: call{{.*}}@__cxx_global_var_init(
+// CHECK: call{{.*}}@__cxx_global_var_init.1(
+// CHECK: call{{.*}}@__cxx_global_var_init.3(
+// CHECK: call{{.*}}@__cxx_global_var_init.2(
diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
index 641fbc38dd6bc..efea8d4a05618 100644
--- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp
+++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
@@ -1165,8 +1165,113 @@ int main(int argc, char **argv) {
 // CHECK2-NEXT:    ret void
 //
 //
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_kernel_environment, ptr [[DYN_PTR]])
+// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3:       user_code.entry:
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP3]], i64 8, i1 false)
+// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP6]]) #[[ATTR7:[0-9]+]]
+// CHECK3-NEXT:    call void @__kmpc_target_deinit()
+// CHECK3-NEXT:    ret void
+// CHECK3:       worker.exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv
+// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    ret i32 [[TMP2]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8
+// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_kernel_environment, ptr [[DYN_PTR]])
+// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3:       user_code.entry:
+// CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK3-NEXT:    store ptr [[TMP4]], ptr [[TMP6]], align 8
+// CHECK3-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 2)
+// CHECK3-NEXT:    call void @__kmpc_target_deinit()
+// CHECK3-NEXT:    ret void
+// CHECK3:       worker.exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP2]], i64 8, i1 false)
+// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP4]], align 8
+// CHECK3-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR7]]
+// CHECK3-NEXT:    ret void
+//
+//
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR0]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
@@ -1178,7 +1283,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
 // CHECK3-NEXT:    [[_TMP4:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[B5:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[_TMP6:%.*]] = alloca ptr, align 8
@@ -1214,20 +1319,20 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    store i32 [[TMP9]], ptr [[C7]], align 4
 // CHECK3-NEXT:    store ptr [[C7]], ptr [[_TMP8]], align 8
 // CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[_TMP4]], align 8
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 0
 // CHECK3-NEXT:    store ptr [[ARGC_ADDR]], ptr [[TMP11]], align 8
-// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8
 // CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP12]], align 8
-// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 2
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[_TMP8]], align 8
 // CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
-// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 3
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[D_ADDR]], ptr [[TMP16]], align 8
-// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 4
 // CHECK3-NEXT:    store ptr [[TMP2]], ptr [[TMP17]], align 8
 // CHECK3-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[_TMP4]], align 8
-// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP18]]) #[[ATTR7:[0-9]+]]
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP18]]) #[[ATTR7]]
 // CHECK3-NEXT:    call void @__kmpc_target_deinit()
 // CHECK3-NEXT:    ret void
 // CHECK3:       worker.exit:
@@ -1235,7 +1340,7 @@ int main(int argc, char **argv) {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR3]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
@@ -1267,7 +1372,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP5]], -1
 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK3:       user_code.entry:
-// CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
 // CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 8
 // CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8
@@ -1292,7 +1397,7 @@ int main(int argc, char **argv) {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43_omp_outlined
-// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR4]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1305,7 +1410,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
 // CHECK3-NEXT:    [[_TMP4:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[ARGC5:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[B6:%.*]] = alloca i32, align 4
@@ -1345,128 +1450,23 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4
 // CHECK3-NEXT:    store i32 [[TMP11]], ptr [[A10]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[_TMP4]], align 8
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 0
 // CHECK3-NEXT:    store ptr [[ARGC5]], ptr [[TMP13]], align 8
-// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8
 // CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
-// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 2
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[_TMP9]], align 8
 // CHECK3-NEXT:    store ptr [[TMP17]], ptr [[TMP16]], align 8
-// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 3
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[D_ADDR]], ptr [[TMP18]], align 8
-// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 4
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 4
 // CHECK3-NEXT:    store ptr [[A10]], ptr [[TMP19]], align 8
 // CHECK3-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[_TMP4]], align 8
 // CHECK3-NEXT:    [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP20]]) #[[ATTR7]]
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
-// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_kernel_environment, ptr [[DYN_PTR]])
-// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
-// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK3:       user_code.entry:
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8
-// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP3]], i64 8, i1 false)
-// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP4]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
-// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP6]]) #[[ATTR7]]
-// CHECK3-NEXT:    call void @__kmpc_target_deinit()
-// CHECK3-NEXT:    ret void
-// CHECK3:       worker.exit:
-// CHECK3-NEXT:    ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv
-// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_1:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8
-// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP1]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
-// CHECK3-NEXT:    ret i32 [[TMP2]]
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR3]] {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8
-// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_kernel_environment, ptr [[DYN_PTR]])
-// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
-// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK3:       user_code.entry:
-// CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
-// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT:    store ptr [[TMP4]], ptr [[TMP6]], align 8
-// CHECK3-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 2)
-// CHECK3-NEXT:    call void @__kmpc_target_deinit()
-// CHECK3-NEXT:    ret void
-// CHECK3:       worker.exit:
-// CHECK3-NEXT:    ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined
-// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR4]] {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
-// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8
-// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP2]], i64 8, i1 false)
-// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP3]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP4]], align 8
-// CHECK3-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR7]]
-// CHECK3-NEXT:    ret void
-//
-//
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18
 // CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR3]] {
 // CHECK3-NEXT:  entry:
@@ -1500,7 +1500,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[T_ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[T1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
+// CHECK3-NEXT:    [[T1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
 // CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
 // CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8

>From 91423d71938d7a1dba27188e6d854148a750a3dd Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Sun, 19 May 2024 20:15:31 -0700
Subject: [PATCH 40/44] [BOLT][NFC] Don't assign YAML profile to functions with
 no CFG (#92487)

YAML profile for non-simple functions without CFG
  1) is useless for optimizations,
  2) can't be attached, similar to fdata profile,
  3) would be reported as invalid/stale even if the profile is valid.

Don't attempt to attach the profile in this case, aligning the behavior
to DataReader.

Test Plan: added yaml-non-simple.test
---
 bolt/lib/Profile/YAMLProfileReader.cpp |  3 ++
 bolt/test/X86/yaml-non-simple.test     | 71 ++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 bolt/test/X86/yaml-non-simple.test

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 978a7cadfe798..29d94067f459f 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -99,6 +99,9 @@ bool YAMLProfileReader::parseFunctionProfile(
       FuncRawBranchCount += YamlSI.Count;
   BF.setRawBranchCount(FuncRawBranchCount);
 
+  if (BF.empty())
+    return true;
+
   if (!opts::IgnoreHash &&
       YamlBF.Hash != BF.computeHash(IsDFSOrder, HashFunction)) {
     if (opts::Verbosity >= 1)
diff --git a/bolt/test/X86/yaml-non-simple.test b/bolt/test/X86/yaml-non-simple.test
new file mode 100644
index 0000000000000..fef98f692a710
--- /dev/null
+++ b/bolt/test/X86/yaml-non-simple.test
@@ -0,0 +1,71 @@
+## Check that YAML profile for non-simple function is not reported as stale.
+
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -nostdlib
+# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml --profile-ignore-hash -v=1 \
+# RUN:   --report-stale 2>&1 | FileCheck %s
+
+# CHECK: BOLT-INFO: could not disassemble function main. Will ignore.
+# CHECK: BOLT-INFO: could not disassemble function main.cold. Will ignore.
+# CHECK: BOLT-INFO: 0 out of 2 functions in the binary (0.0%) have non-empty execution profile
+# CHECK: BOLT-INFO: 1 function with profile could not be optimized
+
+#--- main.s
+.globl main
+.type	main, @function
+main:
+  .cfi_startproc
+.LBB00:
+  pushq   %rbp
+  movq    %rsp, %rbp
+  subq    $16, %rsp
+  testq   %rax, %rax
+  js      .LBB03
+.LBB01:
+  jne     .LBB04
+.LBB02:
+  nop
+.LBB03:
+  xorl    %eax, %eax
+  addq    $16, %rsp
+  popq    %rbp
+  retq
+.LBB04:
+  xorl    %eax, %eax
+  addq    $16, %rsp
+  popq    %rbp
+  retq
+  .cfi_endproc
+  .size	main, .-main
+
+.globl main.cold
+.type	main.cold, @function
+main.cold:
+  .cfi_startproc
+  nop
+  .cfi_endproc
+  .size	main.cold, .-main.cold
+
+#--- yaml
+---
+header:
+  profile-version: 1
+  binary-name:     'yaml-non-simple.s.tmp.exe'
+  binary-build-id: '<unknown>'
+  profile-flags:   [ lbr ]
+  profile-origin:  branch profile reader
+  profile-events:  ''
+  dfs-order:       false
+  hash-func:       xxh3
+functions:
+  - name:            main
+    fid:             0
+    hash:            0x0000000000000000
+    exec:            1
+    nblocks:         5
+    blocks:
+      - bid:             1
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+...

>From 6bf1601a0d9a01fe663442096466d46800483e0c Mon Sep 17 00:00:00 2001
From: Monad <yanwqmonad at gmail.com>
Date: Mon, 20 May 2024 12:20:47 +0800
Subject: [PATCH 41/44] [InstCombine] Fold pointer adding in integer to
 arithmetic add (#91596)

Fold
``` llvm
define i32 @src(i32 %x, i32 %y) {
  %base = inttoptr i32 %x to ptr
  %ptr = getelementptr inbounds i8, ptr %base, i32 %y
  %r = ptrtoint ptr %ptr to i32
  ret i32 %r
}
```
where both `%base` and `%ptr` have only one use, to
``` llvm
define i32 @tgt(i32 %x, i32 %y) {
  %r = add i32 %x, %y
  ret i32 %r
}
```

The `add` can be `nuw` if the GEP is `inbounds` and the offset is
non-negative. The relevant Alive2 proof is
https://alive2.llvm.org/ce/z/nP3RWy.

### Motivation

It seems unnecessary to convert `int` to `ptr` just to get its offset.
In most cases, they generate the same assembly, but sometimes it may
miss some optimizations since the analysis of `GEP` is not as thorough as
that of arithmetic operations. One example is


https://github.com/dtcxzyw/llvm-opt-benchmark/blob/e3c822bf41df3a88ca38eba884a52b0cc7e70bf2/bench/protobuf/optimized/generated_message_reflection.cc.ll#L39860-L39873

``` llvm
  %conv.i188 = zext i32 %145 to i64
  %add.i189 = add i64 %conv.i188, %125
  %146 = load i16, ptr %num_aux_entries10.i, align 2
  %conv2.i191 = zext i16 %146 to i64
  %mul.i192 = shl nuw nsw i64 %conv2.i191, 3
  %add3.i193 = add i64 %add.i189, %mul.i192
  %147 = inttoptr i64 %add3.i193 to ptr
  %sub.ptr.lhs.cast.i195 = ptrtoint ptr %144 to i64
  %sub.ptr.rhs.cast.i196 = ptrtoint ptr %143 to i64
  %sub.ptr.sub.i197 = sub i64 %sub.ptr.lhs.cast.i195, %sub.ptr.rhs.cast.i196
  %add.ptr = getelementptr inbounds i8, ptr %147, i64 %sub.ptr.sub.i197
  %sub.ptr.lhs.cast = ptrtoint ptr %add.ptr to i64
  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %125
```

where `%conv.i188` first adds `%125` and then subtracts `%125` (the
result is `%sub.ptr.sub`), which can be optimized.
---
 .../InstCombine/InstCombineCasts.cpp          |  20 ++-
 llvm/test/Transforms/InstCombine/cast_ptr.ll  | 151 ++++++++++++++++++
 2 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 11e31877de38c..1b4c319032cab 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2049,16 +2049,28 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
       Mask->getType() == Ty)
     return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
 
-  if (auto *GEP = dyn_cast<GetElementPtrInst>(SrcOp)) {
+  if (auto *GEP = dyn_cast<GEPOperator>(SrcOp)) {
     // Fold ptrtoint(gep null, x) to multiply + constant if the GEP has one use.
     // While this can increase the number of instructions it doesn't actually
     // increase the overall complexity since the arithmetic is just part of
     // the GEP otherwise.
     if (GEP->hasOneUse() &&
         isa<ConstantPointerNull>(GEP->getPointerOperand())) {
-      return replaceInstUsesWith(
-          CI, Builder.CreateIntCast(EmitGEPOffset(cast<GEPOperator>(GEP)), Ty,
-                                    /*isSigned=*/false));
+      return replaceInstUsesWith(CI,
+                                 Builder.CreateIntCast(EmitGEPOffset(GEP), Ty,
+                                                       /*isSigned=*/false));
+    }
+
+    // (ptrtoint (gep (inttoptr Base), ...)) -> Base + Offset
+    Value *Base;
+    if (GEP->hasOneUse() &&
+        match(GEP->getPointerOperand(), m_OneUse(m_IntToPtr(m_Value(Base)))) &&
+        Base->getType() == Ty) {
+      Value *Offset = EmitGEPOffset(GEP);
+      auto *NewOp = BinaryOperator::CreateAdd(Base, Offset);
+      if (GEP->isInBounds() && isKnownNonNegative(Offset, SQ))
+        NewOp->setHasNoUnsignedWrap(true);
+      return NewOp;
     }
   }
 
diff --git a/llvm/test/Transforms/InstCombine/cast_ptr.ll b/llvm/test/Transforms/InstCombine/cast_ptr.ll
index 5c6c012064e05..786ea876ddea7 100644
--- a/llvm/test/Transforms/InstCombine/cast_ptr.ll
+++ b/llvm/test/Transforms/InstCombine/cast_ptr.ll
@@ -244,3 +244,154 @@ define <2 x i32> @insertelt_extra_use2(<2 x i32> %x, ptr %p) {
   %r = ptrtoint <2 x ptr> %i to <2 x i32>
   ret <2 x i32> %r
 }
+
+define i32 @ptr_add_in_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_2(
+; CHECK-NEXT:    [[P2_IDX:%.*]] = shl nsw i32 [[Y:%.*]], 2
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[P2_IDX]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i32, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_nneg(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_nneg(
+; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.abs.i32(i32 [[Y:%.*]], i1 true)
+; CHECK-NEXT:    [[R:%.*]] = add nuw i32 [[Z]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.abs.i32(i32 %y, i1 true)
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 %z
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i64 @ptr_add_in_int_different_type_1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i64
+  ret i64 %r
+}
+
+define i16 @ptr_add_in_int_different_type_2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i16
+  ret i16 %r
+}
+
+define i32 @ptr_add_in_int_different_type_3(i16 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i16 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_different_type_4(i64 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i64 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_not_inbounds(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_not_inbounds(
+; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.abs.i32(i32 [[Y:%.*]], i1 true)
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Z]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.abs.i32(i32 %y, i1 true)
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %z
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_const(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_const(
+; CHECK-NEXT:    [[R:%.*]] = add nuw i32 [[X:%.*]], 4096
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 4096
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_const_negative(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_const_negative(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X:%.*]], -4096
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 -4096
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+declare void @use_ptr(ptr)
+
+define i32 @ptr_add_in_int_extra_use1(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_extra_use1(
+; CHECK-NEXT:    [[PTR:%.*]] = inttoptr i32 [[X:%.*]] to ptr
+; CHECK-NEXT:    call void @use_ptr(ptr [[PTR]])
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 4096
+; CHECK-NEXT:    [[R:%.*]] = ptrtoint ptr [[P2]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  call void @use_ptr(ptr %ptr)
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 4096
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_extra_use2(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_extra_use2(
+; CHECK-NEXT:    [[PTR:%.*]] = inttoptr i32 [[X:%.*]] to ptr
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 4096
+; CHECK-NEXT:    call void @use_ptr(ptr nonnull [[P2]])
+; CHECK-NEXT:    [[R:%.*]] = ptrtoint ptr [[P2]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 4096
+  call void @use_ptr(ptr %p2)
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}

>From ebbbc73667a68dcfbe09392a1d34050592b234fd Mon Sep 17 00:00:00 2001
From: Chaitanya <Krishna.Sankisa at amd.com>
Date: Mon, 20 May 2024 10:24:40 +0530
Subject: [PATCH 42/44] [AMDGPU] Use removeFnAttrFromReachable in
 lower-module-lds pass. (#92686)

---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       | 44 +------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 2c7163a775372..625ac0230f160 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -862,48 +862,6 @@ class AMDGPULowerModuleLDS {
     return N;
   }
 
-  /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
-  /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
-  /// the lack of llvm.amdgcn.lds.kernel.id calls.
-  void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
-    KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");
-
-    SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
-    SmallPtrSet<Function *, 8> Visited;
-    bool SeenUnknownCall = false;
-
-    while (!WorkList.empty()) {
-      Function *F = WorkList.pop_back_val();
-
-      for (auto &CallRecord : *CG[F]) {
-        if (!CallRecord.second)
-          continue;
-
-        Function *Callee = CallRecord.second->getFunction();
-        if (!Callee) {
-          if (!SeenUnknownCall) {
-            SeenUnknownCall = true;
-
-            // If we see any indirect calls, assume nothing about potential
-            // targets.
-            // TODO: This could be refined to possible LDS global users.
-            for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
-              Function *PotentialCallee =
-                  ExternalCallRecord.second->getFunction();
-              assert(PotentialCallee);
-              if (!isKernelLDS(PotentialCallee))
-                PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
-            }
-          }
-        } else {
-          Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
-          if (Visited.insert(Callee).second)
-            WorkList.push_back(Callee);
-        }
-      }
-    }
-  }
-
   DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
       Module &M, LDSUsesInfoTy &LDSUsesInfo,
       DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
@@ -1059,7 +1017,7 @@ class AMDGPULowerModuleLDS {
       //
       // TODO: We could filter out subgraphs that do not access LDS globals.
       for (Function *F : KernelsThatAllocateTableLDS)
-        removeNoLdsKernelIdFromReachable(CG, F);
+        removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id");
     }
 
     DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =

>From f6527774569790b5a5236f6e84f3f839ce6c2fff Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Sun, 19 May 2024 22:01:10 -0700
Subject: [PATCH 43/44] [AMDGPU] Fix kernarg preloading crash with some types
 and alignments (#91625)

Lowering of preloaded arguments would fail with half/bfloat if they were
dword aligned in the kernarg segment and not part of a vector. Added
more tests with different alignments and types.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |   20 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 2231 ++++++++++--------
 2 files changed, 1231 insertions(+), 1020 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 89e83babcfef4..c7c4a8faa2fb0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2976,12 +2976,20 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    DL, Elts);
           }
 
-          SDValue CMemVT;
-          if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
-            CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
-          else
-            CMemVT = DAG.getBitcast(MemVT, NewArg);
-          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
+          // If the argument was preloaded to multiple consecutive 32-bit
+          // registers because of misalignment between addressable SGPR tuples
+          // and the argument size, we can still assume that because of kernarg
+          // segment alignment restrictions that NewArg's size is the same as
+          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
+          // truncate since we cannot preload to less than a single SGPR and the
+          // MemVT may be smaller.
+          EVT MemVTInt =
+              EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+          if (MemVT.bitsLT(NewArg.getSimpleValueType()))
+            NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
+
+          NewArg = DAG.getBitcast(MemVT, NewArg);
+          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                   Ins[i].Flags.isSExt(), &Ins[i]);
           NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
         }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index f0e709b5a1727..857bb897ead2a 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,18 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
 
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
 
-define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -23,19 +19,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8:
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -45,17 +29,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8:
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -65,7 +39,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -76,19 +50,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -98,17 +60,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -122,8 +74,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -134,19 +86,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -157,18 +97,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -179,7 +108,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -190,19 +119,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -213,18 +130,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -239,8 +145,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -251,19 +157,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -273,17 +167,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -293,7 +177,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -304,19 +188,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -326,17 +198,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -350,8 +212,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -361,18 +223,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -381,16 +232,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -399,7 +241,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -409,18 +251,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -429,16 +260,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -451,8 +273,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 }
 
 
-define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
-; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
@@ -464,20 +286,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s2, s3
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -489,17 +298,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -509,7 +308,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x0
@@ -521,20 +320,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s2, s6, s2
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -546,17 +332,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s6, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[8:9]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -570,8 +346,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -584,21 +360,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -612,19 +374,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -636,7 +386,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -649,21 +399,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -677,19 +413,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s1, s8, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -707,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
   ret void
 }
 
-define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -718,18 +442,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -740,18 +453,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -762,7 +464,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -772,18 +474,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -794,18 +485,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-PRELOAD-2-NEXT:    global_store_short v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v1, v0, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -820,8 +500,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 }
 
 
-define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
-; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -835,22 +515,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -865,22 +530,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -895,7 +545,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
@@ -909,22 +559,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -939,22 +574,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 }
 
 
-define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
@@ -995,27 +615,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i32_arg:
+; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1035,27 +635,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i32_arg:
+; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1075,7 +655,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1094,27 +674,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1134,27 +694,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1177,8 +717,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
   ret void
 }
 
-define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1189,20 +729,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1213,18 +740,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1235,7 +751,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1246,20 +762,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1270,18 +773,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1295,8 +787,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1308,20 +800,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1332,18 +811,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1354,7 +822,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1366,20 +834,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1390,18 +845,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1415,8 +859,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1428,20 +872,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1452,18 +883,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1474,7 +894,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1486,20 +906,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1510,18 +917,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1535,8 +931,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1547,20 +943,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1578,25 +961,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s5
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1614,7 +979,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1625,20 +990,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1656,25 +1008,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s9
-; GFX90a-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1695,8 +1029,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
   ret void
 }
 
-define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -1718,30 +1052,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5f64_arg:
+; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1764,30 +1075,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5f64_arg:
+; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1810,7 +1098,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -1832,30 +1120,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1878,30 +1143,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1927,8 +1169,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
   ret void
 }
 
-define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
-; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -1937,18 +1179,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1973,32 +1204,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    s_nop 0
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -2023,7 +1229,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -2032,18 +1238,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -2067,31 +1262,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -2129,17 +1300,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2149,15 +1309,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2177,17 +1328,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2197,15 +1337,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2229,17 +1360,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2249,15 +1369,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2277,17 +1388,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2297,15 +1397,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2317,3 +1408,1115 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
   store double %in, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) {
+; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) {
+; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <2 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <3 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <6 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s10, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s11
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s10, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[10:11] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  store <7 x bfloat> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) {
+; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i1 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) {
+; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store fp128 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x i8> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[2:3] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[6:7] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x half> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Test when previous argument was not dword aligned.
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0xc
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i32 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s7, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <3 x i32> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i16 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v0, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <2 x i8> %in2, ptr addrspace(1) %out2
+  ret void
+}

>From 8de7890572296830b27b6e6db39b36810bc98c31 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl at google.com>
Date: Sun, 19 May 2024 22:22:47 -0700
Subject: [PATCH 44/44] [ThinLTO] Populate declaration import status except for
 distributed ThinLTO under a default-off new option (#88024)

The goal is to populate `declaration` import status if a new flag `-import-declaration` is on.

* For in-process ThinLTO, the `declaration` status is visible to backend
`function-import` pass, so `FunctionImporter::importFunctions` should
read the import status and be a no-op for declaration summaries.
Basically, the postlink pipeline is updated to keep its current behavior
(import definitions), but not updated to handle `declaration` summaries.
Two use cases (better call-graph sort and cross-module auto-init)
would use this bit differently.

* For distributed ThinLTO, the `declaration` status is not serialized to
bitcode. As discussed, https://github.com/llvm/llvm-project/pull/87600
will do this.

[1] https://discourse.llvm.org/t/rfc-for-better-call-graph-sort-build-a-more-complete-call-graph-by-adding-more-indirect-call-edges/74029#support-cross-module-function-declaration-import-5
[2] https://github.com/llvm/llvm-project/pull/87597#discussion_r1556067195
---
 llvm/include/llvm/IR/ModuleSummaryIndex.h     |   7 +
 .../llvm/Transforms/IPO/FunctionImport.h      |  15 +-
 llvm/lib/LTO/LTO.cpp                          |  32 ++-
 llvm/lib/LTO/LTOBackend.cpp                   |   9 +-
 llvm/lib/Transforms/IPO/FunctionImport.cpp    | 270 ++++++++++++++----
 llvm/test/ThinLTO/X86/funcimport-stats.ll     |   4 +-
 .../ThinLTO/X86/import_callee_declaration.ll  | 180 ++++++++++++
 .../Transforms/FunctionImport/funcimport.ll   |   5 +-
 llvm/tools/llvm-link/llvm-link.cpp            |   6 +-
 9 files changed, 443 insertions(+), 85 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/import_callee_declaration.ll

diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 5d137d4b3553c..a6bb261af7522 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -587,6 +587,10 @@ class GlobalValueSummary {
 
   void setImportKind(ImportKind IK) { Flags.ImportType = IK; }
 
+  GlobalValueSummary::ImportKind importType() const {
+    return static_cast<ImportKind>(Flags.ImportType);
+  }
+
   GlobalValue::VisibilityTypes getVisibility() const {
     return (GlobalValue::VisibilityTypes)Flags.Visibility;
   }
@@ -1272,6 +1276,9 @@ using ModulePathStringTableTy = StringMap<ModuleHash>;
 /// a particular module, and provide efficient access to their summary.
 using GVSummaryMapTy = DenseMap<GlobalValue::GUID, GlobalValueSummary *>;
 
+/// A set of global value summary pointers.
+using GVSummaryPtrSet = SmallPtrSet<GlobalValueSummary *, 4>;
+
 /// Map of a type GUID to type id string and summary (multimap used
 /// in case of GUID conflicts).
 using TypeIdSummaryMapTy =
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index c4d19e8641eca..024bba8105b89 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -31,9 +31,9 @@ class Module;
 /// based on the provided summary informations.
 class FunctionImporter {
 public:
-  /// Set of functions to import from a source module. Each entry is a set
-  /// containing all the GUIDs of all functions to import for a source module.
-  using FunctionsToImportTy = std::unordered_set<GlobalValue::GUID>;
+  /// The functions to import from a source module and their import type.
+  using FunctionsToImportTy =
+      DenseMap<GlobalValue::GUID, GlobalValueSummary::ImportKind>;
 
   /// The different reasons selectCallee will chose not to import a
   /// candidate.
@@ -99,8 +99,13 @@ class FunctionImporter {
   /// index's module path string table).
   using ImportMapTy = DenseMap<StringRef, FunctionsToImportTy>;
 
-  /// The set contains an entry for every global value the module exports.
-  using ExportSetTy = DenseSet<ValueInfo>;
+  /// The map contains an entry for every global value the module exports.
+  /// The key is ValueInfo, and the value indicates whether the definition
+  /// or declaration is visible to another module. If a function's definition is
+  /// visible to other modules, the global values this function referenced are
+  /// visible and shouldn't be internalized.
+  /// TODO: Rename to `ExportMapTy`.
+  using ExportSetTy = DenseMap<ValueInfo, GlobalValueSummary::ImportKind>;
 
   /// A function of this type is used to load modules referenced by the index.
   using ModuleLoaderTy =
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 5c603ac6ab472..e2754d74979e8 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -121,6 +121,9 @@ void llvm::computeLTOCacheKey(
     support::endian::write64le(Data, I);
     Hasher.update(Data);
   };
+  auto AddUint8 = [&](const uint8_t I) {
+    Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&I, 1));
+  };
   AddString(Conf.CPU);
   // FIXME: Hash more of Options. For now all clients initialize Options from
   // command-line flags (which is unsupported in production), but may set
@@ -156,18 +159,18 @@ void llvm::computeLTOCacheKey(
   auto ModHash = Index.getModuleHash(ModuleID);
   Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
 
-  std::vector<uint64_t> ExportsGUID;
+  std::vector<std::pair<uint64_t, uint8_t>> ExportsGUID;
   ExportsGUID.reserve(ExportList.size());
-  for (const auto &VI : ExportList) {
-    auto GUID = VI.getGUID();
-    ExportsGUID.push_back(GUID);
-  }
+  for (const auto &[VI, ExportType] : ExportList)
+    ExportsGUID.push_back(
+        std::make_pair(VI.getGUID(), static_cast<uint8_t>(ExportType)));
 
   // Sort the export list elements GUIDs.
   llvm::sort(ExportsGUID);
-  for (uint64_t GUID : ExportsGUID) {
+  for (auto [GUID, ExportType] : ExportsGUID) {
     // The export list can impact the internalization, be conservative here
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));
+    AddUint8(ExportType);
   }
 
   // Include the hash for every module we import functions from. The set of
@@ -199,7 +202,7 @@ void llvm::computeLTOCacheKey(
              [](const ImportModule &Lhs, const ImportModule &Rhs) -> bool {
                return Lhs.getHash() < Rhs.getHash();
              });
-  std::vector<uint64_t> ImportedGUIDs;
+  std::vector<std::pair<uint64_t, uint8_t>> ImportedGUIDs;
   for (const ImportModule &Entry : ImportModulesVector) {
     auto ModHash = Entry.getHash();
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
@@ -207,11 +210,13 @@ void llvm::computeLTOCacheKey(
     AddUint64(Entry.getFunctions().size());
 
     ImportedGUIDs.clear();
-    for (auto &Fn : Entry.getFunctions())
-      ImportedGUIDs.push_back(Fn);
+    for (auto &[Fn, ImportType] : Entry.getFunctions())
+      ImportedGUIDs.push_back(std::make_pair(Fn, ImportType));
     llvm::sort(ImportedGUIDs);
-    for (auto &GUID : ImportedGUIDs)
+    for (auto &[GUID, Type] : ImportedGUIDs) {
       AddUint64(GUID);
+      AddUint8(Type);
+    }
   }
 
   // Include the hash for the resolved ODR.
@@ -281,9 +286,9 @@ void llvm::computeLTOCacheKey(
   // Imported functions may introduce new uses of type identifier resolutions,
   // so we need to collect their used resolutions as well.
   for (const ImportModule &ImpM : ImportModulesVector)
-    for (auto &ImpF : ImpM.getFunctions()) {
+    for (auto &[GUID, UnusedImportType] : ImpM.getFunctions()) {
       GlobalValueSummary *S =
-          Index.findSummaryInModule(ImpF, ImpM.getIdentifier());
+          Index.findSummaryInModule(GUID, ImpM.getIdentifier());
       AddUsedThings(S);
       // If this is an alias, we also care about any types/etc. that the aliasee
       // may reference.
@@ -1395,6 +1400,7 @@ class lto::ThinBackendProc {
                   llvm::StringRef ModulePath,
                   const std::string &NewModulePath) {
     std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+
     std::error_code EC;
     gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
                                      ImportList, ModuleToSummariesForIndex);
@@ -1403,6 +1409,8 @@ class lto::ThinBackendProc {
                       sys::fs::OpenFlags::OF_None);
     if (EC)
       return errorCodeToError(EC);
+
+    // TODO: Serialize declaration bits to bitcode.
     writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
 
     if (ShouldEmitImportsFiles) {
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d4b89ede2d713..58434feec6f96 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -721,7 +721,14 @@ bool lto::initImportList(const Module &M,
       if (Summary->modulePath() == M.getModuleIdentifier())
         continue;
       // Add an entry to provoke importing by thinBackend.
-      ImportList[Summary->modulePath()].insert(GUID);
+      // Try emplace the entry first. If an entry with the same key already
+      // exists, set the value to 'std::min(existing-value, new-value)' to make
+      // sure a definition takes precedence over a declaration.
+      auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace(
+          GUID, Summary->importType());
+
+      if (!Inserted)
+        Iter->second = std::min(Iter->second, Summary->importType());
     }
   }
   return true;
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 68f9799616ae6..a116fd6535347 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -140,6 +140,17 @@ static cl::opt<bool>
     ImportAllIndex("import-all-index",
                    cl::desc("Import all external functions in index."));
 
+/// This is a test-only option.
+/// If this option is enabled, the ThinLTO indexing step will import each
+/// function declaration as a fallback. In a real build this may increase RAM
+/// usage of the indexing step unnecessarily.
+/// TODO: Implement selective import (based on combined summary analysis) to
+/// ensure the imported function has a use case in the postlink pipeline.
+static cl::opt<bool> ImportDeclaration(
+    "import-declaration", cl::init(false), cl::Hidden,
+    cl::desc("If true, import function declaration as fallback if the function "
+             "definition is not imported."));
+
 /// Pass a workload description file - an example of workload would be the
 /// functions executed to satisfy a RPC request. A workload is defined by a root
 /// function and the list of functions that are (frequently) needed to satisfy
@@ -245,8 +256,12 @@ static auto qualifyCalleeCandidates(
 }
 
 /// Given a list of possible callee implementation for a call site, select one
-/// that fits the \p Threshold. If none are found, the Reason will give the last
-/// reason for the failure (last, in the order of CalleeSummaryList entries).
+/// that fits the \p Threshold for function definition import. If none are
+/// found, the Reason will give the last reason for the failure (last, in the
+/// order of CalleeSummaryList entries). While looking for a callee definition,
+/// sets \p TooLargeOrNoInlineSummary to the last seen too-large or noinline
+/// candidate; other modules may want to know the function summary or
+/// declaration even if a definition is not needed.
 ///
 /// FIXME: select "best" instead of first that fits. But what is "best"?
 /// - The smallest: more likely to be inlined.
@@ -259,24 +274,32 @@ static const GlobalValueSummary *
 selectCallee(const ModuleSummaryIndex &Index,
              ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
              unsigned Threshold, StringRef CallerModulePath,
+             const GlobalValueSummary *&TooLargeOrNoInlineSummary,
              FunctionImporter::ImportFailureReason &Reason) {
+  // Records the last summary with reason noinline or too-large.
+  TooLargeOrNoInlineSummary = nullptr;
   auto QualifiedCandidates =
       qualifyCalleeCandidates(Index, CalleeSummaryList, CallerModulePath);
   for (auto QualifiedValue : QualifiedCandidates) {
     Reason = QualifiedValue.first;
+    // Skip a summary if its import is not (proved to be) legal.
     if (Reason != FunctionImporter::ImportFailureReason::None)
       continue;
     auto *Summary =
         cast<FunctionSummary>(QualifiedValue.second->getBaseObject());
 
+    // Don't bother importing the definition if the chance of inlining it is
+    // not high enough (except under `--force-import-all`).
     if ((Summary->instCount() > Threshold) && !Summary->fflags().AlwaysInline &&
         !ForceImportAll) {
+      TooLargeOrNoInlineSummary = Summary;
       Reason = FunctionImporter::ImportFailureReason::TooLarge;
       continue;
     }
 
-    // Don't bother importing if we can't inline it anyway.
+    // Don't bother importing the definition if we can't inline it anyway.
     if (Summary->fflags().NoInline && !ForceImportAll) {
+      TooLargeOrNoInlineSummary = Summary;
       Reason = FunctionImporter::ImportFailureReason::NoInline;
       continue;
     }
@@ -358,17 +381,27 @@ class GlobalsImporter final {
         if (!GVS || !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true) ||
             LocalNotInModule(GVS))
           continue;
-        auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+
+        // If there isn't an entry for GUID, insert <GUID, Definition> pair.
+        // Otherwise, definition should take precedence over declaration.
+        auto [Iter, Inserted] =
+            ImportList[RefSummary->modulePath()].try_emplace(
+                VI.getGUID(), GlobalValueSummary::Definition);
         // Only update stat and exports if we haven't already imported this
         // variable.
-        if (!ILI.second)
+        if (!Inserted) {
+          // Set the value to 'std::min(existing-value, new-value)' to make
+          // sure a definition takes precedence over a declaration.
+          Iter->second = std::min(GlobalValueSummary::Definition, Iter->second);
           break;
+        }
         NumImportedGlobalVarsThinLink++;
         // Any references made by this variable will be marked exported
         // later, in ComputeCrossModuleImport, after import decisions are
         // complete, which is more efficient than adding them here.
         if (ExportLists)
-          (*ExportLists)[RefSummary->modulePath()].insert(VI);
+          (*ExportLists)[RefSummary->modulePath()][VI] =
+              GlobalValueSummary::Definition;
 
         // If variable is not writeonly we attempt to recursively analyze
         // its references in order to import referenced constants.
@@ -545,10 +578,11 @@ class WorkloadImportsManager : public ModuleImportsManager {
       LLVM_DEBUG(dbgs() << "[Workload][Including]" << VI.name() << " from "
                         << ExportingModule << " : "
                         << Function::getGUID(VI.name()) << "\n");
-      ImportList[ExportingModule].insert(VI.getGUID());
+      ImportList[ExportingModule][VI.getGUID()] =
+          GlobalValueSummary::Definition;
       GVI.onImportingSummary(*GVS);
       if (ExportLists)
-        (*ExportLists)[ExportingModule].insert(VI);
+        (*ExportLists)[ExportingModule][VI] = GlobalValueSummary::Definition;
     }
     LLVM_DEBUG(dbgs() << "[Workload] Done\n");
   }
@@ -769,9 +803,28 @@ static void computeImportForFunction(
       }
 
       FunctionImporter::ImportFailureReason Reason{};
-      CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
-                                   Summary.modulePath(), Reason);
+
+      // `SummaryForDeclImport` is a summary eligible for declaration import.
+      const GlobalValueSummary *SummaryForDeclImport = nullptr;
+      CalleeSummary =
+          selectCallee(Index, VI.getSummaryList(), NewThreshold,
+                       Summary.modulePath(), SummaryForDeclImport, Reason);
       if (!CalleeSummary) {
+        // There isn't a callee for definition import but one for declaration
+        // import.
+        if (ImportDeclaration && SummaryForDeclImport) {
+          StringRef DeclSourceModule = SummaryForDeclImport->modulePath();
+
+          // Since definition takes precedence over declaration for the same VI,
+          // try emplace <VI, declaration> pair without checking insert result.
+          // If insert doesn't happen, there must be an existing entry keyed by
+          // VI.
+          if (ExportLists)
+            (*ExportLists)[DeclSourceModule].try_emplace(
+                VI, GlobalValueSummary::Declaration);
+          ImportList[DeclSourceModule].try_emplace(
+              VI.getGUID(), GlobalValueSummary::Declaration);
+        }
         // Update with new larger threshold if this was a retry (otherwise
         // we would have already inserted with NewThreshold above). Also
         // update failure info if requested.
@@ -816,11 +869,15 @@ static void computeImportForFunction(
              "selectCallee() didn't honor the threshold");
 
       auto ExportModulePath = ResolvedCalleeSummary->modulePath();
-      auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
+
+      // Try emplace the definition entry, and update stats based on insertion
+      // status.
+      auto [Iter, Inserted] = ImportList[ExportModulePath].try_emplace(
+          VI.getGUID(), GlobalValueSummary::Definition);
+
       // We previously decided to import this GUID definition if it was already
       // inserted in the set of imports from the exporting module.
-      bool PreviouslyImported = !ILI.second;
-      if (!PreviouslyImported) {
+      if (Inserted || Iter->second == GlobalValueSummary::Declaration) {
         NumImportedFunctionsThinLink++;
         if (IsHotCallsite)
           NumImportedHotFunctionsThinLink++;
@@ -828,11 +885,14 @@ static void computeImportForFunction(
           NumImportedCriticalFunctionsThinLink++;
       }
 
+      if (Iter->second == GlobalValueSummary::Declaration)
+        Iter->second = GlobalValueSummary::Definition;
+
       // Any calls/references made by this function will be marked exported
       // later, in ComputeCrossModuleImport, after import decisions are
       // complete, which is more efficient than adding them here.
       if (ExportLists)
-        (*ExportLists)[ExportModulePath].insert(VI);
+        (*ExportLists)[ExportModulePath][VI] = GlobalValueSummary::Definition;
     }
 
     auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
@@ -939,12 +999,20 @@ static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
 }
 
 template <class T>
-static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
-                                      T &Cont) {
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, T &Cont,
+                                      unsigned &DefinedGVS,
+                                      unsigned &DefinedFS) {
   unsigned NumGVS = 0;
-  for (auto &V : Cont)
-    if (isGlobalVarSummary(Index, V))
+  DefinedGVS = 0;
+  DefinedFS = 0;
+  for (auto &[GUID, Type] : Cont) {
+    if (isGlobalVarSummary(Index, GUID)) {
+      if (Type == GlobalValueSummary::Definition)
+        ++DefinedGVS;
       ++NumGVS;
+    } else if (Type == GlobalValueSummary::Definition)
+      ++DefinedFS;
+  }
   return NumGVS;
 }
 #endif
@@ -954,13 +1022,12 @@ static bool checkVariableImport(
     const ModuleSummaryIndex &Index,
     DenseMap<StringRef, FunctionImporter::ImportMapTy> &ImportLists,
     DenseMap<StringRef, FunctionImporter::ExportSetTy> &ExportLists) {
-
   DenseSet<GlobalValue::GUID> FlattenedImports;
 
   for (auto &ImportPerModule : ImportLists)
     for (auto &ExportPerModule : ImportPerModule.second)
-      FlattenedImports.insert(ExportPerModule.second.begin(),
-                              ExportPerModule.second.end());
+      for (auto &[GUID, Type] : ExportPerModule.second)
+        FlattenedImports.insert(GUID);
 
   // Checks that all GUIDs of read/writeonly vars we see in export lists
   // are also in the import lists. Otherwise we my face linker undefs,
@@ -979,7 +1046,7 @@ static bool checkVariableImport(
   };
 
   for (auto &ExportPerModule : ExportLists)
-    for (auto &VI : ExportPerModule.second)
+    for (auto &[VI, Unused] : ExportPerModule.second)
       if (!FlattenedImports.count(VI.getGUID()) &&
           IsReadOrWriteOnlyVarNeedingImporting(ExportPerModule.first, VI))
         return false;
@@ -1015,7 +1082,11 @@ void llvm::ComputeCrossModuleImport(
     FunctionImporter::ExportSetTy NewExports;
     const auto &DefinedGVSummaries =
         ModuleToDefinedGVSummaries.lookup(ELI.first);
-    for (auto &EI : ELI.second) {
+    for (auto &[EI, Type] : ELI.second) {
+      // If a variable is exported as a declaration, its 'refs' and 'calls' are
+      // not further exported.
+      if (Type == GlobalValueSummary::Declaration)
+        continue;
       // Find the copy defined in the exporting module so that we can mark the
       // values it references in that specific definition as exported.
       // Below we will add all references and called values, without regard to
@@ -1034,22 +1105,31 @@ void llvm::ComputeCrossModuleImport(
         // we convert such variables initializers to "zeroinitializer".
         // See processGlobalForThinLTO.
         if (!Index.isWriteOnly(GVS))
-          for (const auto &VI : GVS->refs())
-            NewExports.insert(VI);
+          for (const auto &VI : GVS->refs()) {
+            // Try to emplace the declaration entry. If a definition entry
+            // already exists for key `VI`, this is a no-op.
+            NewExports.try_emplace(VI, GlobalValueSummary::Declaration);
+          }
       } else {
         auto *FS = cast<FunctionSummary>(S);
-        for (const auto &Edge : FS->calls())
-          NewExports.insert(Edge.first);
-        for (const auto &Ref : FS->refs())
-          NewExports.insert(Ref);
+        for (const auto &Edge : FS->calls()) {
+          // Try to emplace the declaration entry. If a definition entry
+          // already exists for key `Edge.first`, this is a no-op.
+          NewExports.try_emplace(Edge.first, GlobalValueSummary::Declaration);
+        }
+        for (const auto &Ref : FS->refs()) {
+          // Try to emplace the declaration entry. If a definition entry
+          // already exists for key `Ref`, this is a no-op.
+          NewExports.try_emplace(Ref, GlobalValueSummary::Declaration);
+        }
       }
     }
-    // Prune list computed above to only include values defined in the exporting
-    // module. We do this after the above insertion since we may hit the same
-    // ref/call target multiple times in above loop, and it is more efficient to
-    // avoid a set lookup each time.
+    // Prune list computed above to only include values defined in the
+    // exporting module. We do this after the above insertion since we may hit
+    // the same ref/call target multiple times in above loop, and it is more
+    // efficient to avoid a set lookup each time.
     for (auto EI = NewExports.begin(); EI != NewExports.end();) {
-      if (!DefinedGVSummaries.count(EI->getGUID()))
+      if (!DefinedGVSummaries.count(EI->first.getGUID()))
         NewExports.erase(EI++);
       else
         ++EI;
@@ -1064,18 +1144,29 @@ void llvm::ComputeCrossModuleImport(
   for (auto &ModuleImports : ImportLists) {
     auto ModName = ModuleImports.first;
     auto &Exports = ExportLists[ModName];
-    unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
-    LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
-                      << Exports.size() - NumGVS << " functions and " << NumGVS
-                      << " vars. Imports from " << ModuleImports.second.size()
-                      << " modules.\n");
+    unsigned DefinedGVS = 0, DefinedFS = 0;
+    unsigned NumGVS =
+        numGlobalVarSummaries(Index, Exports, DefinedGVS, DefinedFS);
+    LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports " << DefinedFS
+                      << " function as definitions, "
+                      << Exports.size() - NumGVS - DefinedFS
+                      << " functions as declarations, " << DefinedGVS
+                      << " var definitions and " << NumGVS - DefinedGVS
+                      << " var declarations. Imports from "
+                      << ModuleImports.second.size() << " modules.\n");
     for (auto &Src : ModuleImports.second) {
       auto SrcModName = Src.first;
-      unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
-      LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
-                        << " functions imported from " << SrcModName << "\n");
-      LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
-                        << " global vars imported from " << SrcModName << "\n");
+      unsigned DefinedGVS = 0, DefinedFS = 0;
+      unsigned NumGVSPerMod =
+          numGlobalVarSummaries(Index, Src.second, DefinedGVS, DefinedFS);
+      LLVM_DEBUG(dbgs() << " - " << DefinedFS << " function definitions and "
+                        << Src.second.size() - NumGVSPerMod - DefinedFS
+                        << " function declarations imported from " << SrcModName
+                        << "\n");
+      LLVM_DEBUG(dbgs() << " - " << DefinedGVS << " global vars definition and "
+                        << NumGVSPerMod - DefinedGVS
+                        << " global vars declaration imported from "
+                        << SrcModName << "\n");
     }
   }
 #endif
@@ -1089,11 +1180,17 @@ static void dumpImportListForModule(const ModuleSummaryIndex &Index,
                     << ImportList.size() << " modules.\n");
   for (auto &Src : ImportList) {
     auto SrcModName = Src.first;
-    unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
-    LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
-                      << " functions imported from " << SrcModName << "\n");
-    LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
-                      << SrcModName << "\n");
+    unsigned DefinedGVS = 0, DefinedFS = 0;
+    unsigned NumGVSPerMod =
+        numGlobalVarSummaries(Index, Src.second, DefinedGVS, DefinedFS);
+    LLVM_DEBUG(dbgs() << " - " << DefinedFS << " function definitions and "
+                      << Src.second.size() - DefinedFS - NumGVSPerMod
+                      << " function declarations imported from " << SrcModName
+                      << "\n");
+    LLVM_DEBUG(dbgs() << " - " << DefinedGVS << " var definitions and "
+                      << NumGVSPerMod - DefinedGVS
+                      << " var declarations imported from " << SrcModName
+                      << "\n");
   }
 }
 #endif
@@ -1149,7 +1246,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest(
     if (Summary->modulePath() == ModulePath)
       continue;
     // Add an entry to provoke importing by thinBackend.
-    ImportList[Summary->modulePath()].insert(GUID);
+    auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace(
+        GUID, Summary->importType());
+    if (!Inserted) {
+      // Use 'std::min' to make sure definition (with enum value 0) takes
+      // precedence over declaration (with enum value 1).
+      Iter->second = std::min(Iter->second, Summary->importType());
+    }
   }
 #ifndef NDEBUG
   dumpImportListForModule(Index, ModulePath, ImportList);
@@ -1339,13 +1442,17 @@ void llvm::gatherImportedSummariesForModule(
   // Include summaries for imports.
   for (const auto &ILI : ImportList) {
     auto &SummariesForIndex = ModuleToSummariesForIndex[std::string(ILI.first)];
+
     const auto &DefinedGVSummaries =
         ModuleToDefinedGVSummaries.lookup(ILI.first);
-    for (const auto &GI : ILI.second) {
-      const auto &DS = DefinedGVSummaries.find(GI);
+    for (const auto &[GUID, Type] : ILI.second) {
+      const auto &DS = DefinedGVSummaries.find(GUID);
       assert(DS != DefinedGVSummaries.end() &&
              "Expected a defined summary for imported global value");
-      SummariesForIndex[GI] = DS->second;
+      if (Type == GlobalValueSummary::Declaration)
+        continue;
+
+      SummariesForIndex[GUID] = DS->second;
     }
   }
 }
@@ -1617,6 +1724,16 @@ Expected<bool> FunctionImporter::importFunctions(
   for (const auto &FunctionsToImportPerModule : ImportList) {
     ModuleNameOrderedList.insert(FunctionsToImportPerModule.first);
   }
+
+  auto getImportType = [&](const FunctionsToImportTy &GUIDToImportType,
+                           GlobalValue::GUID GUID)
+      -> std::optional<GlobalValueSummary::ImportKind> {
+    auto Iter = GUIDToImportType.find(GUID);
+    if (Iter == GUIDToImportType.end())
+      return std::nullopt;
+    return Iter->second;
+  };
+
   for (const auto &Name : ModuleNameOrderedList) {
     // Get the module for the import
     const auto &FunctionsToImportPerModule = ImportList.find(Name);
@@ -1634,17 +1751,27 @@ Expected<bool> FunctionImporter::importFunctions(
       return std::move(Err);
 
     auto &ImportGUIDs = FunctionsToImportPerModule->second;
+
     // Find the globals to import
     SetVector<GlobalValue *> GlobalsToImport;
     for (Function &F : *SrcModule) {
       if (!F.hasName())
         continue;
       auto GUID = F.getGUID();
-      auto Import = ImportGUIDs.count(GUID);
-      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
+      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
+
+      bool ImportDefinition =
+          (MaybeImportType &&
+           (*MaybeImportType == GlobalValueSummary::Definition));
+
+      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                        << " importing function"
+                        << (ImportDefinition
+                                ? " definition "
+                                : (MaybeImportType ? " declaration " : " "))
                         << GUID << " " << F.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (Import) {
+      if (ImportDefinition) {
         if (Error Err = F.materialize())
           return std::move(Err);
         // MemProf should match function's definition and summary,
@@ -1670,11 +1797,20 @@ Expected<bool> FunctionImporter::importFunctions(
       if (!GV.hasName())
         continue;
       auto GUID = GV.getGUID();
-      auto Import = ImportGUIDs.count(GUID);
-      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
+      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
+
+      bool ImportDefinition =
+          (MaybeImportType &&
+           (*MaybeImportType == GlobalValueSummary::Definition));
+
+      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                        << " importing global"
+                        << (ImportDefinition
+                                ? " definition "
+                                : (MaybeImportType ? " declaration " : " "))
                         << GUID << " " << GV.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (Import) {
+      if (ImportDefinition) {
         if (Error Err = GV.materialize())
           return std::move(Err);
         ImportedGVCount += GlobalsToImport.insert(&GV);
@@ -1684,11 +1820,20 @@ Expected<bool> FunctionImporter::importFunctions(
       if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
         continue;
       auto GUID = GA.getGUID();
-      auto Import = ImportGUIDs.count(GUID);
-      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
+      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
+
+      bool ImportDefinition =
+          (MaybeImportType &&
+           (*MaybeImportType == GlobalValueSummary::Definition));
+
+      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                        << " importing alias"
+                        << (ImportDefinition
+                                ? " definition "
+                                : (MaybeImportType ? " declaration " : " "))
                         << GUID << " " << GA.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (Import) {
+      if (ImportDefinition) {
         if (Error Err = GA.materialize())
           return std::move(Err);
         // Import alias as a copy of its aliasee.
@@ -1754,6 +1899,7 @@ Expected<bool> FunctionImporter::importFunctions(
   NumImportedFunctions += (ImportedCount - ImportedGVCount);
   NumImportedGlobalVars += ImportedGVCount;
 
+  // TODO: Print counters for definitions and declarations in the debugging log.
   LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
                     << " functions for Module "
                     << DestModule.getModuleIdentifier() << "\n");
diff --git a/llvm/test/ThinLTO/X86/funcimport-stats.ll b/llvm/test/ThinLTO/X86/funcimport-stats.ll
index 913b13004c1c2..7fcd33855fe1a 100644
--- a/llvm/test/ThinLTO/X86/funcimport-stats.ll
+++ b/llvm/test/ThinLTO/X86/funcimport-stats.ll
@@ -9,8 +9,8 @@
 ; RUN: cat %t4 | grep 'Is importing aliasee' | count 1
 ; RUN: cat %t4 | FileCheck %s
 
-; CHECK:      - [[NUM_FUNCS:[0-9]+]] functions imported from
-; CHECK-NEXT: - [[NUM_VARS:[0-9]+]] global vars imported from
+; CHECK:      - [[NUM_FUNCS:[0-9]+]] function definitions and 0 function declarations imported from
+; CHECK-NEXT: - [[NUM_VARS:[0-9]+]] global vars definition and 0 global vars declaration imported from
 
 ; CHECK:      [[NUM_FUNCS]] function-import - Number of functions imported in backend
 ; CHECK-NEXT: [[NUM_FUNCS]] function-import - Number of functions thin link decided to import
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
new file mode 100644
index 0000000000000..df8a9ce6f7109
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
@@ -0,0 +1,180 @@
+; "-debug-only" requires asserts.
+; REQUIRES: asserts
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+; Generate per-module summaries.
+; RUN: opt -module-summary main.ll -o main.bc
+; RUN: opt -module-summary lib.ll -o lib.bc
+
+; Generate the combined summary and distributed indices.
+
+; - For function import, set 'import-instr-limit' to 7 and fall back to import
+;   function declarations.
+; - In main.ll, function 'main' calls 'small_func' and 'large_func'. Both callees
+;   are defined in lib.ll. 'small_func' has two indirect callees, one is smaller
+;   and the other one is larger. Both callees of 'small_func' are defined in lib.ll.
+; - Given the import limit, in main's combined summary, the import type of 'small_func'
+;   and 'small_indirect_callee' will be 'definition', and the import type of
+;   'large_func' and 'large_indirect_callee' will be 'declaration'.
+;
+; The test will disassemble combined summaries and check the import type is
+; correct. Right now postlink optimizer pipeline doesn't do anything (e.g.,
+; import the declaration or de-serialize summary attributes yet) so there is
+; nothing to test more than the summary content.
+;
+; RUN: llvm-lto2 run \
+; RUN:   -debug-only=function-import \
+; RUN:   -import-instr-limit=7 \
+; RUN:   -import-declaration \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=main.bc,main,px \
+; RUN:   -r=main.bc,small_func, \
+; RUN:   -r=main.bc,large_func, \
+; RUN:   -r=lib.bc,callee,pl \
+; RUN:   -r=lib.bc,large_indirect_callee,px \
+; RUN:   -r=lib.bc,small_func,px \
+; RUN:   -r=lib.bc,large_func,px \
+; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
+; RUN:   -r=lib.bc,calleeAddrs,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+;
+; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7  -o combined.index.bc main.bc lib.bc
+; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; DUMP: - 2 function definitions and 3 function declarations imported from lib.bc
+
+; First disassemble per-module summary and find out the GUID for {large_func, large_indirect_callee}.
+;
+; RUN: llvm-dis lib.bc -o - | FileCheck %s --check-prefix=LIB-DIS
+; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
+; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
+; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
+;
+; Secondly disassemble main's combined summary and test that large callees are
+; not imported as declarations yet.
+;
+; RUN: llvm-dis main.bc.thinlto.bc -o - | FileCheck %s --check-prefix=MAIN-DIS
+;
+; MAIN-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
+; MAIN-DIS-NOT: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS-NOT: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS-NOT: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration)
+
+; Run in-process ThinLTO and test that
+; 1. `callee` remains internalized even if the symbols of its callers
+; (large_func and large_indirect_callee) are exported as declarations and visible to main module.
+; 2. the debugging logs from `function-import` pass are expected.
+
+; RUN: llvm-lto2 run \
+; RUN:   -debug-only=function-import \
+; RUN:   -save-temps \
+; RUN:   -import-instr-limit=7 \
+; RUN:   -import-declaration \
+; RUN:   -r=main.bc,main,px \
+; RUN:   -r=main.bc,small_func, \
+; RUN:   -r=main.bc,large_func, \
+; RUN:   -r=lib.bc,callee,pl \
+; RUN:   -r=lib.bc,large_indirect_callee,px \
+; RUN:   -r=lib.bc,small_func,px \
+; RUN:   -r=lib.bc,large_func,px \
+; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
+; RUN:   -r=lib.bc,calleeAddrs,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
+
+; Test import status from debugging logs.
+; TODO: Serialize declaration bit and test declaration bits are correctly set,
+; and extend this test case to test IR once postlink optimizer makes use of
+; the import type for declarations.
+; IMPORTDUMP-DAG: Not importing function 11825436545918268459 callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function definition 13568239288960714650 small_indirect_callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function definition 6976996067367342685 small_func from lib.cc
+; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc
+; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc
+; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
+
+; RUN: llvm-dis in-process.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
+
+; RUN: llvm-dis in-process.2.2.internalize.bc -o - | FileCheck %s --check-prefix=INTERNALIZE
+
+; IMPORT-DAG: define available_externally void @small_func
+; IMPORT-DAG: define available_externally hidden void @small_indirect_callee
+; IMPORT-DAG: declare void @large_func
+; IMPORT-NOT: large_indirect_callee
+; IMPORT-NOT: large_indirect_callee_alias
+
+; INTERNALIZE: define internal void @callee()
+
+;--- main.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() {
+  call void @small_func()
+  call void @large_func()
+  ret i32 0
+}
+
+declare void @small_func()
+
+; large_func without attributes
+declare void @large_func()
+
+;--- lib.ll
+source_filename = "lib.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at calleeAddrs = global [3 x ptr] [ptr @large_indirect_callee, ptr @small_indirect_callee, ptr @large_indirect_callee_alias]
+
+define void @callee() #1 {
+  ret void
+}
+
+define void @large_indirect_callee()#2 {
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  ret void
+}
+
+define internal void @small_indirect_callee() #0 {
+  ret void
+}
+
+ at large_indirect_callee_alias = alias void(), ptr @large_indirect_callee
+
+define void @small_func() {
+entry:
+  %0 = load ptr, ptr @calleeAddrs
+  call void %0(), !prof !0
+  %1 = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @calleeAddrs, i64 0, i64 1)
+  call void %1(), !prof !1
+  %2 = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @calleeAddrs, i64 0, i64 2)
+  call void %2(), !prof !2
+  ret void
+}
+
+define void @large_func() #0 {
+entry:
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  ret void
+}
+
+attributes #0 = { nounwind norecurse }
+
+attributes #1 = { noinline }
+
+attributes #2 = { norecurse }
+
+!0 = !{!"VP", i32 0, i64 1, i64 14343440786664691134, i64 1}
+!1 = !{!"VP", i32 0, i64 1, i64 13568239288960714650, i64 1}
+!2 = !{!"VP", i32 0, i64 1, i64 16730173943625350469, i64 1}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport.ll b/llvm/test/Transforms/FunctionImport/funcimport.ll
index a0968a67f5ce8..635750b33fff0 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport.ll
@@ -166,7 +166,8 @@ declare void @variadic_va_start(...)
 ; GUID-DAG: GUID {{.*}} is linkoncefunc
 
 ; DUMP:       Module [[M1:.*]] imports from 1 module
-; DUMP-NEXT:  15 functions imported from [[M2:.*]]
-; DUMP-NEXT:  4 vars imported from [[M2]]
+; DUMP-NEXT:  15 function definitions and 0 function declarations imported from [[M2:.*]]
+; DUMP-NEXT:  4 var definitions and 0 var declarations imported from [[M2]]
+
 ; DUMP:       Imported 15 functions for Module [[M1]]
 ; DUMP-NEXT:  Imported 4 global variables for Module [[M1]]
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 7794f2d81ed06..1b90fce76fbd1 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -377,9 +377,13 @@ static bool importFunctions(const char *argv0, Module &DestModule) {
     if (Verbose)
       errs() << "Importing " << FunctionName << " from " << FileName << "\n";
 
+    // `-import` specifies the `<filename,function-name>` pairs to import as
+    // definitions, so set the import type to definition directly.
+    // FIXME: A follow-up patch should add test coverage for import declaration
+    // in `llvm-link` CLI (e.g., by introducing a new command line option).
     auto &Entry =
         ImportList[FileNameStringCache.insert(FileName).first->getKey()];
-    Entry.insert(F->getGUID());
+    Entry[F->getGUID()] = GlobalValueSummary::Definition;
   }
   auto CachedModuleLoader = [&](StringRef Identifier) {
     return ModuleLoaderCache.takeModule(std::string(Identifier));