[llvm-branch-commits] [clang] [libclc] [libunwind] [lldb] [llvm] [RegAlloc][NewPM] Plug Greedy RA in codegen pipeline (PR #120557)

Thu Feb 20 02:02:02 PST 2025

https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/120557

>From 507e413a2d85fb25f70caa9ef843f7c7cffb94c6 Mon Sep 17 00:00:00 2001
From: Ming-Yi Lai <ming-yi.lai at mediatek.com>
Date: Thu, 20 Feb 2025 16:12:16 +0800
Subject: [PATCH 01/26] [libunwind][NFC] Remove the CET keyword in shadow
 stack-related stuffs (#126663)

libunwind currently supports shadow stack based on the Intel CET and
AArch64 GCS technology, but throughout related codes, the Intel-specific
keyword, "CET", is used to refer to the generic concept of control-flow
integrity/shadow stack. This patch replaces such wordings with
architecture-neutral term "shadow stack" (abbr. "shstk") to allow future
implementation to avoid using the Intel-specific "CET" term.
---
 libunwind/src/CMakeLists.txt                  |  2 +-
 libunwind/src/Registers.hpp                   |  8 +--
 libunwind/src/UnwindCursor.hpp                |  4 +-
 libunwind/src/UnwindLevel1.c                  | 63 ++++++++++---------
 .../{cet_unwind.h => shadow_stack_unwind.h}   | 12 ++--
 5 files changed, 46 insertions(+), 43 deletions(-)
 rename libunwind/src/{cet_unwind.h => shadow_stack_unwind.h} (88%)

diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt
index ecbd019bb29ea..d69013e5dace1 100644
--- a/libunwind/src/CMakeLists.txt
+++ b/libunwind/src/CMakeLists.txt
@@ -36,7 +36,6 @@ set(LIBUNWIND_HEADERS
     AddressSpace.hpp
     assembly.h
     CompactUnwinder.hpp
-    cet_unwind.h
     config.h
     dwarf2.h
     DwarfInstructions.hpp
@@ -46,6 +45,7 @@ set(LIBUNWIND_HEADERS
     libunwind_ext.h
     Registers.hpp
     RWMutex.hpp
+    shadow_stack_unwind.h
     Unwind-EHABI.h
     UnwindCursor.hpp
     ../include/libunwind.h
diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp
index 861e6b5f6f2c5..452f46a0d56ea 100644
--- a/libunwind/src/Registers.hpp
+++ b/libunwind/src/Registers.hpp
@@ -15,9 +15,9 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "cet_unwind.h"
 #include "config.h"
 #include "libunwind.h"
+#include "shadow_stack_unwind.h"
 
 namespace libunwind {
 
@@ -48,7 +48,7 @@ class _LIBUNWIND_HIDDEN Registers_x86;
 extern "C" void __libunwind_Registers_x86_jumpto(Registers_x86 *);
 
 #if defined(_LIBUNWIND_USE_CET)
-extern "C" void *__libunwind_cet_get_jump_target() {
+extern "C" void *__libunwind_shstk_get_jump_target() {
   return reinterpret_cast<void *>(&__libunwind_Registers_x86_jumpto);
 }
 #endif
@@ -268,7 +268,7 @@ class _LIBUNWIND_HIDDEN Registers_x86_64;
 extern "C" void __libunwind_Registers_x86_64_jumpto(Registers_x86_64 *);
 
 #if defined(_LIBUNWIND_USE_CET)
-extern "C" void *__libunwind_cet_get_jump_target() {
+extern "C" void *__libunwind_shstk_get_jump_target() {
   return reinterpret_cast<void *>(&__libunwind_Registers_x86_64_jumpto);
 }
 #endif
@@ -1817,7 +1817,7 @@ class _LIBUNWIND_HIDDEN Registers_arm64;
 extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *);
 
 #if defined(_LIBUNWIND_USE_GCS)
-extern "C" void *__libunwind_cet_get_jump_target() {
+extern "C" void *__libunwind_shstk_get_jump_target() {
   return reinterpret_cast<void *>(&__libunwind_Registers_arm64_jumpto);
 }
 #endif
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 0923052b1b588..ca9927edc9990 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -11,7 +11,7 @@
 #ifndef __UNWINDCURSOR_HPP__
 #define __UNWINDCURSOR_HPP__
 
-#include "cet_unwind.h"
+#include "shadow_stack_unwind.h"
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -3122,7 +3122,7 @@ bool UnwindCursor<A, R>::isReadableAddr(const pint_t addr) const {
 #endif
 
 #if defined(_LIBUNWIND_USE_CET) || defined(_LIBUNWIND_USE_GCS)
-extern "C" void *__libunwind_cet_get_registers(unw_cursor_t *cursor) {
+extern "C" void *__libunwind_shstk_get_registers(unw_cursor_t *cursor) {
   AbstractUnwindCursor *co = (AbstractUnwindCursor *)cursor;
   return co->get_registers();
 }
diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index 7e785f4d31e71..a258a832a9c31 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -25,10 +25,10 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "cet_unwind.h"
 #include "config.h"
 #include "libunwind.h"
 #include "libunwind_ext.h"
+#include "shadow_stack_unwind.h"
 #include "unwind.h"
 
 #if !defined(_LIBUNWIND_ARM_EHABI) && !defined(__USING_SJLJ_EXCEPTIONS__) &&   \
@@ -36,14 +36,17 @@
 
 #ifndef _LIBUNWIND_SUPPORT_SEH_UNWIND
 
-// When CET is enabled, each "call" instruction will push return address to
-// CET shadow stack, each "ret" instruction will pop current CET shadow stack
-// top and compare it with target address which program will return.
-// In exception handing, some stack frames will be skipped before jumping to
-// landing pad and we must adjust CET shadow stack accordingly.
-// _LIBUNWIND_POP_CET_SSP is used to adjust CET shadow stack pointer and we
-// directly jump to __libunwind_Registers_x86/x86_64_jumpto instead of using
-// a regular function call to avoid pushing to CET shadow stack again.
+// When shadow stack is enabled, a separate stack containing only return
+// addresses would be maintained. On function return, the return address would
+// be compared to the popped address from shadow stack to ensure the return
+// target is not tempered with. When unwinding, we're skipping the normal return
+// procedure for multiple frames and thus need to pop the return addresses of
+// the skipped frames from shadow stack to avoid triggering an exception (using
+// `_LIBUNWIND_POP_SHSTK_SSP()`). Also, some architectures, like the x86-family
+// CET, push the return adddresses onto shadow stack with common call
+// instructions, so for these architectures, normal function calls should be
+// avoided when invoking the `jumpto()` function. To do this, we use inline
+// assemblies to "goto" the `jumpto()` for these architectures.
 #if !defined(_LIBUNWIND_USE_CET) && !defined(_LIBUNWIND_USE_GCS)
 #define __unw_phase2_resume(cursor, fn)                                        \
   do {                                                                         \
@@ -51,38 +54,38 @@
     __unw_resume((cursor));                                                    \
   } while (0)
 #elif defined(_LIBUNWIND_TARGET_I386)
-#define __cet_ss_step_size 4
+#define __shstk_step_size (4)
 #define __unw_phase2_resume(cursor, fn)                                        \
   do {                                                                         \
-    _LIBUNWIND_POP_CET_SSP((fn));                                              \
-    void *cetRegContext = __libunwind_cet_get_registers((cursor));             \
-    void *cetJumpAddress = __libunwind_cet_get_jump_target();                  \
+    _LIBUNWIND_POP_SHSTK_SSP((fn));                                            \
+    void *shstkRegContext = __libunwind_shstk_get_registers((cursor));         \
+    void *shstkJumpAddress = __libunwind_shstk_get_jump_target();              \
     __asm__ volatile("push %%edi\n\t"                                          \
                      "sub $4, %%esp\n\t"                                       \
-                     "jmp *%%edx\n\t" :: "D"(cetRegContext),                   \
-                     "d"(cetJumpAddress));                                     \
+                     "jmp *%%edx\n\t" ::"D"(shstkRegContext),                  \
+                     "d"(shstkJumpAddress));                                   \
   } while (0)
 #elif defined(_LIBUNWIND_TARGET_X86_64)
-#define __cet_ss_step_size 8
+#define __shstk_step_size (8)
 #define __unw_phase2_resume(cursor, fn)                                        \
   do {                                                                         \
-    _LIBUNWIND_POP_CET_SSP((fn));                                              \
-    void *cetRegContext = __libunwind_cet_get_registers((cursor));             \
-    void *cetJumpAddress = __libunwind_cet_get_jump_target();                  \
-    __asm__ volatile("jmpq *%%rdx\n\t" :: "D"(cetRegContext),                  \
-                     "d"(cetJumpAddress));                                     \
+    _LIBUNWIND_POP_SHSTK_SSP((fn));                                            \
+    void *shstkRegContext = __libunwind_shstk_get_registers((cursor));         \
+    void *shstkJumpAddress = __libunwind_shstk_get_jump_target();              \
+    __asm__ volatile("jmpq *%%rdx\n\t" ::"D"(shstkRegContext),                 \
+                     "d"(shstkJumpAddress));                                   \
   } while (0)
 #elif defined(_LIBUNWIND_TARGET_AARCH64)
-#define __cet_ss_step_size 8
+#define __shstk_step_size (8)
 #define __unw_phase2_resume(cursor, fn)                                        \
   do {                                                                         \
-    _LIBUNWIND_POP_CET_SSP((fn));                                              \
-    void *cetRegContext = __libunwind_cet_get_registers((cursor));             \
-    void *cetJumpAddress = __libunwind_cet_get_jump_target();                  \
+    _LIBUNWIND_POP_SHSTK_SSP((fn));                                            \
+    void *shstkRegContext = __libunwind_shstk_get_registers((cursor));         \
+    void *shstkJumpAddress = __libunwind_shstk_get_jump_target();              \
     __asm__ volatile("mov x0, %0\n\t"                                          \
                      "br %1\n\t"                                               \
                      :                                                         \
-                     : "r"(cetRegContext), "r"(cetJumpAddress)                 \
+                     : "r"(shstkRegContext), "r"(shstkJumpAddress)             \
                      : "x0");                                                  \
   } while (0)
 #endif
@@ -255,16 +258,16 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except
     }
 #endif
 
-// In CET enabled environment, we check return address stored in normal stack
-// against return address stored in CET shadow stack, if the 2 addresses don't
+// In shadow stack enabled environment, we check return address stored in normal
+// stack against return address stored in shadow stack, if the 2 addresses don't
 // match, it means return address in normal stack has been corrupted, we return
 // _URC_FATAL_PHASE2_ERROR.
 #if defined(_LIBUNWIND_USE_CET) || defined(_LIBUNWIND_USE_GCS)
     if (shadowStackTop != 0) {
       unw_word_t retInNormalStack;
       __unw_get_reg(cursor, UNW_REG_IP, &retInNormalStack);
-      unsigned long retInShadowStack = *(
-          unsigned long *)(shadowStackTop + __cet_ss_step_size * framesWalked);
+      unsigned long retInShadowStack =
+          *(unsigned long *)(shadowStackTop + __shstk_step_size * framesWalked);
       if (retInNormalStack != retInShadowStack)
         return _URC_FATAL_PHASE2_ERROR;
     }
diff --git a/libunwind/src/cet_unwind.h b/libunwind/src/shadow_stack_unwind.h
similarity index 88%
rename from libunwind/src/cet_unwind.h
rename to libunwind/src/shadow_stack_unwind.h
index 47d7616a7322c..1f229d8317116 100644
--- a/libunwind/src/cet_unwind.h
+++ b/libunwind/src/shadow_stack_unwind.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LIBUNWIND_CET_UNWIND_H
-#define LIBUNWIND_CET_UNWIND_H
+#ifndef LIBUNWIND_SHADOW_STACK_UNWIND_H
+#define LIBUNWIND_SHADOW_STACK_UNWIND_H
 
 #include "libunwind.h"
 
@@ -21,7 +21,7 @@
 #include <cet.h>
 #include <immintrin.h>
 
-#define _LIBUNWIND_POP_CET_SSP(x)                                              \
+#define _LIBUNWIND_POP_SHSTK_SSP(x)                                            \
   do {                                                                         \
     unsigned long ssp = _get_ssp();                                            \
     if (ssp != 0) {                                                            \
@@ -46,7 +46,7 @@
 #define _LIBUNWIND_USE_GCS 1
 #endif
 
-#define _LIBUNWIND_POP_CET_SSP(x)                                              \
+#define _LIBUNWIND_POP_SHSTK_SSP(x)                                            \
   do {                                                                         \
     if (__chkfeat(_CHKFEAT_GCS)) {                                             \
       unsigned tmp = (x);                                                      \
@@ -57,7 +57,7 @@
 
 #endif
 
-extern void *__libunwind_cet_get_registers(unw_cursor_t *);
-extern void *__libunwind_cet_get_jump_target(void);
+extern void *__libunwind_shstk_get_registers(unw_cursor_t *);
+extern void *__libunwind_shstk_get_jump_target(void);
 
 #endif

>From 44dc5729b07890cbedae6b1bdb6fcef038021ebc Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot at gmail.com>
Date: Thu, 20 Feb 2025 08:12:26 +0000
Subject: [PATCH 02/26] [gn build] Port 507e413a2d85

---
 llvm/utils/gn/secondary/libunwind/src/BUILD.gn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/libunwind/src/BUILD.gn b/llvm/utils/gn/secondary/libunwind/src/BUILD.gn
index 2396300f94717..f63b22822ca96 100644
--- a/llvm/utils/gn/secondary/libunwind/src/BUILD.gn
+++ b/llvm/utils/gn/secondary/libunwind/src/BUILD.gn
@@ -40,11 +40,11 @@ unwind_sources = [
   "UnwindRegistersRestore.S",
   "UnwindRegistersSave.S",
   "assembly.h",
-  "cet_unwind.h",
   "config.h",
   "dwarf2.h",
   "libunwind.cpp",
   "libunwind_ext.h",
+  "shadow_stack_unwind.h",
 ]
 if (current_os == "aix") {
   unwind_sources += [ "Unwind_AIXExtras.cpp" ]

>From 611a648327e9f6dad174e5c4427b27b8b7830fc0 Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus at amd.com>
Date: Thu, 20 Feb 2025 09:25:48 +0100
Subject: [PATCH 03/26] [AMDGPU] Add llvm.amdgcn.dead intrinsic (#123190)

Shaders that use the llvm.amdgcn.init.whole.wave intrinsic need to
explicitly preserve the inactive lanes of VGPRs of interest by adding
them as dummy arguments. The code usually looks something like this:

```
define amdgcn_cs_chain void f(active vgpr args..., i32 %inactive.vgpr1, ..., i32 %inactive.vgprN) {
entry:
  %c = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %c, label %shader, label %tail

shader:
  [...]

tail:
  %inactive.vgpr.arg1 = phi i32 [ %inactive.vgpr1, %entry], [poison, %shader]
  [...]
  ; %inactive.vgpr* then get passed into a llvm.amdgcn.cs.chain call
```

Unfortunately, this kind of phi node will get optimized away and the
backend won't be able to figure out that it's ok to use the active lanes
of `%inactive.vgpr*` inside `shader`.

This patch fixes the issue by introducing a llvm.amdgcn.dead intrinsic,
whose result can be used as a PHI operand instead of the poison. This
will be selected to an IMPLICIT_DEF, which the backend can work with.

At the moment, the llvm.amdgcn.dead intrinsic works only on i32 values.
Support for other types can be added later if needed.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   7 +
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   6 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   1 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   6 +
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |   9 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll  |  64 ++++++++
 .../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 137 ++++++++++++++++++
 8 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 1e4f25c642493..876a6f816ad3f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3463,4 +3463,11 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
   [llvm_anyptr_ty], [llvm_anyptr_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+/// Make it clear to the backend that this value is really dead. For instance,
+/// when used as an input to a phi node, it will make it possible for the
+/// backend to allocate the dead lanes for operations within the corresponding
+/// incoming block.
+def int_amdgcn_dead: DefaultAttrsIntrinsic<[llvm_any_ty], [],
+    [IntrNoMem, IntrWillReturn, IntrNoCallback]>;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cf3843869808b..28c5a53508556 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1190,6 +1190,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   case Intrinsic::amdgcn_permlane16_swap:
   case Intrinsic::amdgcn_permlane32_swap:
     return selectPermlaneSwapIntrin(I, IntrinsicID);
+  case Intrinsic::amdgcn_dead: {
+    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+    I.removeOperand(1); // drop intrinsic ID
+    return RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+                                        AMDGPU::VGPR_32RegClass, *MRI);
+  }
   default:
     return selectImpl(I, *CoverageInfo);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2e5f42c3bdc40..2693ad3894cca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4676,6 +4676,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_set_inactive_chain_arg:
     case Intrinsic::amdgcn_permlane64:
     case Intrinsic::amdgcn_ds_bpermute_fi_b32:
+    case Intrinsic::amdgcn_dead:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_cvt_pkrtz:
       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 10175557fadc7..3b62dcf3c92cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -362,6 +362,8 @@ def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
 
+def : SourceOfDivergence<int_amdgcn_dead>;
+
 class AlwaysUniform<Intrinsic intr> {
   Intrinsic Intr = intr;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3faf0795157dc..598475763d02d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4276,3 +4276,9 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
   let hasSideEffects = 1;
   let SubtargetPredicate = isGFX10Plus;
 }
+
+// FIXME: Would be nice if we could set the register class for the destination
+// register too.
+def IMP_DEF_FROM_INTRINSIC: Pat<
+  (i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>;
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index aa5208560817f..bb840023daf5d 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -520,7 +520,12 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i
   ret void
 }
 
-
+; CHECK: DIVERGENT:   %v = call i32 @llvm.amdgcn.dead.i32()
+define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) {
+  %v = call i32 @llvm.amdgcn.dead.i32()
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
 
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -558,5 +563,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
 declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
 declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
 
+declare i32 @llvm.amdgcn.dead.i32()
+
 attributes #0 = { nounwind convergent }
 attributes #1 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
new file mode 100644
index 0000000000000..a009854542f21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-DAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s
+
+; Test that we can use v0 for temporaries in the if.then block.
+define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 {
+; ASM-DAG-LABEL: dead:
+; ASM-DAG:       ; %bb.0: ; %entry
+; ASM-DAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; ASM-DAG-NEXT:    s_wait_expcnt 0x0
+; ASM-DAG-NEXT:    s_wait_samplecnt 0x0
+; ASM-DAG-NEXT:    s_wait_bvhcnt 0x0
+; ASM-DAG-NEXT:    s_wait_kmcnt 0x0
+; ASM-DAG-NEXT:    v_mov_b32_e32 v4, v0
+; ASM-DAG-NEXT:    v_mov_b32_e32 v0, v1
+; ASM-DAG-NEXT:    s_mov_b32 s0, exec_lo
+; ASM-DAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-DAG-NEXT:    v_and_b32_e32 v1, 1, v4
+; ASM-DAG-NEXT:    v_cmpx_eq_u32_e32 1, v1
+; ASM-DAG-NEXT:    s_cbranch_execz .LBB0_2
+; ASM-DAG-NEXT:  ; %bb.1: ; %if.then
+; ASM-DAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; ASM-DAG-NEXT:    global_store_b32 v[2:3], v0, off
+; ASM-DAG-NEXT:    ; implicit-def: $vgpr0
+; ASM-DAG-NEXT:  .LBB0_2: ; %if.end
+; ASM-DAG-NEXT:    s_wait_alu 0xfffe
+; ASM-DAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; ASM-DAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; ASM-GISEL-LABEL: dead:
+; ASM-GISEL:       ; %bb.0: ; %entry
+; ASM-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; ASM-GISEL-NEXT:    s_wait_expcnt 0x0
+; ASM-GISEL-NEXT:    s_wait_samplecnt 0x0
+; ASM-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; ASM-GISEL-NEXT:    s_wait_kmcnt 0x0
+; ASM-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; ASM-GISEL-NEXT:    v_mov_b32_e32 v0, v1
+; ASM-GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; ASM-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-GISEL-NEXT:    v_and_b32_e32 v1, 1, v4
+; ASM-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; ASM-GISEL-NEXT:    s_cbranch_execz .LBB0_2
+; ASM-GISEL-NEXT:  ; %bb.1: ; %if.then
+; ASM-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; ASM-GISEL-NEXT:    global_store_b32 v[2:3], v0, off
+; ASM-GISEL-NEXT:    ; implicit-def: $vgpr0
+; ASM-GISEL-NEXT:  .LBB0_2: ; %if.end
+; ASM-GISEL-NEXT:    s_wait_alu 0xfffe
+; ASM-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; ASM-GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %dead = call i32 @llvm.amdgcn.dead.i32()
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %temp = add i32 %x, 1
+  store i32 %temp, ptr addrspace(1) %ptr1
+  br label %if.end
+
+if.end:
+  %res = phi i32 [ %x, %entry ], [ %dead, %if.then ]
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1bdaa4c98127d..110192ecefe55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -1115,4 +1115,141 @@ tail:
   unreachable
 }
 
+; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
+; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
+; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
+; it clear to the backend that it's safe to allocate v9's active lanes inside
+; shader. This is achieved by using the llvm.amdgcn.dead intrinsic.
+define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
+; GISEL12-LABEL: with_inactive_vgprs:
+; GISEL12:       ; %bb.0: ; %entry
+; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT:    s_wait_expcnt 0x0
+; GISEL12-NEXT:    s_wait_samplecnt 0x0
+; GISEL12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL12-NEXT:    s_wait_kmcnt 0x0
+; GISEL12-NEXT:    s_or_saveexec_b32 s6, -1
+; GISEL12-NEXT:    s_mov_b32 s4, s0
+; GISEL12-NEXT:    s_mov_b32 s5, s1
+; GISEL12-NEXT:    s_mov_b32 s0, s3
+; GISEL12-NEXT:    s_wait_alu 0xfffe
+; GISEL12-NEXT:    s_and_saveexec_b32 s1, s6
+; GISEL12-NEXT:    s_cbranch_execz .LBB6_2
+; GISEL12-NEXT:  ; %bb.1: ; %shader
+; GISEL12-NEXT:    v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GISEL12-NEXT:    flat_load_b32 v11, v[9:10]
+; GISEL12-NEXT:    ;;#ASMSTART
+; GISEL12-NEXT:    ; use v0-7
+; GISEL12-NEXT:    ;;#ASMEND
+; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT:    v_add_nc_u32_e32 v8, v8, v11
+; GISEL12-NEXT:    flat_store_b32 v[9:10], v11
+; GISEL12-NEXT:    ; implicit-def: $vgpr9
+; GISEL12-NEXT:  .LBB6_2: ; %tail.block
+; GISEL12-NEXT:    s_wait_alu 0xfffe
+; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GISEL12-NEXT:    s_mov_b32 exec_lo, s2
+; GISEL12-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL12-LABEL: with_inactive_vgprs:
+; DAGISEL12:       ; %bb.0: ; %entry
+; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT:    s_wait_expcnt 0x0
+; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
+; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
+; DAGISEL12-NEXT:    s_or_saveexec_b32 s6, -1
+; DAGISEL12-NEXT:    s_mov_b32 s5, s1
+; DAGISEL12-NEXT:    s_mov_b32 s4, s0
+; DAGISEL12-NEXT:    s_wait_alu 0xfffe
+; DAGISEL12-NEXT:    s_and_saveexec_b32 s0, s6
+; DAGISEL12-NEXT:    s_cbranch_execz .LBB6_2
+; DAGISEL12-NEXT:  ; %bb.1: ; %shader
+; DAGISEL12-NEXT:    v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; DAGISEL12-NEXT:    flat_load_b32 v11, v[9:10]
+; DAGISEL12-NEXT:    ;;#ASMSTART
+; DAGISEL12-NEXT:    ; use v0-7
+; DAGISEL12-NEXT:    ;;#ASMEND
+; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT:    v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL12-NEXT:    flat_store_b32 v[9:10], v11
+; DAGISEL12-NEXT:    ; implicit-def: $vgpr9
+; DAGISEL12-NEXT:  .LBB6_2: ; %tail.block
+; DAGISEL12-NEXT:    s_wait_alu 0xfffe
+; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL12-NEXT:    s_mov_b32 s0, s3
+; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s2
+; DAGISEL12-NEXT:    s_wait_alu 0xfffe
+; DAGISEL12-NEXT:    s_setpc_b64 s[4:5]
+;
+; GISEL10-LABEL: with_inactive_vgprs:
+; GISEL10:       ; %bb.0: ; %entry
+; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT:    s_or_saveexec_b32 s6, -1
+; GISEL10-NEXT:    s_mov_b32 s4, s0
+; GISEL10-NEXT:    s_mov_b32 s5, s1
+; GISEL10-NEXT:    s_mov_b32 s0, s3
+; GISEL10-NEXT:    s_and_saveexec_b32 s1, s6
+; GISEL10-NEXT:    s_cbranch_execz .LBB6_2
+; GISEL10-NEXT:  ; %bb.1: ; %shader
+; GISEL10-NEXT:    v_mov_b32_e32 v10, s5
+; GISEL10-NEXT:    v_mov_b32_e32 v9, s4
+; GISEL10-NEXT:    flat_load_dword v11, v[9:10]
+; GISEL10-NEXT:    ;;#ASMSTART
+; GISEL10-NEXT:    ; use v0-7
+; GISEL10-NEXT:    ;;#ASMEND
+; GISEL10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL10-NEXT:    v_add_nc_u32_e32 v8, v8, v11
+; GISEL10-NEXT:    flat_store_dword v[9:10], v11
+; GISEL10-NEXT:    ; implicit-def: $vgpr9
+; GISEL10-NEXT:  .LBB6_2: ; %tail.block
+; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GISEL10-NEXT:    s_mov_b32 exec_lo, s2
+; GISEL10-NEXT:    s_setpc_b64 s[4:5]
+;
+; DAGISEL10-LABEL: with_inactive_vgprs:
+; DAGISEL10:       ; %bb.0: ; %entry
+; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT:    s_or_saveexec_b32 s6, -1
+; DAGISEL10-NEXT:    s_mov_b32 s5, s1
+; DAGISEL10-NEXT:    s_mov_b32 s4, s0
+; DAGISEL10-NEXT:    s_and_saveexec_b32 s0, s6
+; DAGISEL10-NEXT:    s_cbranch_execz .LBB6_2
+; DAGISEL10-NEXT:  ; %bb.1: ; %shader
+; DAGISEL10-NEXT:    v_mov_b32_e32 v10, s5
+; DAGISEL10-NEXT:    v_mov_b32_e32 v9, s4
+; DAGISEL10-NEXT:    flat_load_dword v11, v[9:10]
+; DAGISEL10-NEXT:    ;;#ASMSTART
+; DAGISEL10-NEXT:    ; use v0-7
+; DAGISEL10-NEXT:    ;;#ASMEND
+; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT:    v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL10-NEXT:    flat_store_dword v[9:10], v11
+; DAGISEL10-NEXT:    ; implicit-def: $vgpr9
+; DAGISEL10-NEXT:  .LBB6_2: ; %tail.block
+; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL10-NEXT:    s_mov_b32 s0, s3
+; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s2
+; DAGISEL10-NEXT:    s_setpc_b64 s[4:5]
+entry:
+  %imp.def = call i32 @llvm.amdgcn.dead()
+  %initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %initial.exec, label %shader, label %tail.block
+
+shader:                                           ; preds = %entry
+  %use.another.vgpr = load i32, ptr %callee ; smth that won't be moved past the inline asm
+  call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
+  store i32 %use.another.vgpr, ptr %callee
+  %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
+  br label %tail.block
+
+tail.block:                                       ; preds = %.exit27, %.exit49, %244, %243, %entry
+  %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
+  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
+  %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
+  %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
+  call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32} %vgprs, i32 0)
+  unreachable
+}
+
 declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)

>From 3df03db689f1072d04a815a8893c395010988c53 Mon Sep 17 00:00:00 2001
From: Narayan <32898329+vortex73 at users.noreply.github.com>
Date: Thu, 20 Feb 2025 14:04:21 +0530
Subject: [PATCH 04/26] [CodeGen] Refactor `warn()` to use StringRef (NFCI)
 (#127537)

closes #100064
---
 llvm/include/llvm/CGData/CodeGenData.h | 2 +-
 llvm/lib/CGData/CodeGenData.cpp        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index da0e412f2a0e0..0e7dd2f5b5c1a 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -265,7 +265,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
 Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjectFiles);
 
 void warn(Error E, StringRef Whence = "");
-void warn(Twine Message, std::string Whence = "", std::string Hint = "");
+void warn(Twine Message, StringRef Whence = "", StringRef Hint = "");
 
 } // end namespace cgdata
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 88dcdfd1f931a..bb6b9c7721d55 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -204,7 +204,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
 
 namespace cgdata {
 
-void warn(Twine Message, std::string Whence, std::string Hint) {
+void warn(Twine Message, StringRef Whence, StringRef Hint) {
   WithColor::warning();
   if (!Whence.empty())
     errs() << Whence << ": ";
@@ -216,7 +216,7 @@ void warn(Twine Message, std::string Whence, std::string Hint) {
 void warn(Error E, StringRef Whence) {
   if (E.isA<CGDataError>()) {
     handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
-      warn(IPE.message(), Whence.str(), "");
+      warn(IPE.message(), Whence, "");
     });
   }
 }

>From 079115e6eac0412214104b593849ee805d8921ce Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Thu, 20 Feb 2025 08:36:46 +0000
Subject: [PATCH 05/26] [libclc] Move modf to the CLC library (#127828)

The "generic" unary_(def|decl)_with_ptr files are intended to be re-used
by the sincos and fract builtins in the future as they share an
identical type signature.
---
 libclc/clc/include/clc/math/clc_modf.h        | 11 +++++++
 .../include/clc/math/unary_decl_with_ptr.inc  |  6 ++++
 .../include/clc/math/unary_def_with_ptr.inc   | 20 ++++++++++++
 libclc/clc/lib/generic/SOURCES                |  1 +
 .../lib/generic/math/clc_modf.cl}             | 13 +++++---
 .../lib/generic/math/clc_modf.inc}            | 31 +++++++------------
 libclc/generic/include/clc/math/modf.h        |  5 ++-
 libclc/generic/lib/math/modf.cl               |  5 +--
 8 files changed, 65 insertions(+), 27 deletions(-)
 create mode 100644 libclc/clc/include/clc/math/clc_modf.h
 create mode 100644 libclc/clc/include/clc/math/unary_decl_with_ptr.inc
 create mode 100644 libclc/clc/include/clc/math/unary_def_with_ptr.inc
 rename libclc/{generic/include/clc/math/modf.inc => clc/lib/generic/math/clc_modf.cl} (76%)
 rename libclc/{generic/lib/math/modf.inc => clc/lib/generic/math/clc_modf.inc} (68%)

diff --git a/libclc/clc/include/clc/math/clc_modf.h b/libclc/clc/include/clc/math/clc_modf.h
new file mode 100644
index 0000000000000..45484b09628a4
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_modf.h
@@ -0,0 +1,11 @@
+#ifndef __CLC_MATH_CLC_MODF_H__
+#define __CLC_MATH_CLC_MODF_H__
+
+#define __CLC_FUNCTION __clc_modf
+#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc>
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_MODF_H__
diff --git a/libclc/clc/include/clc/math/unary_decl_with_ptr.inc b/libclc/clc/include/clc/math/unary_decl_with_ptr.inc
new file mode 100644
index 0000000000000..04122108bc1f7
--- /dev/null
+++ b/libclc/clc/include/clc/math/unary_decl_with_ptr.inc
@@ -0,0 +1,6 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     global __CLC_GENTYPE *ptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     local __CLC_GENTYPE *ptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
+__CLC_FUNCTION(__CLC_GENTYPE x, private __CLC_GENTYPE *ptr);
diff --git a/libclc/clc/include/clc/math/unary_def_with_ptr.inc b/libclc/clc/include/clc/math/unary_def_with_ptr.inc
new file mode 100644
index 0000000000000..de7c9af756980
--- /dev/null
+++ b/libclc/clc/include/clc/math/unary_def_with_ptr.inc
@@ -0,0 +1,20 @@
+#include <clc/utils.h>
+
+#ifndef __CLC_FUNCTION
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                              private __CLC_GENTYPE *ptr) {
+  return __CLC_FUNCTION(FUNCTION)(x, ptr);
+}
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                              global __CLC_GENTYPE *ptr) {
+  return __CLC_FUNCTION(FUNCTION)(x, ptr);
+}
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                              local __CLC_GENTYPE *ptr) {
+  return __CLC_FUNCTION(FUNCTION)(x, ptr);
+}
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index b0eaf84c41438..ef0ad006307d7 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -22,6 +22,7 @@ math/clc_copysign.cl
 math/clc_fabs.cl
 math/clc_floor.cl
 math/clc_mad.cl
+math/clc_modf.cl
 math/clc_nextafter.cl
 math/clc_rint.cl
 math/clc_trunc.cl
diff --git a/libclc/generic/include/clc/math/modf.inc b/libclc/clc/lib/generic/math/clc_modf.cl
similarity index 76%
rename from libclc/generic/include/clc/math/modf.inc
rename to libclc/clc/lib/generic/math/clc_modf.cl
index 42bcf625686d2..27d2a08515257 100644
--- a/libclc/generic/include/clc/math/modf.inc
+++ b/libclc/clc/lib/generic/math/clc_modf.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -20,6 +20,11 @@
  * THE SOFTWARE.
  */
 
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, global __CLC_GENTYPE *iptr);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, local __CLC_GENTYPE *iptr);
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, private __CLC_GENTYPE *iptr);
+#include <clc/internal/clc.h>
+#include <clc/math/clc_copysign.h>
+#include <clc/math/clc_trunc.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_isinf.h>
+
+#define __CLC_BODY <clc_modf.inc>
+#include <clc/math/gentype.inc>
diff --git a/libclc/generic/lib/math/modf.inc b/libclc/clc/lib/generic/math/clc_modf.inc
similarity index 68%
rename from libclc/generic/lib/math/modf.inc
rename to libclc/clc/lib/generic/math/clc_modf.inc
index ff7ef30dd42f8..8242291c98d4e 100644
--- a/libclc/generic/lib/math/modf.inc
+++ b/libclc/clc/lib/generic/math/clc_modf.inc
@@ -19,31 +19,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
-#if __CLC_FPSIZE == 64
-#define ZERO 0.0
-#elif __CLC_FPSIZE == 32
-#define ZERO 0.0f
-#elif __CLC_FPSIZE == 16
-#define ZERO 0.0h
-#endif
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x,
-                                          private __CLC_GENTYPE *iptr) {
-  *iptr = trunc(x);
-  return copysign(isinf(x) ? ZERO : x - *iptr, x);
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_modf(__CLC_GENTYPE x,
+                                                private __CLC_GENTYPE *iptr) {
+  *iptr = __clc_trunc(x);
+  return __clc_copysign(__clc_isinf(x) ? __CLC_FP_LIT(0.0) : x - *iptr, x);
 }
 
-#define MODF_DEF(addrspace)                                                    \
-  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x,                   \
-                                            addrspace __CLC_GENTYPE *iptr) {   \
+#define CLC_MODF_DEF(addrspace)                                                \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_modf(                             \
+      __CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) {                        \
     __CLC_GENTYPE private_iptr;                                                \
-    __CLC_GENTYPE ret = modf(x, &private_iptr);                                \
+    __CLC_GENTYPE ret = __clc_modf(x, &private_iptr);                          \
     *iptr = private_iptr;                                                      \
     return ret;                                                                \
   }
 
-MODF_DEF(local);
-MODF_DEF(global);
+CLC_MODF_DEF(local);
+CLC_MODF_DEF(global);
 
-#undef ZERO
+#undef CLC_MODF_DEF
diff --git a/libclc/generic/include/clc/math/modf.h b/libclc/generic/include/clc/math/modf.h
index f0fb6ca81920a..76eb1284432e4 100644
--- a/libclc/generic/include/clc/math/modf.h
+++ b/libclc/generic/include/clc/math/modf.h
@@ -20,5 +20,8 @@
  * THE SOFTWARE.
  */
 
-#define __CLC_BODY <clc/math/modf.inc>
+#define __CLC_FUNCTION modf
+#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc>
 #include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
diff --git a/libclc/generic/lib/math/modf.cl b/libclc/generic/lib/math/modf.cl
index 5098a41d079c5..5a01a316132e2 100644
--- a/libclc/generic/lib/math/modf.cl
+++ b/libclc/generic/lib/math/modf.cl
@@ -21,7 +21,8 @@
  */
 
 #include <clc/clc.h>
-#include <clc/math/math.h>
+#include <clc/math/clc_modf.h>
 
-#define __CLC_BODY <modf.inc>
+#define FUNCTION modf
+#define __CLC_BODY <clc/math/unary_def_with_ptr.inc>
 #include <clc/math/gentype.inc>

>From 684ad25dfc487476132b429dc92fca934460a8e3 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Thu, 20 Feb 2025 08:41:45 +0000
Subject: [PATCH 06/26] [libclc] Move frexp to CLC library; optimize half vecs
 (#127836)

This commit moves the frexp builtin to the CLC library.

It simultaneously optimizes the code generated for half vectors, which
was previously scalarizing and casting up to float. With this commit it
still casts up to float, but keeps it in the vector form.
---
 libclc/clc/include/clc/math/clc_frexp.h       | 11 +++
 .../clc/math/unary_decl_with_int_ptr.inc      |  6 ++
 .../clc/math/unary_def_with_int_ptr.inc       | 20 ++++
 .../clc/include/clc/relational/clc_select.h   |  6 +-
 libclc/clc/lib/generic/SOURCES                |  1 +
 libclc/clc/lib/generic/math/clc_frexp.cl      | 42 ++++++++
 libclc/clc/lib/generic/math/clc_frexp.inc     | 99 +++++++++++++++++++
 libclc/generic/lib/math/frexp.cl              | 17 +---
 libclc/generic/lib/math/frexp.inc             | 87 ----------------
 9 files changed, 183 insertions(+), 106 deletions(-)
 create mode 100644 libclc/clc/include/clc/math/clc_frexp.h
 create mode 100644 libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc
 create mode 100644 libclc/clc/include/clc/math/unary_def_with_int_ptr.inc
 create mode 100644 libclc/clc/lib/generic/math/clc_frexp.cl
 create mode 100644 libclc/clc/lib/generic/math/clc_frexp.inc
 delete mode 100644 libclc/generic/lib/math/frexp.inc

diff --git a/libclc/clc/include/clc/math/clc_frexp.h b/libclc/clc/include/clc/math/clc_frexp.h
new file mode 100644
index 0000000000000..f8a88ce69e154
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_frexp.h
@@ -0,0 +1,11 @@
+#ifndef __CLC_MATH_CLC_FREXP_H__
+#define __CLC_MATH_CLC_FREXP_H__
+
+#define __CLC_FUNCTION __clc_frexp
+#define __CLC_BODY <clc/math/unary_decl_with_int_ptr.inc>
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_FREXP_H__
diff --git a/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc b/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc
new file mode 100644
index 0000000000000..088e3bf122ee5
--- /dev/null
+++ b/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc
@@ -0,0 +1,6 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     global __CLC_INTN *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     local __CLC_INTN *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
+                                                     private __CLC_INTN *iptr);
diff --git a/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc b/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc
new file mode 100644
index 0000000000000..95f50c27bc34b
--- /dev/null
+++ b/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc
@@ -0,0 +1,20 @@
+#include <clc/utils.h>
+
+#ifndef __CLC_FUNCTION
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                              private __CLC_INTN *iptr) {
+  return __CLC_FUNCTION(FUNCTION)(x, iptr);
+}
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                              global __CLC_INTN *iptr) {
+  return __CLC_FUNCTION(FUNCTION)(x, iptr);
+}
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                              local __CLC_INTN *iptr) {
+  return __CLC_FUNCTION(FUNCTION)(x, iptr);
+}
diff --git a/libclc/clc/include/clc/relational/clc_select.h b/libclc/clc/include/clc/relational/clc_select.h
index a92f2051b577d..480a648c2efc0 100644
--- a/libclc/clc/include/clc/relational/clc_select.h
+++ b/libclc/clc/include/clc/relational/clc_select.h
@@ -1,9 +1,7 @@
 #ifndef __CLC_RELATIONAL_CLC_SELECT_H__
 #define __CLC_RELATIONAL_CLC_SELECT_H__
 
-/* Duplciate these so we don't have to distribute utils.h */
-#define __CLC_CONCAT(x, y) x##y
-#define __CLC_XCONCAT(x, y) __CLC_CONCAT(x, y)
+#include <clc/utils.h>
 
 #define __CLC_SELECT_FN __clc_select
 
@@ -13,7 +11,5 @@
 #include <clc/integer/gentype.inc>
 
 #undef __CLC_SELECT_FN
-#undef __CLC_CONCAT
-#undef __CLC_XCONCAT
 
 #endif // __CLC_RELATIONAL_CLC_SELECT_H__
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index ef0ad006307d7..f7fdba0a341ed 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -21,6 +21,7 @@ math/clc_ceil.cl
 math/clc_copysign.cl
 math/clc_fabs.cl
 math/clc_floor.cl
+math/clc_frexp.cl
 math/clc_mad.cl
 math/clc_modf.cl
 math/clc_nextafter.cl
diff --git a/libclc/clc/lib/generic/math/clc_frexp.cl b/libclc/clc/lib/generic/math/clc_frexp.cl
new file mode 100644
index 0000000000000..ecc3eb6281b1e
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_frexp.cl
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc_convert.h>
+#include <clc/internal/clc.h>
+#include <clc/math/math.h>
+#include <clc/relational/clc_select.h>
+#include <clc/utils.h>
+
+#define __CLC_BODY <clc_frexp.inc>
+#define __CLC_ADDRESS_SPACE private
+#include <clc/math/gentype.inc>
+#undef __CLC_ADDRESS_SPACE
+
+#define __CLC_BODY <clc_frexp.inc>
+#define __CLC_ADDRESS_SPACE global
+#include <clc/math/gentype.inc>
+#undef __CLC_ADDRESS_SPACE
+
+#define __CLC_BODY <clc_frexp.inc>
+#define __CLC_ADDRESS_SPACE local
+#include <clc/math/gentype.inc>
+#undef __CLC_ADDRESS_SPACE
diff --git a/libclc/clc/lib/generic/math/clc_frexp.inc b/libclc/clc/lib/generic/math/clc_frexp.inc
new file mode 100644
index 0000000000000..961b7848bf937
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_frexp.inc
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clcmacro.h>
+#include <clc/utils.h>
+
+#define __CLC_AS_GENTYPE __CLC_XCONCAT(__clc_as_, __CLC_GENTYPE)
+#define __CLC_AS_INTN __CLC_XCONCAT(__clc_as_, __CLC_INTN)
+
+#if __CLC_FPSIZE == 32
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
+  __CLC_INTN i = __CLC_AS_INTN(x);
+  __CLC_INTN ai = i & 0x7fffffff;
+  __CLC_INTN d = ai > 0 & ai < 0x00800000;
+  /* scale subnormal by 2^26 without multiplying */
+  __CLC_GENTYPE s = __CLC_AS_GENTYPE(ai | 0x0d800000) - 0x1.0p-100f;
+  ai = __clc_select(ai, __CLC_AS_INTN(s), d);
+  __CLC_INTN e =
+      (ai >> 23) - 126 - __clc_select((__CLC_INTN)0, (__CLC_INTN)26, d);
+  __CLC_INTN t = ai == (__CLC_INTN)0 | e == (__CLC_INTN)129;
+  i = (i & (__CLC_INTN)0x80000000) | (__CLC_INTN)0x3f000000 | (ai & 0x007fffff);
+  *ep = __clc_select(e, (__CLC_INTN)0, t);
+  return __clc_select(__CLC_AS_GENTYPE(i), x, t);
+}
+#endif
+
+#if __CLC_FPSIZE == 16
+#ifdef __CLC_SCALAR
+#define __CLC_CONVERT_HALFN __clc_convert_half
+#define __CLC_CONVERT_FLOATN __clc_convert_float
+#else
+#define __CLC_CONVERT_HALFN __CLC_XCONCAT(__clc_convert_half, __CLC_VECSIZE)
+#define __CLC_CONVERT_FLOATN __CLC_XCONCAT(__clc_convert_float, __CLC_VECSIZE)
+#endif
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
+  return __CLC_CONVERT_HALFN(__clc_frexp(__CLC_CONVERT_FLOATN(x), ep));
+}
+#undef __CLC_CONVERT_FLOATN
+#undef __CLC_CONVERT_HALFN
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+#define __CLC_AS_LONGN __clc_as_long
+#define __CLC_LONGN long
+#define __CLC_CONVERT_INTN __clc_convert_int
+#else
+#define __CLC_AS_LONGN __CLC_XCONCAT(__clc_as_long, __CLC_VECSIZE)
+#define __CLC_LONGN __CLC_XCONCAT(long, __CLC_VECSIZE)
+#define __CLC_CONVERT_INTN __CLC_XCONCAT(__clc_convert_int, __CLC_VECSIZE)
+#endif
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
+__clc_frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
+  __CLC_LONGN i = __CLC_AS_LONGN(x);
+  __CLC_LONGN ai = i & 0x7fffffffffffffffL;
+  __CLC_LONGN d = ai > 0 & ai < 0x0010000000000000L;
+  // scale subnormal by 2^54 without multiplying
+  __CLC_GENTYPE s = __CLC_AS_GENTYPE(ai | 0x0370000000000000L) - 0x1.0p-968;
+  ai = __clc_select(ai, __CLC_AS_LONGN(s), d);
+  __CLC_LONGN e = (ai >> 52) - (__CLC_LONGN)1022 -
+                  __clc_select((__CLC_LONGN)0, (__CLC_LONGN)54, d);
+  __CLC_LONGN t = ai == 0 | e == 1025;
+  i = (i & (__CLC_LONGN)0x8000000000000000L) |
+      (__CLC_LONGN)0x3fe0000000000000L |
+      (ai & (__CLC_LONGN)0x000fffffffffffffL);
+  *ep = __CLC_CONVERT_INTN(__clc_select(e, 0L, t));
+  return __clc_select(__CLC_AS_GENTYPE(i), x, t);
+}
+
+#undef __CLC_AS_LONGN
+#undef __CLC_LONGN
+#undef __CLC_CONVERT_INTN
+#endif
+
+#undef __CLC_AS_GENTYPE
+#undef __CLC_AS_INTN
diff --git a/libclc/generic/lib/math/frexp.cl b/libclc/generic/lib/math/frexp.cl
index 75a9158ff318f..fa6613ac27459 100644
--- a/libclc/generic/lib/math/frexp.cl
+++ b/libclc/generic/lib/math/frexp.cl
@@ -1,17 +1,6 @@
 #include <clc/clc.h>
-#include <clc/utils.h>
+#include <clc/math/clc_frexp.h>
 
-#define __CLC_BODY <frexp.inc>
-#define __CLC_ADDRESS_SPACE private
+#define FUNCTION frexp
+#define __CLC_BODY <clc/math/unary_def_with_int_ptr.inc>
 #include <clc/math/gentype.inc>
-#undef __CLC_ADDRESS_SPACE
-
-#define __CLC_BODY <frexp.inc>
-#define __CLC_ADDRESS_SPACE global
-#include <clc/math/gentype.inc>
-#undef __CLC_ADDRESS_SPACE
-
-#define __CLC_BODY <frexp.inc>
-#define __CLC_ADDRESS_SPACE local
-#include <clc/math/gentype.inc>
-#undef __CLC_ADDRESS_SPACE
diff --git a/libclc/generic/lib/math/frexp.inc b/libclc/generic/lib/math/frexp.inc
deleted file mode 100644
index 0d938d23c26a1..0000000000000
--- a/libclc/generic/lib/math/frexp.inc
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2014 Advanced Micro Devices, Inc.
- * Copyright (c) 2016 Aaron Watry
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <clc/clcmacro.h>
-
-#define __CLC_AS_GENTYPE __CLC_XCONCAT(as_, __CLC_GENTYPE)
-#define __CLC_AS_INTN __CLC_XCONCAT(as_, __CLC_INTN)
-
-#if __CLC_FPSIZE == 32
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
-    __CLC_INTN i = __CLC_AS_INTN(x);
-    __CLC_INTN ai = i & 0x7fffffff;
-    __CLC_INTN d = ai > 0 & ai < 0x00800000;
-    /* scale subnormal by 2^26 without multiplying */
-    __CLC_GENTYPE s = __CLC_AS_GENTYPE(ai | 0x0d800000) - 0x1.0p-100f;
-    ai = select(ai, __CLC_AS_INTN(s), d);
-    __CLC_INTN e = (ai >> 23) - 126 - select((__CLC_INTN)0, (__CLC_INTN)26, d);
-    __CLC_INTN t = ai == (__CLC_INTN)0 | e == (__CLC_INTN)129;
-    i = (i & (__CLC_INTN)0x80000000) | (__CLC_INTN)0x3f000000 | (ai & 0x007fffff);
-    *ep = select(e, (__CLC_INTN)0, t);
-    return select(__CLC_AS_GENTYPE(i), x, t);
-}
-#endif
-
-#if __CLC_FPSIZE == 16
-#ifdef __CLC_SCALAR
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x,
-                                           __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
-  return (__CLC_GENTYPE)frexp((float)x, ep);
-}
-_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, __CLC_GENTYPE, frexp,
-                      __CLC_GENTYPE, __CLC_ADDRESS_SPACE, __CLC_INTN);
-#endif
-#endif
-
-#if __CLC_FPSIZE == 64
-#ifdef __CLC_SCALAR
-#define __CLC_AS_LONGN as_long
-#define __CLC_LONGN long
-#define __CLC_CONVERT_INTN convert_int
-#else
-#define __CLC_AS_LONGN __CLC_XCONCAT(as_long, __CLC_VECSIZE)
-#define __CLC_LONGN __CLC_XCONCAT(long, __CLC_VECSIZE)
-#define __CLC_CONVERT_INTN __CLC_XCONCAT(convert_int, __CLC_VECSIZE)
-#endif
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE __CLC_INTN *ep) {
-    __CLC_LONGN i = __CLC_AS_LONGN(x);
-    __CLC_LONGN ai = i & 0x7fffffffffffffffL;
-    __CLC_LONGN d = ai > 0 & ai < 0x0010000000000000L;
-    // scale subnormal by 2^54 without multiplying
-    __CLC_GENTYPE s = __CLC_AS_GENTYPE(ai | 0x0370000000000000L) - 0x1.0p-968;
-    ai = select(ai, __CLC_AS_LONGN(s), d);
-    __CLC_LONGN e = (ai >> 52) - (__CLC_LONGN)1022 - select((__CLC_LONGN)0, (__CLC_LONGN)54, d);
-    __CLC_LONGN t = ai == 0 | e == 1025;
-    i = (i & (__CLC_LONGN)0x8000000000000000L) | (__CLC_LONGN)0x3fe0000000000000L | (ai & (__CLC_LONGN)0x000fffffffffffffL);
-    *ep = __CLC_CONVERT_INTN(select(e, 0L, t));
-    return select(__CLC_AS_GENTYPE(i), x, t);
-}
-
-#undef __CLC_AS_LONGN
-#undef __CLC_LONGN
-#undef __CLC_CONVERT_INTN
-#endif
-
-#undef __CLC_AS_GENTYPE
-#undef __CLC_AS_INTN

>From 0cd5a1f3090beee88d34d22c6733c6df473877db Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva at intel.com>
Date: Thu, 20 Feb 2025 09:45:46 +0100
Subject: [PATCH 07/26] [NFC][clang] Cleanup in APValue and SemaInit (#127790)

APValue:
Additional assignment of AllowConstexprUnknown is not required since it
will be handled by copy constructor called above.
SemaInit:
Remove unnecessary null check. DestRecordDecl can't be null due to being
obtained using `cast` and assertion that DestRecordType is present.

Spotted by a static analysis tool.
---
 clang/lib/AST/APValue.cpp   | 1 -
 clang/lib/Sema/SemaInit.cpp | 1 -
 2 files changed, 2 deletions(-)

diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp
index 3b814be266330..7c33d3a165a08 100644
--- a/clang/lib/AST/APValue.cpp
+++ b/clang/lib/AST/APValue.cpp
@@ -390,7 +390,6 @@ APValue &APValue::operator=(const APValue &RHS) {
   if (this != &RHS)
     *this = APValue(RHS);
 
-  AllowConstexprUnknown = RHS.AllowConstexprUnknown;
   return *this;
 }
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index a34005bf376aa..340e51adf190d 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -4577,7 +4577,6 @@ static void TryConstructorInitialization(Sema &S,
     if (!IsListInit &&
         (Kind.getKind() == InitializationKind::IK_Default ||
          Kind.getKind() == InitializationKind::IK_Direct) &&
-        DestRecordDecl != nullptr &&
         !(CtorDecl->isCopyOrMoveConstructor() && CtorDecl->isImplicit()) &&
         DestRecordDecl->isAggregate() &&
         DestRecordDecl->hasUninitializedExplicitInitFields()) {

>From 62d77fcb3cebe80b3cf88588b5f94778799e86f3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 20 Feb 2025 09:11:29 +0000
Subject: [PATCH 08/26] [X86] combineX86ShuffleChain - don't combine to
 VPERM2W/VPERM2B from just any single variable mask (#127914)

Despite them being more expensive than other variable mask shuffles, we
were combining shuffle chains to VPERM2W/VPERM2B if any shuffle in the
chain was a variable shuffle - including very cheap shuffles like PSHUFB
or AND mask patterns.

This patch adjusts the BWI VPERMV3 threshold - it still always permits
the merge if the chain (of 2 or more shuffles) contains any
X86ISD::VPERMV/VPERMV3 shuffles (including DQ variants), but otherwise
only reduces the depth threshold based off the number of other variable
shuffles we'd fold away.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 16 +++--
 .../X86/avx512-shuffles/partial_permute.ll    | 54 ++++++++++-----
 .../vector-interleaved-load-i16-stride-6.ll   | 10 ++-
 .../vector-interleaved-load-i16-stride-7.ll   | 60 ++++++++--------
 .../vector-interleaved-store-i16-stride-4.ll  | 12 ++--
 .../CodeGen/X86/vector-shuffle-128-v16.ll     | 44 +++---------
 .../test/CodeGen/X86/vector-shuffle-128-v8.ll | 68 +++++++++++++------
 .../CodeGen/X86/vector-shuffle-256-v32.ll     | 24 +++++--
 .../CodeGen/X86/vector-shuffle-512-v64.ll     |  5 +-
 .../zero_extend_vector_inreg_of_broadcast.ll  | 21 +++---
 ...d_vector_inreg_of_broadcast_from_memory.ll | 24 +++----
 11 files changed, 187 insertions(+), 151 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d805a76754c71..429e2b42ab5ca 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40064,9 +40064,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   if (Depth < 1)
     return SDValue();
 
-  bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
+  int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
     return isTargetShuffleVariableMask(N->getOpcode());
   });
+  bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
+    return (N->getOpcode() == X86ISD::VPERMV3 ||
+            N->getOpcode() == X86ISD::VPERMV);
+  });
 
   // Depth threshold above which we can efficiently use variable mask shuffles.
   int VariableCrossLaneShuffleDepth =
@@ -40074,13 +40078,15 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   int VariablePerLaneShuffleDepth =
       Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
   AllowVariableCrossLaneMask &=
-      (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
+      (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
   AllowVariablePerLaneMask &=
-      (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
-  // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
+      (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
+  // VPERM2W/VPERM2B are 3 uops on Skylake and Icelake so we require a
   // higher depth before combining them.
+  int BWIVPERMV3ShuffleDepth =
+      VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
   bool AllowBWIVPERMV3 =
-      (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
+      (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
 
   // If root was a VPERMV3 node, always allow a variable shuffle.
   if (Root.getOpcode() == X86ISD::VPERMV3)
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index fd9b46e82e0b1..a84424bf7dea9 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -225,15 +225,25 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16
 }
 
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9]
-; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9]
+; CHECK-FAST-NEXT:    vptestnmw %xmm1, %xmm1, %k1
+; CHECK-FAST-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vpsrld $16, (%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 16(%rdi), %xmm3
+; CHECK-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vptestnmw %xmm1, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -242,14 +252,24 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16>
 }
 
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9]
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9]
+; CHECK-FAST-NEXT:    vptestnmw %xmm0, %xmm0, %k1
+; CHECK-FAST-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vpsrld $16, (%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 16(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT:    vptestnmw %xmm0, %xmm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <16 x i16>, ptr %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 9d0183c816b12..feb75b21d5c8d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -293,8 +293,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512BW-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11]
-; AVX512BW-FCP-NEXT:    vpermw (%rdi), %ymm5, %ymm5
+; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
 ; AVX512BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -307,7 +307,6 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%r8)
 ; AVX512BW-FCP-NEXT:    vmovd %xmm6, (%r9)
 ; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%rax)
-; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride6_vf2:
@@ -347,8 +346,8 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 4(%rdi), %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [3,9,1,9,2,10,3,11]
-; AVX512DQ-BW-FCP-NEXT:    vpermw (%rdi), %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 20(%rdi), %xmm6
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -361,7 +360,6 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%r8)
 ; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm6, (%r9)
 ; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <12 x i16>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 95b5ffde48564..038c73bd9fed2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -321,23 +321,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpsrld $16, %xmm0, %xmm3
 ; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm6
-; AVX512BW-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm7
-; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512BW-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15]
-; AVX512BW-FCP-NEXT:    vpermw (%rdi), %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm8
+; AVX512BW-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm9
+; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512BW-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512BW-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
 ; AVX512BW-FCP-NEXT:    vmovd %xmm4, (%rdx)
-; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%rcx)
-; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%r8)
-; AVX512BW-FCP-NEXT:    vmovd %xmm6, (%r9)
-; AVX512BW-FCP-NEXT:    vmovd %xmm1, (%r10)
-; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%rax)
-; AVX512BW-FCP-NEXT:    vzeroupper
+; AVX512BW-FCP-NEXT:    vmovd %xmm7, (%rcx)
+; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%r8)
+; AVX512BW-FCP-NEXT:    vmovd %xmm8, (%r9)
+; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%r10)
+; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: load_i16_stride7_vf2:
@@ -379,23 +379,23 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpsrld $16, %xmm0, %xmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [6,13,5,13,6,14,7,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw (%rdi), %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw 8(%rdi), %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpsrlq $48, %xmm1, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
 ; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm1, (%r10)
-; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vzeroupper
+; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm7, (%rcx)
+; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%r8)
+; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm8, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%r10)
+; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <14 x i16>, ptr %in.vec, align 64
   %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 0, i32 7>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 71eb606a8665d..187a8102095ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -123,9 +123,9 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
-; AVX512BW-NEXT:    vpermi2w %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa %xmm2, (%r8)
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
+; AVX512BW-NEXT:    vmovdqa %xmm0, (%r8)
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: store_i16_stride4_vf2:
@@ -145,9 +145,9 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
-; AVX512DQ-BW-NEXT:    vpermi2w %xmm1, %xmm0, %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%r8)
+; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
+; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%r8)
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride4_vf2:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 9fd8c11ba6c4d..b1c90aa8021b8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -495,23 +495,11 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20]
-; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX2OR512VL-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
 ; XOP:       # %bb.0:
@@ -853,23 +841,11 @@ define <16 x i8> @shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,20,2,20,2,20,2,20,2,20,2,20,2,20,2,20]
-; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX2OR512VL-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; XOP:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index d73cfb379333b..212cde9fcd6b2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -7,8 +7,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX2,AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX2,AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST,AVX512VL-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL,AVX512VL-FAST,AVX512VL-FAST-PERLANE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=AVX,XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=AVX,XOP,XOPAVX2
 
@@ -1155,8 +1155,8 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba:
 ; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,0,9,8,3,2,11,10]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
 ; AVX512VL-FAST-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v8i16_109832ba:
@@ -1246,11 +1246,18 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512VL-SLOW-NEXT:    retq
 ;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512VL-FAST-PERLANE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-FAST-PERLANE-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v8i16_0213cedf:
 ; XOP:       # %bb.0:
@@ -1314,8 +1321,8 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX:
 ; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [4,4,3,10,4,5,6,7]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
 ; AVX512VL-FAST-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v8i16_443aXXXX:
@@ -1542,11 +1549,23 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
 ; AVX2-NEXT:    retq
 ;
-; AVX512VL-LABEL: shuffle_v8i16_012dcde3:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3]
-; AVX512VL-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dcde3:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8i16_012dcde3:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i16_012dcde3:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v8i16_012dcde3:
 ; XOP:       # %bb.0:
@@ -1645,11 +1664,18 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512VL-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
 ; AVX512VL-SLOW-NEXT:    retq
 ;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-FAST-ALL:       # %bb.0:
+; AVX512VL-FAST-ALL-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9]
+; AVX512VL-FAST-ALL-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512VL-FAST-ALL-NEXT:    retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-FAST-PERLANE:       # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,8,9,10,11,14,15,u,u]
+; AVX512VL-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-FAST-PERLANE-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v8i16_XXX1X579:
 ; XOP:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 176ba696e6540..d287fb6d5b834 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4641,11 +4641,25 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLVBMI-SLOW:       # %bb.0:
+; AVX512VLVBMI-SLOW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
+; AVX512VLVBMI-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLVBMI-SLOW-NEXT:    retq
+;
+; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLVBMI-FAST-ALL:       # %bb.0:
+; AVX512VLVBMI-FAST-ALL-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI-FAST-ALL-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-FAST-ALL-NEXT:    retq
+;
+; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLVBMI-FAST-PERLANE:       # %bb.0:
+; AVX512VLVBMI-FAST-PERLANE-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
+; AVX512VLVBMI-FAST-PERLANE-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLVBMI-FAST-PERLANE-NEXT:    retq
 ;
 ; XOP-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; XOP:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 4fe50a60b67fa..34f94bcfe3e6f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -494,9 +494,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
-; AVX512VBMI-NEXT:    vpermt2b %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zmm0[13],zero,zmm0[11],zero,zmm0[9],zero,zmm0[7],zero,zmm0[5],zero,zmm0[3],zero,zmm0[1],zero,zmm0[31],zero,zmm0[29],zero,zmm0[27],zero,zmm0[25],zero,zmm0[23],zero,zmm0[21],zero,zmm0[19],zero,zmm0[17],zero,zmm0[47],zero,zmm0[45],zero,zmm0[43],zero,zmm0[41],zero,zmm0[39],zero,zmm0[37],zero,zmm0[35],zero,zmm0[33],zero,zmm0[63],zero,zmm0[61],zero,zmm0[59],zero,zmm0[57],zero,zmm0[55],zero,zmm0[53],zero,zmm0[51],zero,zmm0[49],zero
 ; AVX512VBMI-NEXT:    retq
   %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 63, i32 64, i32 61, i32 64, i32 59, i32 64, i32 57, i32 64, i32 55, i32 64, i32 53, i32 64, i32 51, i32 64, i32 49, i32 64, i32 47, i32 64, i32 45, i32 64, i32 43, i32 64, i32 41, i32 64, i32 39, i32 64, i32 37, i32 64, i32 35, i32 64, i32 33, i32 64, i32 31, i32 64, i32 29, i32 64, i32 27, i32 64, i32 25, i32 64, i32 23, i32 64, i32 21, i32 64, i32 19, i32 64, i32 17, i32 64, i32 15, i32 64, i32 13, i32 64, i32 11, i32 64, i32 9, i32 64, i32 7, i32 64, i32 5, i32 64, i32 3, i32 64, i32 1, i32 64>
   ret <64 x i8> %shuffle
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ec7a708fc0b02..758061d456807 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -6181,10 +6181,9 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -6300,10 +6299,9 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -6419,10 +6417,9 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63]
-; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 14c2a60a5b998..a33c4a7e85954 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -4945,10 +4945,10 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [32,1,32,3,32,5,32,7,32,9,32,11,32,13,32,15,32,17,32,19,32,21,32,23,32,25,32,27,32,29,32,31]
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -5048,10 +5048,10 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [32,1,2,3,32,5,6,7,32,9,10,11,32,13,14,15,32,17,18,19,32,21,22,23,32,25,26,27,32,29,30,31]
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -5152,10 +5152,10 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,32,25,26,27,28,29,30,31]
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpermt2w (%rdi), %zmm0, %zmm1
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm1, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq

>From 2c022e3617ec9bab3c9ab17464610843563ed9ed Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel at labath.sk>
Date: Thu, 20 Feb 2025 10:17:44 +0100
Subject: [PATCH 09/26] [lldb] Replace LineTable::upper_bound with
 GetLineEntryIndexRange (#127806)

After (too) much deliberation, I came to the conclusion that this isn't
the right abstraction, as it doesn't behave completely like the standard
upper_bound method. What I really wanted was to get the range of line
entries for a given address range -- so I implement just that.

lower_bound is still useful as a primitive for building other kinds of
lookups.
---
 lldb/include/lldb/Symbol/LineTable.h    | 23 ++++----
 lldb/source/Symbol/LineTable.cpp        | 24 ++++----
 lldb/unittests/Symbol/LineTableTest.cpp | 74 ++++++++++++++++---------
 3 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/lldb/include/lldb/Symbol/LineTable.h b/lldb/include/lldb/Symbol/LineTable.h
index f66081b6ee110..c1a973635cdd4 100644
--- a/lldb/include/lldb/Symbol/LineTable.h
+++ b/lldb/include/lldb/Symbol/LineTable.h
@@ -102,18 +102,19 @@ class LineTable {
 
   void GetDescription(Stream *s, Target *target, lldb::DescriptionLevel level);
 
-  /// Helper function for line table iteration. \c lower_bound returns the index
-  /// of the first line entry which ends after the given address (i.e., the
-  /// first entry which contains the given address or it comes after it).
-  /// \c upper_bound returns the index of the first line entry which begins on
-  /// or after the given address (i.e., the entry which would come after the
-  /// entry containing the given address, if such an entry exists). Functions
-  /// return <tt>GetSize()</tt> if there is no such entry. The functions are
-  /// most useful in combination: iterating from <tt>lower_bound(a)</tt> to
-  /// <tt>upper_bound(b) returns all line tables which intersect the half-open
-  /// range <tt>[a,b)</tt>.
+  /// Returns the index of the first line entry which ends after the given
+  /// address (i.e., the first entry which contains the given address or it
+  /// comes after it). Returns <tt>GetSize()</tt> if there is no such entry.
   uint32_t lower_bound(const Address &so_addr) const;
-  uint32_t upper_bound(const Address &so_addr) const;
+
+  /// Returns the (half-open) range of line entry indexes which overlap the
+  /// given address range. Line entries partially overlapping the range (on
+  /// either side) are included as well. Returns an empty range
+  /// (<tt>first==second</tt>) pointing to the "right" place in the list if
+  /// there are no such line entries. Empty input ranges always result in an
+  /// empty output range.
+  std::pair<uint32_t, uint32_t>
+  GetLineEntryIndexRange(const AddressRange &range) const;
 
   /// Find a line entry that contains the section offset address \a so_addr.
   ///
diff --git a/lldb/source/Symbol/LineTable.cpp b/lldb/source/Symbol/LineTable.cpp
index aae4ab59ff156..c5914a2719cc9 100644
--- a/lldb/source/Symbol/LineTable.cpp
+++ b/lldb/source/Symbol/LineTable.cpp
@@ -206,25 +206,27 @@ uint32_t LineTable::lower_bound(const Address &so_addr) const {
   return std::distance(m_entries.begin(), pos);
 }
 
-uint32_t LineTable::upper_bound(const Address &so_addr) const {
-  if (so_addr.GetModule() != m_comp_unit->GetModule())
-    return GetSize();
+std::pair<uint32_t, uint32_t>
+LineTable::GetLineEntryIndexRange(const AddressRange &range) const {
+  uint32_t first = lower_bound(range.GetBaseAddress());
+  if (first >= GetSize() || range.GetByteSize() == 0)
+    return {first, first};
 
   Entry search_entry;
-  search_entry.file_addr = so_addr.GetFileAddress();
-  if (search_entry.file_addr == LLDB_INVALID_ADDRESS)
-    return GetSize();
+  search_entry.file_addr =
+      range.GetBaseAddress().GetFileAddress() + range.GetByteSize();
 
-  // This is not a typo. lower_bound returns the first entry which starts on or
-  // after the given address, which is exactly what we want -- *except* if the
-  // entry is a termination entry (in that case, we want the one after it).
+  // lower_bound returns the first entry which starts on or after the given
+  // address, which is exactly what we want -- *except* if the entry is a
+  // termination entry (in that case, we want the one after it).
   auto pos =
-      llvm::lower_bound(m_entries, search_entry, Entry::EntryAddressLessThan);
+      std::lower_bound(std::next(m_entries.begin(), first), m_entries.end(),
+                       search_entry, Entry::EntryAddressLessThan);
   if (pos != m_entries.end() && pos->file_addr == search_entry.file_addr &&
       pos->is_terminal_entry)
     ++pos;
 
-  return std::distance(m_entries.begin(), pos);
+  return {first, std::distance(m_entries.begin(), pos)};
 }
 
 bool LineTable::FindLineEntryByAddress(const Address &so_addr,
diff --git a/lldb/unittests/Symbol/LineTableTest.cpp b/lldb/unittests/Symbol/LineTableTest.cpp
index 2fa2913f67f9e..ef5493138f318 100644
--- a/lldb/unittests/Symbol/LineTableTest.cpp
+++ b/lldb/unittests/Symbol/LineTableTest.cpp
@@ -194,7 +194,7 @@ CreateFakeModule(std::vector<std::unique_ptr<LineSequence>> line_sequences) {
                            std::move(text_sp), line_table};
 }
 
-TEST_F(LineTableTest, LowerAndUpperBound) {
+TEST_F(LineTableTest, lower_bound) {
   LineSequenceBuilder builder;
   builder.Entry(0);
   builder.Entry(10);
@@ -211,41 +211,63 @@ TEST_F(LineTableTest, LowerAndUpperBound) {
 
   auto make_addr = [&](addr_t addr) { return Address(fixture->text_sp, addr); };
 
-  // Both functions return the same value for boundary values. This way the
-  // index range for e.g. [0,10) is [0,1).
   EXPECT_EQ(table->lower_bound(make_addr(0)), 0u);
-  EXPECT_EQ(table->upper_bound(make_addr(0)), 0u);
+  EXPECT_EQ(table->lower_bound(make_addr(9)), 0u);
   EXPECT_EQ(table->lower_bound(make_addr(10)), 1u);
-  EXPECT_EQ(table->upper_bound(make_addr(10)), 1u);
+  EXPECT_EQ(table->lower_bound(make_addr(19)), 1u);
+
+  // Skips over the terminal entry.
   EXPECT_EQ(table->lower_bound(make_addr(20)), 3u);
-  EXPECT_EQ(table->upper_bound(make_addr(20)), 3u);
+  EXPECT_EQ(table->lower_bound(make_addr(29)), 3u);
 
-  // In case there's no "real" entry at this address, they return the first real
-  // entry.
+  // In case there's no "real" entry at this address, the function returns the
+  // first real entry.
   EXPECT_EQ(table->lower_bound(make_addr(30)), 5u);
-  EXPECT_EQ(table->upper_bound(make_addr(30)), 5u);
-
   EXPECT_EQ(table->lower_bound(make_addr(40)), 5u);
-  EXPECT_EQ(table->upper_bound(make_addr(40)), 5u);
-
-  // For in-between values, their result differs by one. [9,19) maps to [0,2)
-  // because the first two entries contain a part of that range.
-  EXPECT_EQ(table->lower_bound(make_addr(9)), 0u);
-  EXPECT_EQ(table->upper_bound(make_addr(9)), 1u);
-  EXPECT_EQ(table->lower_bound(make_addr(19)), 1u);
-  EXPECT_EQ(table->upper_bound(make_addr(19)), 2u);
-  EXPECT_EQ(table->lower_bound(make_addr(29)), 3u);
-  EXPECT_EQ(table->upper_bound(make_addr(29)), 4u);
 
-  // In a gap, they both return the first entry after the gap.
-  EXPECT_EQ(table->upper_bound(make_addr(39)), 5u);
-  EXPECT_EQ(table->upper_bound(make_addr(39)), 5u);
+  // In a gap, return the first entry after the gap.
+  EXPECT_EQ(table->lower_bound(make_addr(39)), 5u);
 
-  // And if there's no such entry, they return the size of the list.
+  // And if there's no such entry, return the size of the list.
   EXPECT_EQ(table->lower_bound(make_addr(50)), table->GetSize());
-  EXPECT_EQ(table->upper_bound(make_addr(50)), table->GetSize());
   EXPECT_EQ(table->lower_bound(make_addr(59)), table->GetSize());
-  EXPECT_EQ(table->upper_bound(make_addr(59)), table->GetSize());
+}
+
+TEST_F(LineTableTest, GetLineEntryIndexRange) {
+  LineSequenceBuilder builder;
+  builder.Entry(0);
+  builder.Entry(10);
+  builder.Entry(20, LineSequenceBuilder::Terminal);
+
+  llvm::Expected<FakeModuleFixture> fixture = CreateFakeModule(builder.Build());
+  ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded());
+
+  LineTable *table = fixture->line_table;
+
+  auto make_range = [&](addr_t addr, addr_t size) {
+    return AddressRange(fixture->text_sp, addr, size);
+  };
+
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(0, 10)),
+              testing::Pair(0, 1));
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(0, 20)),
+              testing::Pair(0, 3)); // Includes the terminal entry.
+  // Partial overlap on one side.
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(3, 7)),
+              testing::Pair(0, 1));
+  // On the other side
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(0, 15)),
+              testing::Pair(0, 2));
+  // On both sides
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(2, 3)),
+              testing::Pair(0, 1));
+  // Empty ranges
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(0, 0)),
+              testing::Pair(0, 0));
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(5, 0)),
+              testing::Pair(0, 0));
+  EXPECT_THAT(table->GetLineEntryIndexRange(make_range(10, 0)),
+              testing::Pair(1, 1));
 }
 
 TEST_F(LineTableTest, FindLineEntryByAddress) {

>From e264317b45163f5c3ba7fc5375dcdecd827fce95 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel at labath.sk>
Date: Thu, 20 Feb 2025 10:25:59 +0100
Subject: [PATCH 10/26] [lldb] Fix
 RangeDataVector::CombineConsecutiveEntriesWithEqualData (#127059)

Function was merging equal data even if they weren't adjecant. This
caused a problem in command-disassemble.s test because the two ranges
describing the function would be merged and "swallow" the function
between them.

This PR copies/adapts the algorithm from
RangeVector::CombineConsecutiveEntries (which does not have the same
problem) and also adds a call to ComputeUpperBounds as moving entries
around invalidates the binary tree. (The lack of this call wasn't
noticed until now either because we were not calling methods which rely
on upper bounds (right now, it's only the ill-named FindEntryIndexes
method), or because we weren't merging anything.
---
 lldb/include/lldb/Utility/RangeMap.h          | 47 ++++++++-----------
 .../test/Shell/Commands/command-disassemble.s |  3 +-
 lldb/unittests/Utility/RangeMapTest.cpp       | 21 +++++++++
 3 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h
index 433466eebced8..8af690e813c4a 100644
--- a/lldb/include/lldb/Utility/RangeMap.h
+++ b/lldb/include/lldb/Utility/RangeMap.h
@@ -493,36 +493,27 @@ class RangeDataVector {
 #ifdef ASSERT_RANGEMAP_ARE_SORTED
     assert(IsSorted());
 #endif
-    typename Collection::iterator pos;
-    typename Collection::iterator end;
-    typename Collection::iterator prev;
-    bool can_combine = false;
-    // First we determine if we can combine any of the Entry objects so we
-    // don't end up allocating and making a new collection for no reason
-    for (pos = m_entries.begin(), end = m_entries.end(), prev = end; pos != end;
-         prev = pos++) {
-      if (prev != end && prev->data == pos->data) {
-        can_combine = true;
-        break;
-      }
-    }
+    auto first_intersect = std::adjacent_find(
+        m_entries.begin(), m_entries.end(), [](const Entry &a, const Entry &b) {
+          return a.DoesAdjoinOrIntersect(b) && a.data == b.data;
+        });
 
-    // We can combine at least one entry, then we make a new collection and
-    // populate it accordingly, and then swap it into place.
-    if (can_combine) {
-      Collection minimal_ranges;
-      for (pos = m_entries.begin(), end = m_entries.end(), prev = end;
-           pos != end; prev = pos++) {
-        if (prev != end && prev->data == pos->data)
-          minimal_ranges.back().SetRangeEnd(pos->GetRangeEnd());
-        else
-          minimal_ranges.push_back(*pos);
-      }
-      // Use the swap technique in case our new vector is much smaller. We must
-      // swap when using the STL because std::vector objects never release or
-      // reduce the memory once it has been allocated/reserved.
-      m_entries.swap(minimal_ranges);
+    if (first_intersect == m_entries.end())
+      return;
+
+    // We can combine at least one entry. Make a new collection and populate it
+    // accordingly, and then swap it into place.
+    auto pos = std::next(first_intersect);
+    Collection minimal_ranges(m_entries.begin(), pos);
+    for (; pos != m_entries.end(); ++pos) {
+      Entry &back = minimal_ranges.back();
+      if (back.DoesAdjoinOrIntersect(*pos) && back.data == pos->data)
+        back.SetRangeEnd(std::max(back.GetRangeEnd(), pos->GetRangeEnd()));
+      else
+        minimal_ranges.push_back(*pos);
     }
+    m_entries.swap(minimal_ranges);
+    ComputeUpperBounds(0, m_entries.size());
   }
 
   void Clear() { m_entries.clear(); }
diff --git a/lldb/test/Shell/Commands/command-disassemble.s b/lldb/test/Shell/Commands/command-disassemble.s
index eb84a9ce39d4a..14f416d221231 100644
--- a/lldb/test/Shell/Commands/command-disassemble.s
+++ b/lldb/test/Shell/Commands/command-disassemble.s
@@ -94,8 +94,7 @@
 # CHECK-EMPTY:
 # CHECK-NEXT: command-disassemble.s.tmp`n2::case3:
 # CHECK-NEXT: command-disassemble.s.tmp[0x9046] <+0>: jmp 0x6046 ; <-12288>
-## FIXME: This should resolve to `middle_of_case3`
-# CHECK-NEXT: command-disassemble.s.tmp[0x904b] <+5>: jmp 0x7046 ; n2::case3 - 8192
+# CHECK-NEXT: command-disassemble.s.tmp[0x904b] <+5>: jmp 0x7046 ; middle_of_case3
 # CHECK-NEXT: command-disassemble.s.tmp[0x9050] <+10>: int    $0x2a
 # CHECK-EMPTY:
 # CHECK-NEXT: command-disassemble.s.tmp`n1::case3:
diff --git a/lldb/unittests/Utility/RangeMapTest.cpp b/lldb/unittests/Utility/RangeMapTest.cpp
index 981fa2a7d1c34..2022a2374fb8d 100644
--- a/lldb/unittests/Utility/RangeMapTest.cpp
+++ b/lldb/unittests/Utility/RangeMapTest.cpp
@@ -238,3 +238,24 @@ TEST(RangeDataVector, FindEntryIndexesThatContain_Overlap) {
   EXPECT_THAT(FindEntryIndexes(39, Map), testing::ElementsAre(10));
   EXPECT_THAT(FindEntryIndexes(40, Map), testing::ElementsAre());
 }
+
+TEST(RangeDataVector, CombineConsecutiveEntriesWithEqualData) {
+  RangeDataVectorT Map;
+  Map.Append(EntryT(0, 10, 47));
+  Map.Append(EntryT(10, 10, 47));
+  Map.Sort();
+  Map.CombineConsecutiveEntriesWithEqualData();
+  EXPECT_THAT(FindEntryIndexes(5, Map), testing::ElementsAre(47));
+  EXPECT_THAT(FindEntryIndexes(15, Map), testing::ElementsAre(47));
+  EXPECT_THAT(FindEntryIndexes(25, Map), testing::ElementsAre());
+
+  Map.Clear();
+  Map.Append(EntryT(0, 10, 47));
+  Map.Append(EntryT(20, 10, 47));
+  Map.Sort();
+  Map.CombineConsecutiveEntriesWithEqualData();
+  EXPECT_THAT(FindEntryIndexes(5, Map), testing::ElementsAre(47));
+  EXPECT_THAT(FindEntryIndexes(15, Map), testing::ElementsAre());
+  EXPECT_THAT(FindEntryIndexes(25, Map), testing::ElementsAre(47));
+  EXPECT_THAT(FindEntryIndexes(35, Map), testing::ElementsAre());
+}

>From 55fa2fa3480928d2ce6e06336842f673ac92f9d3 Mon Sep 17 00:00:00 2001
From: Dmitry Sidorov <dmitry.sidorov at intel.com>
Date: Thu, 20 Feb 2025 10:27:15 +0100
Subject: [PATCH 11/26] [SPIR-V] Add SPV_INTEL_bindless_images extension
 (#127737)

Adds instructions to convert convert unsigned integer handles to images,
samplers and sampled images.

Spec:

https://github.com/intel/llvm/blob/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_bindless_images.asciidoc

---------

Signed-off-by: Sidorov, Dmitry <dmitry.sidorov at intel.com>
---
 llvm/docs/SPIRVUsage.rst                      |  2 ++
 llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp       | 31 +++++++++++++++++
 llvm/lib/Target/SPIRV/SPIRVBuiltins.td        |  6 ++++
 llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp    |  2 ++
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.td       |  8 +++++
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 11 ++++++
 .../lib/Target/SPIRV/SPIRVSymbolicOperands.td |  2 ++
 .../bindless_images_generic.ll                | 34 +++++++++++++++++++
 8 files changed, 96 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/bindless_images_generic.ll

diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index b7601b26beb89..93c53a04bc447 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -155,6 +155,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
      - Adds atomic min and max instruction on floating-point numbers.
    * - ``SPV_INTEL_arbitrary_precision_integers``
      - Allows generating arbitrary width integer types.
+   * - ``SPV_INTEL_bindless_images``
+     - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images.
    * - ``SPV_INTEL_bfloat16_conversion``
      - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values.
    * - ``SPV_INTEL_cache_controls``
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 7b897f7e34c6f..473fcec8c4dea 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1043,6 +1043,24 @@ static bool buildExtendedBitOpsInst(const SPIRV::IncomingCall *Call,
   return true;
 }
 
+/// Helper function for building Intel's bindless image instructions.
+static bool buildBindlessImageINTELInst(const SPIRV::IncomingCall *Call,
+                                        unsigned Opcode,
+                                        MachineIRBuilder &MIRBuilder,
+                                        SPIRVGlobalRegistry *GR) {
+  // Generate SPIRV instruction accordingly.
+  if (Call->isSpirvOp())
+    return buildOpFromWrapper(MIRBuilder, Opcode, Call,
+                              GR->getSPIRVTypeID(Call->ReturnType));
+
+  auto MIB = MIRBuilder.buildInstr(Opcode)
+                 .addDef(Call->ReturnRegister)
+                 .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+                 .addUse(Call->Arguments[0]);
+
+  return true;
+}
+
 static unsigned getNumComponentsForDim(SPIRV::Dim::Dim dim) {
   switch (dim) {
   case SPIRV::Dim::DIM_1D:
@@ -2232,6 +2250,17 @@ static bool generateExtendedBitOpsInst(const SPIRV::IncomingCall *Call,
   return buildExtendedBitOpsInst(Call, Opcode, MIRBuilder, GR);
 }
 
+static bool generateBindlessImageINTELInst(const SPIRV::IncomingCall *Call,
+                                           MachineIRBuilder &MIRBuilder,
+                                           SPIRVGlobalRegistry *GR) {
+  // Lookup the instruction opcode in the TableGen records.
+  const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+  unsigned Opcode =
+      SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+
+  return buildBindlessImageINTELInst(Call, Opcode, MIRBuilder, GR);
+}
+
 static bool buildNDRange(const SPIRV::IncomingCall *Call,
                          MachineIRBuilder &MIRBuilder,
                          SPIRVGlobalRegistry *GR) {
@@ -2809,6 +2838,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
     return generateCoopMatrInst(Call.get(), MIRBuilder, GR);
   case SPIRV::ExtendedBitOps:
     return generateExtendedBitOpsInst(Call.get(), MIRBuilder, GR);
+  case SPIRV::BindlessINTEL:
+    return generateBindlessImageINTELInst(Call.get(), MIRBuilder, GR);
   }
   return false;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 85f42fc08a4e0..c9a5c92ee3a66 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -66,6 +66,7 @@ def Construct : BuiltinGroup;
 def CoopMatr : BuiltinGroup;
 def ICarryBorrow : BuiltinGroup;
 def ExtendedBitOps : BuiltinGroup;
+def BindlessINTEL : BuiltinGroup;
 
 //===----------------------------------------------------------------------===//
 // Class defining a demangled builtin record. The information in the record
@@ -708,6 +709,11 @@ defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixStoreCheckedINTEL", Open
 defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixConstructCheckedINTEL", OpenCL_std, CoopMatr, 5, 5, OpCooperativeMatrixConstructCheckedINTEL>;
 defm : DemangledNativeBuiltin<"__spirv_CooperativeMatrixGetElementCoordINTEL", OpenCL_std, CoopMatr, 2, 2, OpCooperativeMatrixGetElementCoordINTEL>;
 
+// SPV_INTEL_bindless_images builtin records:
+defm : DemangledNativeBuiltin<"__spirv_ConvertHandleToImageINTEL", OpenCL_std, BindlessINTEL, 1, 1, OpConvertHandleToImageINTEL>;
+defm : DemangledNativeBuiltin<"__spirv_ConvertHandleToSamplerINTEL", OpenCL_std, BindlessINTEL, 1, 1, OpConvertHandleToSamplerINTEL>;
+defm : DemangledNativeBuiltin<"__spirv_ConvertHandleToSampledImageINTEL", OpenCL_std, BindlessINTEL, 1, 1, OpConvertHandleToSampledImageINTEL>;
+
 //===----------------------------------------------------------------------===//
 // Class defining a work/sub group builtin that should be translated into a
 // SPIR-V instruction using the defined properties.
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 13683fd9a266d..2472e92b360be 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -71,6 +71,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
          SPIRV::Extension::Extension::SPV_KHR_linkonce_odr},
         {"SPV_INTEL_inline_assembly",
          SPIRV::Extension::Extension::SPV_INTEL_inline_assembly},
+        {"SPV_INTEL_bindless_images",
+         SPIRV::Extension::Extension::SPV_INTEL_bindless_images},
         {"SPV_INTEL_bfloat16_conversion",
          SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion},
         {"SPV_KHR_subgroup_rotate",
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 981e224a66399..65cf14fc93864 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -931,3 +931,11 @@ def OpCooperativeMatrixPrefetchINTEL: Op<6449, (outs),
 // SPV_EXT_arithmetic_fence
 def OpArithmeticFenceEXT: Op<6145, (outs ID:$res), (ins TYPE:$type, ID:$target),
                   "$res = OpArithmeticFenceEXT $type $target">;
+
+// SPV_INTEL_bindless_images
+def OpConvertHandleToImageINTEL: Op<6529, (outs ID:$res), (ins TYPE:$type, ID:$operand),
+                  "$res = OpConvertHandleToImageINTEL $type $operand">;
+def OpConvertHandleToSamplerINTEL: Op<6530, (outs ID:$res), (ins TYPE:$type, ID:$operand),
+                  "$res = OpConvertHandleToSamplerINTEL $type $operand">;
+def OpConvertHandleToSampledImageINTEL: Op<6531, (outs ID:$res), (ins TYPE:$type, ID:$operand),
+                  "$res = OpConvertHandleToSampledImageINTEL $type $operand">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index a7a5ecead6f5f..407034702426c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1677,6 +1677,17 @@ void addInstrRequirements(const MachineInstr &MI,
     Reqs.addCapability(
         SPIRV::Capability::CooperativeMatrixInvocationInstructionsINTEL);
     break;
+  case SPIRV::OpConvertHandleToImageINTEL:
+  case SPIRV::OpConvertHandleToSamplerINTEL:
+  case SPIRV::OpConvertHandleToSampledImageINTEL:
+    if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bindless_images))
+      report_fatal_error("OpConvertHandleTo[Image/Sampler/SampledImage]INTEL "
+                         "instructions require the following SPIR-V extension: "
+                         "SPV_INTEL_bindless_images",
+                         false);
+    Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bindless_images);
+    Reqs.addCapability(SPIRV::Capability::BindlessImagesINTEL);
+    break;
   case SPIRV::OpKill: {
     Reqs.addCapability(SPIRV::Capability::Shader);
   } break;
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index fec3cb0091bf5..ada19a40c42f1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -309,6 +309,7 @@ defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>;
 defm SPV_EXT_optnone : ExtensionOperand<113>;
 defm SPV_INTEL_joint_matrix : ExtensionOperand<114>;
 defm SPV_INTEL_float_controls2 : ExtensionOperand<115>;
+defm SPV_INTEL_bindless_images : ExtensionOperand<116>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define Capabilities enum values and at the same time
@@ -505,6 +506,7 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0,
 defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>;
 defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>;
 defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>;
+defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define SourceLanguage enum values and at the same time
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/bindless_images_generic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/bindless_images_generic.ll
new file mode 100644
index 0000000000000..3963180e51172
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/bindless_images_generic.ll
@@ -0,0 +1,34 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bindless_images %s -o - | FileCheck %s
+
+; CHECK-ERROR: LLVM ERROR: OpConvertHandleTo[Image/Sampler/SampledImage]INTEL instruction
+; CHECK-ERROR-SAME: require the following SPIR-V extension: SPV_INTEL_bindless_images
+
+; CHECK: OpCapability BindlessImagesINTEL
+; CHECK: OpExtension "SPV_INTEL_bindless_images"
+
+; CHECK-DAG: %[[#VoidTy:]] = OpTypeVoid
+; CHECK-DAG: %[[#Int64Ty:]] = OpTypeInt 64
+; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#Int64Ty]] 42
+; CHECK-DAG: %[[#Const43:]] = OpConstant %[[#Int64Ty]] 43
+; CHECK-DAG: %[[#IntImgTy:]] = OpTypeImage %[[#Int64Ty]]
+; CHECK-DAG: %[[#SamplerTy:]] = OpTypeSampler
+; CHECK-DAG: %[[#IntSmpImgTy:]] = OpTypeImage %[[#Int64Ty]]
+; CHECK-DAG: %[[#SampImageTy:]] = OpTypeSampledImage %[[#IntSmpImgTy]]
+; CHECK: %[[#Input:]] = OpFunctionParameter %[[#Int64Ty]]
+; CHECK: %[[#]] = OpConvertHandleToImageINTEL %[[#IntImgTy]] %[[#Input]]
+; CHECK: %[[#]] = OpConvertHandleToSamplerINTEL %[[#SamplerTy]] %[[#Const42]]
+; CHECK: %[[#]] = OpConvertHandleToSampledImageINTEL %[[#SampImageTy]] %[[#Const43]]
+
+define spir_func void @foo(i64 %in) {
+  %img = call spir_func target("spirv.Image", i64, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELl(i64 %in)
+  %samp = call spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64 42)
+  %sampImage = call spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64 43)
+  ret void
+}
+
+declare spir_func target("spirv.Image", i64, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELl(i64)
+
+declare spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64)
+
+declare spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64)

>From 268def27f5053b3a7795443990c54c9ddaec24ab Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 11 Dec 2024 08:51:55 +0000
Subject: [PATCH 12/26] [CodeGen][NewPM] Port RegAllocGreedy to NPM

---
 llvm/include/llvm/CodeGen/MachineFunction.h   |   1 +
 llvm/include/llvm/CodeGen/Passes.h            |   2 +-
 llvm/include/llvm/InitializePasses.h          |   2 +-
 .../llvm/Passes/MachinePassRegistry.def       |   9 +
 llvm/lib/CodeGen/CodeGen.cpp                  |   2 +-
 llvm/lib/CodeGen/RegAllocGreedy.cpp           | 185 ++++++++++++++----
 llvm/lib/CodeGen/RegAllocGreedy.h             |  57 +++---
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 8 files changed, 196 insertions(+), 63 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index f1e595cde54e3..7fd0994883fe8 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -927,6 +927,7 @@ class LLVM_ABI MachineFunction {
 
   /// Run the current MachineFunction through the machine code verifier, useful
   /// for debugger use.
+  /// TODO: Add the param LiveStks
   /// \returns true if no problems were found.
   bool verify(LiveIntervals *LiveInts, SlotIndexes *Indexes,
               const char *Banner = nullptr, raw_ostream *OS = nullptr,
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index b5d2a7e6bf035..0182f21bee5f5 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -171,7 +171,7 @@ namespace llvm {
   extern char &LiveRangeShrinkID;
 
   /// Greedy register allocator.
-  extern char &RAGreedyID;
+  extern char &RAGreedyLegacyID;
 
   /// Basic register allocator.
   extern char &RABasicID;
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 5b30eb53208a8..69c9e14541907 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -248,7 +248,7 @@ void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &);
 void initializePromoteLegacyPassPass(PassRegistry &);
 void initializeRABasicPass(PassRegistry &);
 void initializePseudoProbeInserterPass(PassRegistry &);
-void initializeRAGreedyPass(PassRegistry &);
+void initializeRAGreedyLegacyPass(PassRegistry &);
 void initializeReachingDefAnalysisPass(PassRegistry &);
 void initializeReassociateLegacyPassPass(PassRegistry &);
 void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 373bd047e2395..78b4c8153e26b 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -194,6 +194,15 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS(
       return parseRegAllocFastPassOptions(*PB, Params);
     },
     "filter=reg-filter;no-clear-vregs")
+
+MACHINE_FUNCTION_PASS_WITH_PARAMS(
+    "regallocgreedy", "RAGreedy",
+    [](RegAllocFilterFunc F) { return RAGreedyPass(F); },
+    [PB = this](StringRef Params) {
+      // TODO: parseRegAllocFilter(*PB, Params);
+      return Expected<RegAllocFilterFunc>(nullptr);
+    }, ""
+)
 #undef MACHINE_FUNCTION_PASS_WITH_PARAMS
 
 // After a pass is converted to new pass manager, its entry should be moved from
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 35df2a479a545..21f76bdb2ad6b 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -112,7 +112,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
   initializeProcessImplicitDefsPass(Registry);
   initializeRABasicPass(Registry);
-  initializeRAGreedyPass(Registry);
+  initializeRAGreedyLegacyPass(Registry);
   initializeRegAllocFastPass(Registry);
   initializeRegUsageInfoCollectorLegacyPass(Registry);
   initializeRegUsageInfoPropagationLegacyPass(Registry);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index bd81d630f9d1f..f4cc80c751350 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -43,8 +43,10 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegAllocEvictionAdvisor.h"
+#include "llvm/CodeGen/RegAllocGreedyPass.h"
 #include "llvm/CodeGen/RegAllocPriorityAdvisor.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
@@ -55,6 +57,7 @@
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
@@ -146,11 +149,134 @@ static cl::opt<unsigned> SplitThresholdForRegWithHint(
 static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
                                        createGreedyRegisterAllocator);
 
-char RAGreedy::ID = 0;
-char &llvm::RAGreedyID = RAGreedy::ID;
+namespace {
+class RAGreedyLegacy : public MachineFunctionPass {
+  RegAllocFilterFunc F;
 
-INITIALIZE_PASS_BEGIN(RAGreedy, "greedy",
-                "Greedy Register Allocator", false, false)
+public:
+  RAGreedyLegacy(const RegAllocFilterFunc F = nullptr);
+
+  static char ID;
+  /// Return the pass name.
+  StringRef getPassName() const override { return "Greedy Register Allocator"; }
+
+  /// RAGreedy analysis usage.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  /// Perform register allocation.
+  bool runOnMachineFunction(MachineFunction &mf) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoPHIs);
+  }
+
+  MachineFunctionProperties getClearedProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
+  }
+};
+
+} // end anonymous namespace
+
+RAGreedyLegacy::RAGreedyLegacy(const RegAllocFilterFunc F)
+    : MachineFunctionPass(ID), F(F) {
+  initializeRAGreedyLegacyPass(*PassRegistry::getPassRegistry());
+}
+
+RAGreedy::RAGreedy(const RegAllocFilterFunc F) : RegAllocBase(F) {}
+
+void RAGreedy::setAnalyses(RequiredAnalyses &Analyses) {
+  VRM = Analyses.VRM;
+  LIS = Analyses.LIS;
+  Matrix = Analyses.LRM;
+  Indexes = Analyses.Indexes;
+  MBFI = Analyses.MBFI;
+  DomTree = Analyses.DomTree;
+  Loops = Analyses.Loops;
+  ORE = Analyses.ORE;
+  Bundles = Analyses.Bundles;
+  SpillPlacer = Analyses.SpillPlacer;
+  DebugVars = Analyses.DebugVars;
+  LSS = Analyses.LSS;
+  EvictProvider = Analyses.EvictProvider;
+  PriorityProvider = Analyses.PriorityProvider;
+}
+
+PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
+                                    MachineFunctionAnalysisManager &MFAM) {
+  MFPropsModifier _(*this, MF);
+
+  RAGreedy Impl(Filter);
+  RAGreedy::RequiredAnalyses Analyses;
+
+  Analyses.VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
+  Analyses.LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  Analyses.LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  Analyses.LSS = &MFAM.getResult<LiveStacksAnalysis>(MF);
+  Analyses.Indexes = &MFAM.getResult<SlotIndexesAnalysis>(MF);
+  Analyses.MBFI = &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+  Analyses.DomTree = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+  Analyses.ORE = &MFAM.getResult<MachineOptimizationRemarkEmitterAnalysis>(MF);
+  Analyses.Loops = &MFAM.getResult<MachineLoopAnalysis>(MF);
+  Analyses.Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF);
+  Analyses.SpillPlacer = &MFAM.getResult<SpillPlacementAnalysis>(MF);
+  Analyses.DebugVars = &MFAM.getResult<LiveDebugVariablesAnalysis>(MF);
+  Analyses.EvictProvider =
+      MFAM.getResult<RegAllocEvictionAdvisorAnalysis>(MF).Provider;
+  Analyses.PriorityProvider =
+      MFAM.getResult<RegAllocPriorityAdvisorAnalysis>(MF).Provider;
+
+  Impl.setAnalyses(Analyses);
+  bool Changed = Impl.run(MF);
+  if (!Changed)
+    return PreservedAnalyses::all();
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  PA.preserve<MachineBlockFrequencyAnalysis>();
+  PA.preserve<LiveIntervalsAnalysis>();
+  PA.preserve<SlotIndexesAnalysis>();
+  PA.preserve<LiveDebugVariablesAnalysis>();
+  PA.preserve<LiveStacksAnalysis>();
+  PA.preserve<MachineDominatorTreeAnalysis>();
+  PA.preserve<MachineLoopAnalysis>();
+  PA.preserve<VirtRegMapAnalysis>();
+  PA.preserve<LiveRegMatrixAnalysis>();
+  return PA;
+}
+
+bool RAGreedyLegacy::runOnMachineFunction(MachineFunction &MF) {
+  RAGreedy Impl(F);
+
+  RAGreedy::RequiredAnalyses Analyses;
+  Analyses.VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  Analyses.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  Analyses.LSS = &getAnalysis<LiveStacksWrapperLegacy>().getLS();
+  Analyses.LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  Analyses.Indexes = &getAnalysis<SlotIndexesWrapperPass>().getSI();
+  Analyses.MBFI =
+      &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
+  Analyses.DomTree =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  Analyses.ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+  Analyses.Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  Analyses.Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+  Analyses.SpillPlacer =
+      &getAnalysis<SpillPlacementWrapperLegacy>().getResult();
+  Analyses.DebugVars = &getAnalysis<LiveDebugVariablesWrapperLegacy>().getLDV();
+  Analyses.EvictProvider =
+      getAnalysis<RegAllocEvictionAdvisorAnalysisLegacy>().getProvider().get();
+  Analyses.PriorityProvider =
+      &getAnalysis<RegAllocPriorityAdvisorAnalysisLegacy>().getProvider();
+
+  Impl.setAnalyses(Analyses);
+  return Impl.run(MF);
+}
+
+char RAGreedyLegacy::ID = 0;
+char &llvm::RAGreedyLegacyID = RAGreedyLegacy::ID;
+
+INITIALIZE_PASS_BEGIN(RAGreedyLegacy, "greedy", "Greedy Register Allocator",
+                      false, false)
 INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
@@ -166,8 +292,8 @@ INITIALIZE_PASS_DEPENDENCY(SpillPlacementWrapperLegacy)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
 INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysisLegacy)
 INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysisLegacy)
-INITIALIZE_PASS_END(RAGreedy, "greedy",
-                "Greedy Register Allocator", false, false)
+INITIALIZE_PASS_END(RAGreedyLegacy, "greedy", "Greedy Register Allocator",
+                    false, false)
 
 #ifndef NDEBUG
 const char *const RAGreedy::StageName[] = {
@@ -186,17 +312,14 @@ const char *const RAGreedy::StageName[] = {
 const float Hysteresis = (2007 / 2048.0f); // 0.97998046875
 
 FunctionPass* llvm::createGreedyRegisterAllocator() {
-  return new RAGreedy();
+  return new RAGreedyLegacy();
 }
 
 FunctionPass *llvm::createGreedyRegisterAllocator(RegAllocFilterFunc Ftor) {
-  return new RAGreedy(Ftor);
+  return new RAGreedyLegacy(Ftor);
 }
 
-RAGreedy::RAGreedy(RegAllocFilterFunc F)
-    : MachineFunctionPass(ID), RegAllocBase(F) {}
-
-void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
+void RAGreedyLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
   AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
@@ -1057,7 +1180,8 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
   }
 
   if (VerifyEnabled)
-    MF->verify(this, "After splitting live range around region", &errs());
+    MF->verify(LIS, Indexes, "After splitting live range around region",
+               &errs());
 }
 
 MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg,
@@ -1326,7 +1450,8 @@ Register RAGreedy::tryBlockSplit(const LiveInterval &VirtReg,
   }
 
   if (VerifyEnabled)
-    MF->verify(this, "After splitting live range around basic blocks", &errs());
+    MF->verify(LIS, Indexes, "After splitting live range around basic blocks",
+               &errs());
   return Register();
 }
 
@@ -2524,7 +2649,7 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
       DebugVars->splitRegister(r, LRE.regs(), *LIS);
 
     if (VerifyEnabled)
-      MF->verify(this, "After spilling", &errs());
+      MF->verify(LIS, Indexes, "After spilling", &errs());
   }
 
   // The live virtual register requesting allocation was spilled, so tell
@@ -2720,7 +2845,7 @@ bool RAGreedy::hasVirtRegAlloc() {
   return false;
 }
 
-bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
+bool RAGreedy::run(MachineFunction &mf) {
   LLVM_DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
                     << "********** Function: " << mf.getName() << '\n');
 
@@ -2728,29 +2853,18 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   TII = MF->getSubtarget().getInstrInfo();
 
   if (VerifyEnabled)
-    MF->verify(this, "Before greedy register allocator", &errs());
+    MF->verify(LIS, Indexes, "Before greedy register allocator", &errs());
 
-  RegAllocBase::init(getAnalysis<VirtRegMapWrapperLegacy>().getVRM(),
-                     getAnalysis<LiveIntervalsWrapperPass>().getLIS(),
-                     getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM());
+  RegAllocBase::init(*this->VRM, *this->LIS, *this->Matrix);
 
   // Early return if there is no virtual register to be allocated to a
   // physical register.
   if (!hasVirtRegAlloc())
     return false;
 
-  Indexes = &getAnalysis<SlotIndexesWrapperPass>().getSI();
   // Renumber to get accurate and consistent results from
   // SlotIndexes::getApproxInstrDistance.
   Indexes->packIndexes();
-  MBFI = &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
-  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
-  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
-  Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
-  SpillPlacer = &getAnalysis<SpillPlacementWrapperLegacy>().getResult();
-  DebugVars = &getAnalysis<LiveDebugVariablesWrapperLegacy>().getLDV();
-  auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
 
   initializeCSRCost();
 
@@ -2766,17 +2880,12 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
 
   ExtraInfo.emplace();
 
-  auto &EvictAdvisorProvider =
-      getAnalysis<RegAllocEvictionAdvisorAnalysisLegacy>().getProvider();
-  EvictAdvisor = EvictAdvisorProvider.getAdvisor(*MF, *this, MBFI, Loops);
-
-  PriorityAdvisor = getAnalysis<RegAllocPriorityAdvisorAnalysisLegacy>()
-                        .getProvider()
-                        .getAdvisor(*MF, *this, *Indexes);
+  EvictAdvisor = EvictProvider->getAdvisor(*MF, *this, MBFI, Loops);
+  PriorityAdvisor = PriorityProvider->getAdvisor(*MF, *this, *Indexes);
 
   VRAI = std::make_unique<VirtRegAuxInfo>(*MF, *LIS, *VRM, *Loops, *MBFI);
   SpillerInstance.reset(
-      createInlineSpiller({*LIS, LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI));
+      createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI));
 
   VRAI->calculateSpillWeightsAndHints();
 
@@ -2793,7 +2902,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   tryHintsRecoloring();
 
   if (VerifyEnabled)
-    MF->verify(this, "Before post optimization", &errs());
+    MF->verify(LIS, Indexes, "Before post optimization", &errs());
   postOptimization();
   reportStats();
 
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 1698607984bcd..7586d6abd18f5 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -25,13 +25,15 @@
 #include "llvm/CodeGen/LiveDebugVariables.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStacks.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocEvictionAdvisor.h"
 #include "llvm/CodeGen/RegAllocPriorityAdvisor.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/SpillPlacement.h"
 #include "llvm/CodeGen/Spiller.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/PassManager.h"
 #include <algorithm>
 #include <cstdint>
 #include <memory>
@@ -56,11 +58,30 @@ class SlotIndexes;
 class TargetInstrInfo;
 class VirtRegMap;
 
-class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
-                                         public RegAllocBase,
+class LLVM_LIBRARY_VISIBILITY RAGreedy : public RegAllocBase,
                                          private LiveRangeEdit::Delegate {
-  // Interface to eviction advisers
 public:
+  struct RequiredAnalyses {
+    VirtRegMap *VRM = nullptr;
+    LiveIntervals *LIS = nullptr;
+    LiveRegMatrix *LRM = nullptr;
+    SlotIndexes *Indexes = nullptr;
+    MachineBlockFrequencyInfo *MBFI = nullptr;
+    MachineDominatorTree *DomTree = nullptr;
+    MachineLoopInfo *Loops = nullptr;
+    MachineOptimizationRemarkEmitter *ORE = nullptr;
+    EdgeBundles *Bundles = nullptr;
+    SpillPlacement *SpillPlacer = nullptr;
+    LiveDebugVariables *DebugVars = nullptr;
+
+    // Used by InlineSpiller
+    LiveStacks *LSS;
+    // Proxies for eviction and priority advisors
+    RegAllocEvictionAdvisorProvider *EvictProvider;
+    RegAllocPriorityAdvisorProvider *PriorityProvider;
+  };
+
+  // Interface to eviction advisers
   /// Track allocation stage and eviction loop prevention during allocation.
   class ExtraRegInfo final {
     // RegInfo - Keep additional information about each live range.
@@ -178,6 +199,10 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
   EdgeBundles *Bundles = nullptr;
   SpillPlacement *SpillPlacer = nullptr;
   LiveDebugVariables *DebugVars = nullptr;
+  LiveStacks *LSS = nullptr; // Used by InlineSpiller
+  // Proxy for the advisors
+  RegAllocEvictionAdvisorProvider *EvictProvider = nullptr;
+  RegAllocPriorityAdvisorProvider *PriorityProvider = nullptr;
 
   // state
   std::unique_ptr<Spiller> SpillerInstance;
@@ -282,13 +307,11 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
 
 public:
   RAGreedy(const RegAllocFilterFunc F = nullptr);
+  // Evict and priority advisors use this object, so we can construct those
+  // first and pass them here.
+  // Not required once legacy PM is removed.
+  void setAnalyses(RequiredAnalyses &Analyses);
 
-  /// Return the pass name.
-  StringRef getPassName() const override { return "Greedy Register Allocator"; }
-
-  /// RAGreedy analysis usage.
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  void releaseMemory() override;
   Spiller &spiller() override { return *SpillerInstance; }
   void enqueueImpl(const LiveInterval *LI) override;
   const LiveInterval *dequeue() override;
@@ -297,19 +320,9 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
   void aboutToRemoveInterval(const LiveInterval &) override;
 
   /// Perform register allocation.
-  bool runOnMachineFunction(MachineFunction &mf) override;
-
-  MachineFunctionProperties getRequiredProperties() const override {
-    return MachineFunctionProperties().set(
-        MachineFunctionProperties::Property::NoPHIs);
-  }
-
-  MachineFunctionProperties getClearedProperties() const override {
-    return MachineFunctionProperties().set(
-        MachineFunctionProperties::Property::IsSSA);
-  }
+  bool run(MachineFunction &mf);
 
-  static char ID;
+  void releaseMemory();
 
 private:
   MCRegister selectOrSplitImpl(const LiveInterval &,
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 5bb2e7d0abdd9..455fac776597d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -129,6 +129,7 @@
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/CodeGen/RegAllocEvictionAdvisor.h"
 #include "llvm/CodeGen/RegAllocFast.h"
+#include "llvm/CodeGen/RegAllocGreedyPass.h"
 #include "llvm/CodeGen/RegAllocPriorityAdvisor.h"
 #include "llvm/CodeGen/RegUsageInfoCollector.h"
 #include "llvm/CodeGen/RegUsageInfoPropagate.h"

>From cf059dadbccb7a5988bde84bf2a90ac891b2ffc0 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Fri, 13 Dec 2024 10:59:04 +0000
Subject: [PATCH 13/26] Move pass to a new header, add options

---
 .../include/llvm/CodeGen/RegAllocGreedyPass.h | 43 +++++++++++++++++++
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  1 +
 .../llvm/Passes/MachinePassRegistry.def       | 11 +++--
 llvm/lib/CodeGen/RegAllocGreedy.cpp           |  7 ++-
 4 files changed, 55 insertions(+), 7 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/RegAllocGreedyPass.h

diff --git a/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h b/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
new file mode 100644
index 0000000000000..f325224c5384c
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
@@ -0,0 +1,43 @@
+//==- RegAllocGreedyPass.h --- greedy register allocator pass ------*-C++-*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
+#include "llvm/CodeGen/RegAllocFast.h"
+#include "llvm/IR/PassManager.h"
+
+using namespace llvm;
+
+class RAGreedyPass : public PassInfoMixin<RAGreedyPass> {
+
+public:
+  struct Options {
+    RegAllocFilterFunc Filter;
+    StringRef FilterName;
+    Options(RegAllocFilterFunc F = nullptr, StringRef FN = "all")
+        : Filter(F), FilterName(FN) {};
+  };
+
+  RAGreedyPass(Options Opts = Options()) : Opts(Opts) {}
+  PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM);
+
+  MachineFunctionProperties getRequiredProperties() const {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoPHIs);
+  }
+
+  MachineFunctionProperties getClearedProperties() const {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
+  }
+
+  void printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const;
+  static bool isRequired() { return true; }
+
+private:
+  Options Opts;
+};
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 12781e2b84623..ca065d67eacef 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -59,6 +59,7 @@
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/CodeGen/RegAllocEvictionAdvisor.h"
 #include "llvm/CodeGen/RegAllocFast.h"
+#include "llvm/CodeGen/RegAllocGreedyPass.h"
 #include "llvm/CodeGen/RegUsageInfoCollector.h"
 #include "llvm/CodeGen/RegUsageInfoPropagate.h"
 #include "llvm/CodeGen/RegisterCoalescerPass.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 78b4c8153e26b..9ac0f4f0b2555 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -196,12 +196,12 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS(
     "filter=reg-filter;no-clear-vregs")
 
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
-    "regallocgreedy", "RAGreedy",
-    [](RegAllocFilterFunc F) { return RAGreedyPass(F); },
+    "regallocgreedy", "RAGreedyPass",
+    [](RAGreedyPass::Options Opts) { return RAGreedyPass(Opts); },
     [PB = this](StringRef Params) {
-      // TODO: parseRegAllocFilter(*PB, Params);
-      return Expected<RegAllocFilterFunc>(nullptr);
-    }, ""
+      // TODO: parseRegAllocGreedyFilterFunc(*PB, Params);
+      return Expected<RAGreedyPass::Options>(RAGreedyPass::Options{});
+    }, "reg-filter"
 )
 #undef MACHINE_FUNCTION_PASS_WITH_PARAMS
 
@@ -269,7 +269,6 @@ DUMMY_MACHINE_FUNCTION_PASS("processimpdefs", ProcessImplicitDefsPass)
 DUMMY_MACHINE_FUNCTION_PASS("prologepilog", PrologEpilogInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("prologepilog-code", PrologEpilogCodeInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("ra-basic", RABasicPass)
-DUMMY_MACHINE_FUNCTION_PASS("ra-greedy", RAGreedyPass)
 DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass)
 DUMMY_MACHINE_FUNCTION_PASS("regalloc", RegAllocPass)
 DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass)
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index f4cc80c751350..d193d2e7389df 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -202,11 +202,16 @@ void RAGreedy::setAnalyses(RequiredAnalyses &Analyses) {
   PriorityProvider = Analyses.PriorityProvider;
 }
 
+void RAGreedyPass::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const {
+  StringRef FilterName = Opts.FilterName.empty() ? "all" : Opts.FilterName;
+  OS << "regallocgreedy<" << FilterName << ">";
+}
+
 PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
                                     MachineFunctionAnalysisManager &MFAM) {
   MFPropsModifier _(*this, MF);
 
-  RAGreedy Impl(Filter);
+  RAGreedy Impl(Opts.Filter);
   RAGreedy::RequiredAnalyses Analyses;
 
   Analyses.VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);

>From 3c06c54e2bcda0e59ccb327a58506480787ff111 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 16 Dec 2024 09:34:30 +0000
Subject: [PATCH 14/26] Move VRM after LiveIntervals

---
 llvm/lib/CodeGen/RegAllocGreedy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index d193d2e7389df..7eaf7254a0017 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -214,7 +214,6 @@ PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
   RAGreedy Impl(Opts.Filter);
   RAGreedy::RequiredAnalyses Analyses;
 
-  Analyses.VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
   Analyses.LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
   Analyses.LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
   Analyses.LSS = &MFAM.getResult<LiveStacksAnalysis>(MF);
@@ -230,6 +229,7 @@ PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
       MFAM.getResult<RegAllocEvictionAdvisorAnalysis>(MF).Provider;
   Analyses.PriorityProvider =
       MFAM.getResult<RegAllocPriorityAdvisorAnalysis>(MF).Provider;
+  Analyses.VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
 
   Impl.setAnalyses(Analyses);
   bool Changed = Impl.run(MF);

>From 246b37fc3f0a97fcfa92c9e64b2815f6edffc3d9 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 1 Jan 2025 08:51:35 +0000
Subject: [PATCH 15/26] AS: Rename to greedy, CFG obviation, comment fix

---
 llvm/include/llvm/CodeGen/MachineFunction.h      | 2 +-
 llvm/include/llvm/Passes/MachinePassRegistry.def | 2 +-
 llvm/lib/CodeGen/RegAllocGreedy.cpp              | 4 +---
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index 7fd0994883fe8..408428fe28a32 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -927,7 +927,7 @@ class LLVM_ABI MachineFunction {
 
   /// Run the current MachineFunction through the machine code verifier, useful
   /// for debugger use.
-  /// TODO: Add the param LiveStks
+  /// TODO: Add the param for LiveStacks analysis.
   /// \returns true if no problems were found.
   bool verify(LiveIntervals *LiveInts, SlotIndexes *Indexes,
               const char *Banner = nullptr, raw_ostream *OS = nullptr,
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 9ac0f4f0b2555..45abbc3b02e75 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -196,7 +196,7 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS(
     "filter=reg-filter;no-clear-vregs")
 
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
-    "regallocgreedy", "RAGreedyPass",
+    "greedy", "RAGreedyPass",
     [](RAGreedyPass::Options Opts) { return RAGreedyPass(Opts); },
     [PB = this](StringRef Params) {
       // TODO: parseRegAllocGreedyFilterFunc(*PB, Params);
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 7eaf7254a0017..717ae3afcc3db 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -242,8 +242,6 @@ PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
   PA.preserve<SlotIndexesAnalysis>();
   PA.preserve<LiveDebugVariablesAnalysis>();
   PA.preserve<LiveStacksAnalysis>();
-  PA.preserve<MachineDominatorTreeAnalysis>();
-  PA.preserve<MachineLoopAnalysis>();
   PA.preserve<VirtRegMapAnalysis>();
   PA.preserve<LiveRegMatrixAnalysis>();
   return PA;
@@ -269,7 +267,7 @@ bool RAGreedyLegacy::runOnMachineFunction(MachineFunction &MF) {
       &getAnalysis<SpillPlacementWrapperLegacy>().getResult();
   Analyses.DebugVars = &getAnalysis<LiveDebugVariablesWrapperLegacy>().getLDV();
   Analyses.EvictProvider =
-      getAnalysis<RegAllocEvictionAdvisorAnalysisLegacy>().getProvider().get();
+      &getAnalysis<RegAllocEvictionAdvisorAnalysisLegacy>().getProvider();
   Analyses.PriorityProvider =
       &getAnalysis<RegAllocPriorityAdvisorAnalysisLegacy>().getProvider();
 

>From 72e96a2d19b9471bb8b49ff5d03d591bd9b9028e Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 23 Jan 2025 09:32:52 +0000
Subject: [PATCH 16/26] AS, remove setAnlayses and use constructor itself

---
 llvm/include/llvm/CodeGen/RegAllocGreedyPass.h |  1 -
 llvm/lib/CodeGen/RegAllocGreedy.cpp            | 13 +++++++------
 llvm/lib/CodeGen/RegAllocGreedy.h              |  7 ++-----
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h b/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
index f325224c5384c..9fb6998921987 100644
--- a/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
+++ b/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
@@ -13,7 +13,6 @@
 using namespace llvm;
 
 class RAGreedyPass : public PassInfoMixin<RAGreedyPass> {
-
 public:
   struct Options {
     RegAllocFilterFunc Filter;
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 717ae3afcc3db..69fa08cade063 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -183,7 +183,9 @@ RAGreedyLegacy::RAGreedyLegacy(const RegAllocFilterFunc F)
   initializeRAGreedyLegacyPass(*PassRegistry::getPassRegistry());
 }
 
-RAGreedy::RAGreedy(const RegAllocFilterFunc F) : RegAllocBase(F) {}
+RAGreedy::RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F) : RegAllocBase(F) {
+  setAnalyses(Analyses);
+}
 
 void RAGreedy::setAnalyses(RequiredAnalyses &Analyses) {
   VRM = Analyses.VRM;
@@ -204,14 +206,13 @@ void RAGreedy::setAnalyses(RequiredAnalyses &Analyses) {
 
 void RAGreedyPass::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const {
   StringRef FilterName = Opts.FilterName.empty() ? "all" : Opts.FilterName;
-  OS << "regallocgreedy<" << FilterName << ">";
+  OS << "regallocgreedy<" << FilterName << '>';
 }
 
 PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
                                     MachineFunctionAnalysisManager &MFAM) {
   MFPropsModifier _(*this, MF);
 
-  RAGreedy Impl(Opts.Filter);
   RAGreedy::RequiredAnalyses Analyses;
 
   Analyses.LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
@@ -231,7 +232,8 @@ PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
       MFAM.getResult<RegAllocPriorityAdvisorAnalysis>(MF).Provider;
   Analyses.VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
 
-  Impl.setAnalyses(Analyses);
+  RAGreedy Impl(Analyses, Opts.Filter);
+
   bool Changed = Impl.run(MF);
   if (!Changed)
     return PreservedAnalyses::all();
@@ -248,7 +250,6 @@ PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
 }
 
 bool RAGreedyLegacy::runOnMachineFunction(MachineFunction &MF) {
-  RAGreedy Impl(F);
 
   RAGreedy::RequiredAnalyses Analyses;
   Analyses.VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
@@ -271,7 +272,7 @@ bool RAGreedyLegacy::runOnMachineFunction(MachineFunction &MF) {
   Analyses.PriorityProvider =
       &getAnalysis<RegAllocPriorityAdvisorAnalysisLegacy>().getProvider();
 
-  Impl.setAnalyses(Analyses);
+  RAGreedy Impl(Analyses, F);
   return Impl.run(MF);
 }
 
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 7586d6abd18f5..696a4009bec1b 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -305,12 +305,9 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public RegAllocBase,
 
   bool ReverseLocalAssignment = false;
 
-public:
-  RAGreedy(const RegAllocFilterFunc F = nullptr);
-  // Evict and priority advisors use this object, so we can construct those
-  // first and pass them here.
-  // Not required once legacy PM is removed.
   void setAnalyses(RequiredAnalyses &Analyses);
+public:
+  RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F = nullptr);
 
   Spiller &spiller() override { return *SpillerInstance; }
   void enqueueImpl(const LiveInterval *LI) override;

>From 249eb1f933ccf754617539beaede0d6f8e362cc3 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 6 Feb 2025 04:17:13 +0000
Subject: [PATCH 17/26] remove setAnalyses()

---
 llvm/lib/CodeGen/RegAllocGreedy.cpp | 9 +++------
 llvm/lib/CodeGen/RegAllocGreedy.h   | 1 -
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 69fa08cade063..ebb85db210322 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -183,11 +183,8 @@ RAGreedyLegacy::RAGreedyLegacy(const RegAllocFilterFunc F)
   initializeRAGreedyLegacyPass(*PassRegistry::getPassRegistry());
 }
 
-RAGreedy::RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F) : RegAllocBase(F) {
-  setAnalyses(Analyses);
-}
-
-void RAGreedy::setAnalyses(RequiredAnalyses &Analyses) {
+RAGreedy::RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F)
+    : RegAllocBase(F) {
   VRM = Analyses.VRM;
   LIS = Analyses.LIS;
   Matrix = Analyses.LRM;
@@ -206,7 +203,7 @@ void RAGreedy::setAnalyses(RequiredAnalyses &Analyses) {
 
 void RAGreedyPass::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const {
   StringRef FilterName = Opts.FilterName.empty() ? "all" : Opts.FilterName;
-  OS << "regallocgreedy<" << FilterName << '>';
+  OS << "regalloc-greedy<" << FilterName << '>';
 }
 
 PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 696a4009bec1b..bb50f8a7f124e 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -305,7 +305,6 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public RegAllocBase,
 
   bool ReverseLocalAssignment = false;
 
-  void setAnalyses(RequiredAnalyses &Analyses);
 public:
   RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F = nullptr);
 

>From ffd27f0e566ce45e57267baafdff5a374682a6e5 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Wed, 12 Feb 2025 05:58:30 +0000
Subject: [PATCH 18/26] keep pass name same as legacy (greedy)

---
 llvm/lib/CodeGen/RegAllocGreedy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index ebb85db210322..3dca5faacfe5d 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -203,7 +203,7 @@ RAGreedy::RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F)
 
 void RAGreedyPass::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const {
   StringRef FilterName = Opts.FilterName.empty() ? "all" : Opts.FilterName;
-  OS << "regalloc-greedy<" << FilterName << '>';
+  OS << "greedy<" << FilterName << '>';
 }
 
 PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,

>From 7ea196dd16d5d8aa3a8701fb98f5f9c51ffebacf Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 17 Feb 2025 08:42:14 +0000
Subject: [PATCH 19/26] Set analyses in constructors

---
 llvm/lib/CodeGen/RegAllocGreedy.cpp | 81 ++++++++++++++---------------
 llvm/lib/CodeGen/RegAllocGreedy.h   |  4 ++
 2 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 3dca5faacfe5d..0fa67d423f96c 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -206,29 +206,30 @@ void RAGreedyPass::printPipeline(raw_ostream &OS, function_ref<StringRef(StringR
   OS << "greedy<" << FilterName << '>';
 }
 
+RAGreedy::RequiredAnalyses::RequiredAnalyses(
+    MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) {
+  LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  LSS = &MFAM.getResult<LiveStacksAnalysis>(MF);
+  Indexes = &MFAM.getResult<SlotIndexesAnalysis>(MF);
+  MBFI = &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+  DomTree = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
+  ORE = &MFAM.getResult<MachineOptimizationRemarkEmitterAnalysis>(MF);
+  Loops = &MFAM.getResult<MachineLoopAnalysis>(MF);
+  Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF);
+  SpillPlacer = &MFAM.getResult<SpillPlacementAnalysis>(MF);
+  DebugVars = &MFAM.getResult<LiveDebugVariablesAnalysis>(MF);
+  EvictProvider = MFAM.getResult<RegAllocEvictionAdvisorAnalysis>(MF).Provider;
+  PriorityProvider =
+      MFAM.getResult<RegAllocPriorityAdvisorAnalysis>(MF).Provider;
+  VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
+}
+
 PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
                                     MachineFunctionAnalysisManager &MFAM) {
   MFPropsModifier _(*this, MF);
 
-  RAGreedy::RequiredAnalyses Analyses;
-
-  Analyses.LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
-  Analyses.LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
-  Analyses.LSS = &MFAM.getResult<LiveStacksAnalysis>(MF);
-  Analyses.Indexes = &MFAM.getResult<SlotIndexesAnalysis>(MF);
-  Analyses.MBFI = &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
-  Analyses.DomTree = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
-  Analyses.ORE = &MFAM.getResult<MachineOptimizationRemarkEmitterAnalysis>(MF);
-  Analyses.Loops = &MFAM.getResult<MachineLoopAnalysis>(MF);
-  Analyses.Bundles = &MFAM.getResult<EdgeBundlesAnalysis>(MF);
-  Analyses.SpillPlacer = &MFAM.getResult<SpillPlacementAnalysis>(MF);
-  Analyses.DebugVars = &MFAM.getResult<LiveDebugVariablesAnalysis>(MF);
-  Analyses.EvictProvider =
-      MFAM.getResult<RegAllocEvictionAdvisorAnalysis>(MF).Provider;
-  Analyses.PriorityProvider =
-      MFAM.getResult<RegAllocPriorityAdvisorAnalysis>(MF).Provider;
-  Analyses.VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
-
+  RAGreedy::RequiredAnalyses Analyses(MF, MFAM);
   RAGreedy Impl(Analyses, Opts.Filter);
 
   bool Changed = Impl.run(MF);
@@ -246,29 +247,27 @@ PreservedAnalyses RAGreedyPass::run(MachineFunction &MF,
   return PA;
 }
 
-bool RAGreedyLegacy::runOnMachineFunction(MachineFunction &MF) {
-
-  RAGreedy::RequiredAnalyses Analyses;
-  Analyses.VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
-  Analyses.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-  Analyses.LSS = &getAnalysis<LiveStacksWrapperLegacy>().getLS();
-  Analyses.LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
-  Analyses.Indexes = &getAnalysis<SlotIndexesWrapperPass>().getSI();
-  Analyses.MBFI =
-      &getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
-  Analyses.DomTree =
-      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
-  Analyses.ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
-  Analyses.Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
-  Analyses.Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
-  Analyses.SpillPlacer =
-      &getAnalysis<SpillPlacementWrapperLegacy>().getResult();
-  Analyses.DebugVars = &getAnalysis<LiveDebugVariablesWrapperLegacy>().getLDV();
-  Analyses.EvictProvider =
-      &getAnalysis<RegAllocEvictionAdvisorAnalysisLegacy>().getProvider();
-  Analyses.PriorityProvider =
-      &getAnalysis<RegAllocPriorityAdvisorAnalysisLegacy>().getProvider();
+RAGreedy::RequiredAnalyses::RequiredAnalyses(Pass &P) {
+  VRM = &P.getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LIS = &P.getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  LSS = &P.getAnalysis<LiveStacksWrapperLegacy>().getLS();
+  LRM = &P.getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  Indexes = &P.getAnalysis<SlotIndexesWrapperPass>().getSI();
+  MBFI = &P.getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
+  DomTree = &P.getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  ORE = &P.getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+  Loops = &P.getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  Bundles = &P.getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+  SpillPlacer = &P.getAnalysis<SpillPlacementWrapperLegacy>().getResult();
+  DebugVars = &P.getAnalysis<LiveDebugVariablesWrapperLegacy>().getLDV();
+  EvictProvider =
+      &P.getAnalysis<RegAllocEvictionAdvisorAnalysisLegacy>().getProvider();
+  PriorityProvider =
+      &P.getAnalysis<RegAllocPriorityAdvisorAnalysisLegacy>().getProvider();
+}
 
+bool RAGreedyLegacy::runOnMachineFunction(MachineFunction &MF) {
+  RAGreedy::RequiredAnalyses Analyses(*this);
   RAGreedy Impl(Analyses, F);
   return Impl.run(MF);
 }
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index bb50f8a7f124e..e9910c46a6452 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -79,6 +79,10 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public RegAllocBase,
     // Proxies for eviction and priority advisors
     RegAllocEvictionAdvisorProvider *EvictProvider;
     RegAllocPriorityAdvisorProvider *PriorityProvider;
+
+    RequiredAnalyses() {}
+    RequiredAnalyses(Pass &P);
+    RequiredAnalyses(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
   };
 
   // Interface to eviction advisers

>From 0d08e55c89219e2e07db5e7617717fa0d61283a6 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 20 Feb 2025 09:06:36 +0000
Subject: [PATCH 20/26] Move RequiredAnalyses and correct preserved analyses

---
 llvm/lib/CodeGen/RegAllocGreedy.cpp | 24 ++++++++++++++++++++++++
 llvm/lib/CodeGen/RegAllocGreedy.h   | 25 +------------------------
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 0fa67d423f96c..f65660771901d 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -183,6 +183,30 @@ RAGreedyLegacy::RAGreedyLegacy(const RegAllocFilterFunc F)
   initializeRAGreedyLegacyPass(*PassRegistry::getPassRegistry());
 }
 
+struct RAGreedy::RequiredAnalyses {
+  VirtRegMap *VRM = nullptr;
+  LiveIntervals *LIS = nullptr;
+  LiveRegMatrix *LRM = nullptr;
+  SlotIndexes *Indexes = nullptr;
+  MachineBlockFrequencyInfo *MBFI = nullptr;
+  MachineDominatorTree *DomTree = nullptr;
+  MachineLoopInfo *Loops = nullptr;
+  MachineOptimizationRemarkEmitter *ORE = nullptr;
+  EdgeBundles *Bundles = nullptr;
+  SpillPlacement *SpillPlacer = nullptr;
+  LiveDebugVariables *DebugVars = nullptr;
+
+  // Used by InlineSpiller
+  LiveStacks *LSS;
+  // Proxies for eviction and priority advisors
+  RegAllocEvictionAdvisorProvider *EvictProvider;
+  RegAllocPriorityAdvisorProvider *PriorityProvider;
+
+  RequiredAnalyses() {}
+  RequiredAnalyses(Pass &P);
+  RequiredAnalyses(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
+};
+
 RAGreedy::RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F)
     : RegAllocBase(F) {
   VRM = Analyses.VRM;
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index e9910c46a6452..675b0214599d2 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -33,7 +33,6 @@
 #include "llvm/CodeGen/SpillPlacement.h"
 #include "llvm/CodeGen/Spiller.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/IR/PassManager.h"
 #include <algorithm>
 #include <cstdint>
 #include <memory>
@@ -61,29 +60,7 @@ class VirtRegMap;
 class LLVM_LIBRARY_VISIBILITY RAGreedy : public RegAllocBase,
                                          private LiveRangeEdit::Delegate {
 public:
-  struct RequiredAnalyses {
-    VirtRegMap *VRM = nullptr;
-    LiveIntervals *LIS = nullptr;
-    LiveRegMatrix *LRM = nullptr;
-    SlotIndexes *Indexes = nullptr;
-    MachineBlockFrequencyInfo *MBFI = nullptr;
-    MachineDominatorTree *DomTree = nullptr;
-    MachineLoopInfo *Loops = nullptr;
-    MachineOptimizationRemarkEmitter *ORE = nullptr;
-    EdgeBundles *Bundles = nullptr;
-    SpillPlacement *SpillPlacer = nullptr;
-    LiveDebugVariables *DebugVars = nullptr;
-
-    // Used by InlineSpiller
-    LiveStacks *LSS;
-    // Proxies for eviction and priority advisors
-    RegAllocEvictionAdvisorProvider *EvictProvider;
-    RegAllocPriorityAdvisorProvider *PriorityProvider;
-
-    RequiredAnalyses() {}
-    RequiredAnalyses(Pass &P);
-    RequiredAnalyses(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM);
-  };
+  struct RequiredAnalyses;
 
   // Interface to eviction advisers
   /// Track allocation stage and eviction loop prevention during allocation.

>From ec6387d5bce23928e90d6af8159846be0232eb7f Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Tue, 11 Feb 2025 12:36:40 +0000
Subject: [PATCH 21/26] [CodeGen][NewPM] Plug greedy RA in codegen pipeline

---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h | 51 ++++++++++++++-----
 .../llvm/Passes/MachinePassRegistry.def       |  4 +-
 .../include/llvm/Target/CGPassBuilderOption.h |  4 +-
 llvm/lib/Passes/PassBuilder.cpp               | 14 +++++
 ...plicit-def-remat-requires-impdef-check.mir |  1 +
 ...implicit-def-with-impdef-greedy-assert.mir |  1 +
 llvm/test/CodeGen/AArch64/pr51516.mir         |  1 +
 llvm/test/CodeGen/AArch64/spill-fold.mir      |  2 +
 llvm/test/CodeGen/MIR/Generic/runPass.mir     |  1 +
 .../SystemZ/clear-liverange-spillreg.mir      |  1 +
 llvm/test/CodeGen/Thumb/high-reg-clobber.mir  |  1 +
 llvm/test/CodeGen/X86/limit-split-cost.mir    |  1 +
 llvm/tools/llc/NewPMDriver.cpp                | 15 ++++--
 13 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index ca065d67eacef..d895eee9bf4da 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -1062,7 +1062,9 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addMachineSSAOptimization(
 ///
 /// A target that uses the standard regalloc pass order for fast or optimized
 /// allocation may still override this for per-target regalloc
-/// selection. But -regalloc=... always takes precedence.
+/// selection. But -regalloc-npm=... always takes precedence.
+/// If a target does not want to allow users to set -regalloc-npm=... at all,
+/// check if Opt.RegAlloc == RegAllocType::Unset.
 template <typename Derived, typename TargetMachineT>
 void CodeGenPassBuilder<Derived, TargetMachineT>::addTargetRegisterAllocator(
     AddMachinePass &addPass, bool Optimized) const {
@@ -1075,10 +1077,29 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addTargetRegisterAllocator(
 /// Find and instantiate the register allocation pass requested by this target
 /// at the current optimization level.  Different register allocators are
 /// defined as separate passes because they may require different analysis.
+///
+/// This helper ensures that the -regalloc-npm= option is always available,
+/// even for targets that override the default allocator.
 template <typename Derived, typename TargetMachineT>
 void CodeGenPassBuilder<Derived, TargetMachineT>::addRegAllocPass(
     AddMachinePass &addPass, bool Optimized) const {
-  // TODO: Parse Opt.RegAlloc to add register allocator.
+  // Use the specified -regalloc-npm={basic|greedy|fast|pbqp}
+  if (Opt.RegAlloc > RegAllocType::Default) {
+    switch (Opt.RegAlloc) {
+    case RegAllocType::Fast:
+      addPass(RegAllocFastPass());
+      break;
+    case RegAllocType::Greedy:
+      addPass(RAGreedyPass());
+      break;
+    default:
+      report_fatal_error("register allocator not supported yet.", false);
+    }
+    return;
+  }
+  // -regalloc=default or unspecified, so pick based on the optimization level
+  // or ask the target for the regalloc pass.
+  derived().addTargetRegisterAllocator(addPass, Optimized);
 }
 
 template <typename Derived, typename TargetMachineT>
@@ -1149,20 +1170,22 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addOptimizedRegAlloc(
   // PreRA instruction scheduling.
   addPass(MachineSchedulerPass(&TM));
 
-  if (derived().addRegAssignmentOptimized(addPass)) {
-    // Allow targets to expand pseudo instructions depending on the choice of
-    // registers before MachineCopyPropagation.
-    derived().addPostRewrite(addPass);
+  if (auto E = derived().addRegAssignmentOptimized(addPass)) {
+    // addRegAssignmentOptimized did not add a reg alloc pass, so do nothing.
+    return;
+  }
+  // Allow targets to expand pseudo instructions depending on the choice of
+  // registers before MachineCopyPropagation.
+  derived().addPostRewrite(addPass);
 
-    // Copy propagate to forward register uses and try to eliminate COPYs that
-    // were not coalesced.
-    addPass(MachineCopyPropagationPass());
+  // Copy propagate to forward register uses and try to eliminate COPYs that
+  // were not coalesced.
+  addPass(MachineCopyPropagationPass());
 
-    // Run post-ra machine LICM to hoist reloads / remats.
-    //
-    // FIXME: can this move into MachineLateOptimization?
-    addPass(MachineLICMPass());
-  }
+  // Run post-ra machine LICM to hoist reloads / remats.
+  //
+  // FIXME: can this move into MachineLateOptimization?
+  addPass(MachineLICMPass());
 }
 
 //===---------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 45abbc3b02e75..3199337f065fb 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -195,12 +195,12 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS(
     },
     "filter=reg-filter;no-clear-vregs")
 
+// 'all' is the default filter
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
     "greedy", "RAGreedyPass",
     [](RAGreedyPass::Options Opts) { return RAGreedyPass(Opts); },
     [PB = this](StringRef Params) {
-      // TODO: parseRegAllocGreedyFilterFunc(*PB, Params);
-      return Expected<RAGreedyPass::Options>(RAGreedyPass::Options{});
+      return parseRegAllocGreedyFilterFunc(*PB, Params);
     }, "reg-filter"
 )
 #undef MACHINE_FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index d3d19c8a7dc9f..b496a9f66296f 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -20,7 +20,7 @@
 namespace llvm {
 
 enum class RunOutliner { TargetDefault, AlwaysOutline, NeverOutline };
-enum class RegAllocType { Default, Basic, Fast, Greedy, PBQP };
+enum class RegAllocType { Unset, Default, Basic, Fast, Greedy, PBQP };
 
 // Not one-on-one but mostly corresponding to commandline options in
 // TargetPassConfig.cpp.
@@ -52,7 +52,7 @@ struct CGPassBuilderOption {
   bool RequiresCodeGenSCCOrder = false;
 
   RunOutliner EnableMachineOutliner = RunOutliner::TargetDefault;
-  StringRef RegAlloc = "default";
+  RegAllocType RegAlloc = RegAllocType::Unset;
   std::optional<GlobalISelAbortMode> EnableGlobalISelAbort;
   std::string FSProfileFile;
   std::string FSRemappingFile;
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 455fac776597d..401f4dfa1c037 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1414,6 +1414,20 @@ parseBoundsCheckingOptions(StringRef Params) {
   return Options;
 }
 
+Expected<RAGreedyPass::Options>
+parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) {
+  if (Params.empty() || Params == "all") {
+    return RAGreedyPass::Options();
+  }
+  std::optional<RegAllocFilterFunc> Filter = PB.parseRegAllocFilter(Params);
+  if (!Filter) {
+    return make_error<StringError>(
+        formatv("invalid regallocgreedy register filter '{0}' ", Params).str(),
+        inconvertibleErrorCode());
+  }
+  return RAGreedyPass::Options{*Filter, Params};
+}
+
 } // namespace
 
 /// Tests whether a pass name starts with a valid prefix for a default pipeline
diff --git a/llvm/test/CodeGen/AArch64/implicit-def-remat-requires-impdef-check.mir b/llvm/test/CodeGen/AArch64/implicit-def-remat-requires-impdef-check.mir
index 47aa34e3c0115..e8d0c43e81433 100644
--- a/llvm/test/CodeGen/AArch64/implicit-def-remat-requires-impdef-check.mir
+++ b/llvm/test/CodeGen/AArch64/implicit-def-remat-requires-impdef-check.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=arm64-apple-macosx -mcpu=apple-m1 -stress-regalloc=4 -verify-regalloc -run-pass=greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=arm64-apple-macosx -mcpu=apple-m1 -stress-regalloc=4 -verify-regalloc -passes=greedy -o - %s | FileCheck %s
 
 --- |
   define void @inst_stores_to_dead_spill_implicit_def_impdef() {
diff --git a/llvm/test/CodeGen/AArch64/implicit-def-with-impdef-greedy-assert.mir b/llvm/test/CodeGen/AArch64/implicit-def-with-impdef-greedy-assert.mir
index a5d74ef75f0a0..a1caa46e8b2bb 100644
--- a/llvm/test/CodeGen/AArch64/implicit-def-with-impdef-greedy-assert.mir
+++ b/llvm/test/CodeGen/AArch64/implicit-def-with-impdef-greedy-assert.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=arm64-apple-ios -run-pass=greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=arm64-apple-ios -passes=greedy -o - %s | FileCheck %s
 
 ---
 name:            widget
diff --git a/llvm/test/CodeGen/AArch64/pr51516.mir b/llvm/test/CodeGen/AArch64/pr51516.mir
index 910bfb858b50f..ae54ad0d5cef4 100644
--- a/llvm/test/CodeGen/AArch64/pr51516.mir
+++ b/llvm/test/CodeGen/AArch64/pr51516.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=aarch64-unknown-fuchsia -run-pass=greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-unknown-fuchsia -passes=greedy -verify-machineinstrs -o - %s | FileCheck %s
 
 # Check that we spill %31 and do not rematerialize it since the use operand
 # of ADDXri is killed by the STRXui in this block.
diff --git a/llvm/test/CodeGen/AArch64/spill-fold.mir b/llvm/test/CodeGen/AArch64/spill-fold.mir
index b1e7ebe3a7e82..0149e4504bed2 100644
--- a/llvm/test/CodeGen/AArch64/spill-fold.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fold.mir
@@ -1,5 +1,7 @@
 # RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass greedy -verify-machineinstrs  -o - %s | FileCheck %s
 # RUN: llc -mtriple=aarch64_be-none-linux-gnu -run-pass greedy -verify-machineinstrs  -o - %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -passes=greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=aarch64_be-none-linux-gnu -passes=greedy -o - %s | FileCheck %s
 --- |
   define i64 @test_subreg_spill_fold() { ret i64 0 }
   define i64 @test_subreg_spill_fold2() { ret i64 0 }
diff --git a/llvm/test/CodeGen/MIR/Generic/runPass.mir b/llvm/test/CodeGen/MIR/Generic/runPass.mir
index 75763c5389b09..824d9ed0e65f8 100644
--- a/llvm/test/CodeGen/MIR/Generic/runPass.mir
+++ b/llvm/test/CodeGen/MIR/Generic/runPass.mir
@@ -2,6 +2,7 @@
 # RUN: llc -run-pass=regallocbasic -debug-pass=Arguments -o - %s | FileCheck %s
 # RUN: llc -run-pass=regallocfast -debug-pass=Arguments -o - %s | FileCheck %s
 # RUN: llc -passes=regallocfast -o - %s | FileCheck %s
+# RUN: llc -passes=greedy -o - %s | FileCheck %s
 
 # Check that passes are initialized correctly, so that it's possible to
 # use -run-pass.
diff --git a/llvm/test/CodeGen/SystemZ/clear-liverange-spillreg.mir b/llvm/test/CodeGen/SystemZ/clear-liverange-spillreg.mir
index 197c3d8551fc3..a9aecb76edcf6 100644
--- a/llvm/test/CodeGen/SystemZ/clear-liverange-spillreg.mir
+++ b/llvm/test/CodeGen/SystemZ/clear-liverange-spillreg.mir
@@ -1,4 +1,5 @@
 #RUN: llc -o - %s -mtriple=s390x-ibm-linux -run-pass=greedy
+#RUN: llc -o - %s -mtriple=s390x-ibm-linux -passes=greedy
 #PR34502. Check HoistSpill works properly after the live range of spilled
 #virtual register is cleared.
 --- |
diff --git a/llvm/test/CodeGen/Thumb/high-reg-clobber.mir b/llvm/test/CodeGen/Thumb/high-reg-clobber.mir
index 1402c7c2cbca3..6897aa7f34f94 100644
--- a/llvm/test/CodeGen/Thumb/high-reg-clobber.mir
+++ b/llvm/test/CodeGen/Thumb/high-reg-clobber.mir
@@ -3,6 +3,7 @@
 # RUN: llc -mtriple thumbv6m-arm-none-eabi -run-pass greedy %s -o - | FileCheck %s
 # RUN: llc -mtriple thumbv6m-arm-none-eabi -run-pass regallocfast %s -o - | FileCheck %s --check-prefix=FAST
 # RUN: llc -mtriple thumbv6m-arm-none-eabi -passes=regallocfast %s -o - | FileCheck %s --check-prefix=FAST
+# RUN: llc -mtriple thumbv6m-arm-none-eabi -passes=greedy %s -o - | FileCheck %s
 
 ...
 ---
diff --git a/llvm/test/CodeGen/X86/limit-split-cost.mir b/llvm/test/CodeGen/X86/limit-split-cost.mir
index 7ec0404e0f737..5b8bb98389c02 100644
--- a/llvm/test/CodeGen/X86/limit-split-cost.mir
+++ b/llvm/test/CodeGen/X86/limit-split-cost.mir
@@ -1,5 +1,6 @@
 # REQUIRES: asserts
 # RUN: llc -mtriple=x86_64-- -run-pass=greedy %s -debug-only=regalloc -huge-size-for-split=0 -o /dev/null 2>&1 | FileCheck %s
+# RUN: llc -mtriple=x86_64-- -passes=greedy %s -debug-only=regalloc -huge-size-for-split=0 -o /dev/null 2>&1 | FileCheck %s
 # Check no global region split is needed because the live range to split is trivially rematerializable.
 # CHECK-NOT: Compact region bundles
 --- |
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index 3892fbb8c74f7..0f7aa6284962a 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -48,10 +48,17 @@
 
 using namespace llvm;
 
-static cl::opt<std::string>
-    RegAlloc("regalloc-npm",
-             cl::desc("Register allocator to use for new pass manager"),
-             cl::Hidden, cl::init("default"));
+static cl::opt<RegAllocType> RegAlloc(
+    "regalloc-npm", cl::desc("Register allocator to use for new pass manager"),
+    cl::Hidden, cl::init(RegAllocType::Unset),
+    cl::values(
+        clEnumValN(RegAllocType::Default, "default",
+                   "Default register allocator"),
+        clEnumValN(RegAllocType::PBQP, "pbqp", "PBQP register allocator"),
+        clEnumValN(RegAllocType::Fast, "fast", "Fast register allocator"),
+        clEnumValN(RegAllocType::Basic, "basic", "Basic register allocator"),
+        clEnumValN(RegAllocType::Greedy, "greedy",
+                   "Greedy register allocator")));
 
 static cl::opt<bool>
     DebugPM("debug-pass-manager", cl::Hidden,

>From a59679bda55002d9a51bf10ec5614122829c22e2 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Fri, 14 Feb 2025 04:38:32 +0000
Subject: [PATCH 22/26] review

---
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  2 +-
 llvm/lib/Passes/PassBuilder.cpp               | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index d895eee9bf4da..d26ec18d6761b 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -1093,7 +1093,7 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addRegAllocPass(
       addPass(RAGreedyPass());
       break;
     default:
-      report_fatal_error("register allocator not supported yet.", false);
+      report_fatal_error("register allocator not supported yet", false);
     }
     return;
   }
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 401f4dfa1c037..384d0fcfca5cb 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1420,12 +1420,12 @@ parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) {
     return RAGreedyPass::Options();
   }
   std::optional<RegAllocFilterFunc> Filter = PB.parseRegAllocFilter(Params);
-  if (!Filter) {
-    return make_error<StringError>(
-        formatv("invalid regallocgreedy register filter '{0}' ", Params).str(),
-        inconvertibleErrorCode());
+  if (Filter) {
+    return RAGreedyPass::Options{*Filter, Params};
   }
-  return RAGreedyPass::Options{*Filter, Params};
+  return make_error<StringError>(
+      formatv("invalid regallocgreedy register filter '{0}' ", Params).str(),
+      inconvertibleErrorCode());
 }
 
 } // namespace

>From 613d6ca5a7048e6f62fd8172c0fe23fa60940809 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Fri, 14 Feb 2025 10:43:38 +0000
Subject: [PATCH 23/26] if body single statement

---
 llvm/lib/Passes/PassBuilder.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 384d0fcfca5cb..4e21ef0704e5d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1416,13 +1416,13 @@ parseBoundsCheckingOptions(StringRef Params) {
 
 Expected<RAGreedyPass::Options>
 parseRegAllocGreedyFilterFunc(PassBuilder &PB, StringRef Params) {
-  if (Params.empty() || Params == "all") {
+  if (Params.empty() || Params == "all")
     return RAGreedyPass::Options();
-  }
+
   std::optional<RegAllocFilterFunc> Filter = PB.parseRegAllocFilter(Params);
-  if (Filter) {
+  if (Filter)
     return RAGreedyPass::Options{*Filter, Params};
-  }
+
   return make_error<StringError>(
       formatv("invalid regallocgreedy register filter '{0}' ", Params).str(),
       inconvertibleErrorCode());

>From 2f670958a111862ba5cb375a241f5c59b252d30f Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 17 Feb 2025 06:51:29 +0000
Subject: [PATCH 24/26] Refactor RegAllocType cl option to use a custom parser

This will allow -sgpr-regalloc-npm=greedy to reuse the same parser.
---
 llvm/include/llvm/Target/CGPassBuilderOption.h   | 16 ++++++++++++++++
 .../llc/new-pm/x86_64-regalloc-pipeline.mir      |  6 ++++++
 llvm/tools/llc/NewPMDriver.cpp                   | 15 ++++-----------
 3 files changed, 26 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir

diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index b496a9f66296f..13881f6dcf82b 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TARGET_CGPASSBUILDEROPTION_H
 #define LLVM_TARGET_CGPASSBUILDEROPTION_H
 
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <optional>
 
@@ -22,6 +23,21 @@ namespace llvm {
 enum class RunOutliner { TargetDefault, AlwaysOutline, NeverOutline };
 enum class RegAllocType { Unset, Default, Basic, Fast, Greedy, PBQP };
 
+class RegAllocTypeParser : public cl::parser<RegAllocType> {
+public:
+  RegAllocTypeParser(cl::Option &O) : cl::parser<RegAllocType>(O) {}
+  void initialize() {
+    cl::parser<RegAllocType>::initialize();
+    addLiteralOption("default", RegAllocType::Default,
+                     "Default register allocator");
+    addLiteralOption("pbqp", RegAllocType::PBQP, "PBQP register allocator");
+    addLiteralOption("fast", RegAllocType::Fast, "Fast register allocator");
+    addLiteralOption("basic", RegAllocType::Basic, "Basic register allocator");
+    addLiteralOption("greedy", RegAllocType::Greedy,
+                     "Greedy register allocator");
+  }
+};
+
 // Not one-on-one but mostly corresponding to commandline options in
 // TargetPassConfig.cpp.
 struct CGPassBuilderOption {
diff --git a/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir b/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir
new file mode 100644
index 0000000000000..58ea42278b122
--- /dev/null
+++ b/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir
@@ -0,0 +1,6 @@
+# REQUIRES x86_64-registered-target
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-new-pm -O3 -regalloc-npm=fast -print-pipeline-passes %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-new-pm -O3 -regalloc-npm=greedy -print-pipeline-passes %s 2>&1 | FileCheck %s --check-prefix=CHECK-GREEDY
+
+# CHECK: regallocfast
+# CHECK-GREEDY: greedy<all>
\ No newline at end of file
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index 0f7aa6284962a..fa935b895a20a 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -48,17 +48,10 @@
 
 using namespace llvm;
 
-static cl::opt<RegAllocType> RegAlloc(
-    "regalloc-npm", cl::desc("Register allocator to use for new pass manager"),
-    cl::Hidden, cl::init(RegAllocType::Unset),
-    cl::values(
-        clEnumValN(RegAllocType::Default, "default",
-                   "Default register allocator"),
-        clEnumValN(RegAllocType::PBQP, "pbqp", "PBQP register allocator"),
-        clEnumValN(RegAllocType::Fast, "fast", "Fast register allocator"),
-        clEnumValN(RegAllocType::Basic, "basic", "Basic register allocator"),
-        clEnumValN(RegAllocType::Greedy, "greedy",
-                   "Greedy register allocator")));
+static cl::opt<RegAllocType, false, RegAllocTypeParser>
+    RegAlloc("regalloc-npm",
+             cl::desc("Register allocator to use for new pass manager"),
+             cl::Hidden, cl::init(RegAllocType::Unset));
 
 static cl::opt<bool>
     DebugPM("debug-pass-manager", cl::Hidden,

>From 9b4d0f57a8eee7770fcf2a543849e4a5532394b4 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Mon, 17 Feb 2025 09:47:48 +0000
Subject: [PATCH 25/26] AS

---
 llvm/include/llvm/Passes/MachinePassRegistry.def        | 2 +-
 llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 3199337f065fb..bb1a59a9c4ed3 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -195,7 +195,7 @@ MACHINE_FUNCTION_PASS_WITH_PARAMS(
     },
     "filter=reg-filter;no-clear-vregs")
 
-// 'all' is the default filter
+// 'all' is the default filter.
 MACHINE_FUNCTION_PASS_WITH_PARAMS(
     "greedy", "RAGreedyPass",
     [](RAGreedyPass::Options Opts) { return RAGreedyPass(Opts); },
diff --git a/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir b/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir
index 58ea42278b122..e72bdc3a299be 100644
--- a/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir
+++ b/llvm/test/tools/llc/new-pm/x86_64-regalloc-pipeline.mir
@@ -3,4 +3,4 @@
 # RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-new-pm -O3 -regalloc-npm=greedy -print-pipeline-passes %s 2>&1 | FileCheck %s --check-prefix=CHECK-GREEDY
 
 # CHECK: regallocfast
-# CHECK-GREEDY: greedy<all>
\ No newline at end of file
+# CHECK-GREEDY: greedy<all>

>From 1e1e4a73640d11fbc0a8927fef369b29fe5af22d Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Thu, 20 Feb 2025 10:00:57 +0000
Subject: [PATCH 26/26] clang-format diff main

---
 llvm/include/llvm/CodeGen/RegAllocGreedyPass.h | 4 +++-
 llvm/lib/CodeGen/RegAllocGreedy.cpp            | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h b/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
index 9fb6998921987..5c0cbc3c4045c 100644
--- a/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
+++ b/llvm/include/llvm/CodeGen/RegAllocGreedyPass.h
@@ -34,7 +34,9 @@ class RAGreedyPass : public PassInfoMixin<RAGreedyPass> {
         MachineFunctionProperties::Property::IsSSA);
   }
 
-  void printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const;
+  void
+  printPipeline(raw_ostream &OS,
+                function_ref<StringRef(StringRef)> MapClassName2PassName) const;
   static bool isRequired() { return true; }
 
 private:
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index f65660771901d..9d0e20f8de8d0 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -225,7 +225,9 @@ RAGreedy::RAGreedy(RequiredAnalyses &Analyses, const RegAllocFilterFunc F)
   PriorityProvider = Analyses.PriorityProvider;
 }
 
-void RAGreedyPass::printPipeline(raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) const {
+void RAGreedyPass::printPipeline(
+    raw_ostream &OS,
+    function_ref<StringRef(StringRef)> MapClassName2PassName) const {
   StringRef FilterName = Opts.FilterName.empty() ? "all" : Opts.FilterName;
   OS << "greedy<" << FilterName << '>';
 }