[llvm-branch-commits] [llvm] release/22.x: [AArch64] Add new pass after VirtRegRewriter to add implicit-defs (#174188) (PR #176197)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Jan 15 08:26:16 PST 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: None (llvmbot)

<details>
<summary>Changes</summary>

Backport 9fc7c429752ed87a36f383ee47bad575fea7702a 0133247567a2e69e107bcdd4b1d72fe93b7f93f9 91f5d73b311f3622517ff1d34d21cc8ef1f52ea9

Requested by: @<!-- -->sdesmalen-arm

---

Patch is 1020.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176197.diff


120 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64.h (+2) 
- (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+5-5) 
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+2-3) 
- (added) llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp (+248) 
- (modified) llvm/lib/Target/AArch64/AArch64Subtarget.cpp (+16-3) 
- (modified) llvm/lib/Target/AArch64/AArch64Subtarget.h (+7-1) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+15-1) 
- (modified) llvm/lib/Target/AArch64/CMakeLists.txt (+1) 
- (modified) llvm/test/CodeGen/AArch64/O3-pipeline.ll (+1) 
- (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+2-15) 
- (modified) llvm/test/CodeGen/AArch64/arm64-addrmode.ll (+40-90) 
- (modified) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+3-9) 
- (modified) llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll (+3-9) 
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+11-53) 
- (modified) llvm/test/CodeGen/AArch64/ldst-implicitop.mir (+29) 
- (modified) llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll (+5-10) 
- (modified) llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir (-1) 
- (modified) llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll (-3) 
- (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll (-1) 
- (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll (-1) 
- (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+10-40) 
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-body.ll (-2) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll (-24) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfmul.ll (-18) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll (-18) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtn.ll (-4) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll (+78-78) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fclamp.ll (-18) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll (+2-98) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll (-54) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll (+2-6) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll (+182-218) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll (+182-218) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll (+92-152) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll (+1-192) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll (-8) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll (-64) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll (-82) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll (+104-104) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sclamp.ll (-24) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-select-sme-tileslice.ll (-2) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll (+52-52) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll (-60) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll (-26) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-uclamp.ll (-24) 
- (modified) llvm/test/CodeGen/AArch64/sme2p2-intrinsics-fmul.ll (-54) 
- (added) llvm/test/CodeGen/AArch64/subreg-liveness-fix-subreg-to-reg-implicit-def.mir (+107) 
- (modified) llvm/test/CodeGen/AArch64/subreg_to_reg_coalescing_issue.mir (+1-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll (+2-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll (-38) 
- (modified) llvm/test/CodeGen/AArch64/sve-fmsub.ll (+2-50) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-contiguous-prefetches.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-sret-reg+imm-addr-mode.ll (+1-9) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll (+2-5) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-sqdec.ll (+2-26) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-sqinc.ll (+2-26) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll (+2-121) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll (+2-65) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll (+2-83) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll (+4-16) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll (+3-13) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll (+3-13) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll (+3-37) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll (+9-69) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll (+3-5) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll (+3-23) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+3-39) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll (+3-20) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll (+3-29) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll (+3-18) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll (+10-10) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll (+3-135) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll (+3-22) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll (+7-14) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll (+3-63) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll (+3-87) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll (+3-23) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+8-32) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll (+4-144) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll (+3-27) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll (+81-141) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll (+15-39) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll (+3-75) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll (+3-99) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mul.ll (+3-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll (+6-126) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll (+3-38) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll (+78-132) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll (+3-36) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll (+7-97) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+12-52) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll (+7-7) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll (+3-5) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll (+3-5) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll (+3-30) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll (+3-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll (+3-3) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll (+3-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll (+26-26) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll (+12-16) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll (+2-3) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll (+3-37) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll (+3-23) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll (+8-12) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll (+355-365) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll (+6-33) 
- (modified) llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll (+30-61) 
- (modified) llvm/test/CodeGen/AArch64/sve-vector-interleave.ll (+5-64) 
- (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll (+3-3) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-dots-partial-reduction.ll (+2-10) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-crypto.ll (+2-28) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll (+4-157) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll (+56-56) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll (+51-51) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll (+10-10) 
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll (+4-36) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index a8e15c338352a..40983714ddf1d 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -64,6 +64,7 @@ FunctionPass *createAArch64CollectLOHPass();
 FunctionPass *createSMEABIPass();
 FunctionPass *createSMEPeepholeOptPass();
 FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
+FunctionPass *createAArch64SRLTDefineSuperRegsPass();
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
@@ -117,6 +118,7 @@ void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeSMEABIPass(PassRegistry &);
 void initializeSMEPeepholeOptPass(PassRegistry &);
 void initializeMachineSMEABIPass(PassRegistry &);
+void initializeAArch64SRLTDefineSuperRegsPass(PassRegistry &);
 void initializeSVEIntrinsicOptsPass(PassRegistry &);
 void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &);
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 45599de6a4828..3d9444c0c5426 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -833,10 +833,10 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
   }
 }
 
-static bool isRewritableImplicitDef(unsigned Opc) {
-  switch (Opc) {
+static bool isRewritableImplicitDef(const MachineOperand &MO) {
+  switch (MO.getParent()->getOpcode()) {
   default:
-    return false;
+    return MO.isRenamable();
   case AArch64::ORRWrs:
   case AArch64::ADDWri:
     return true;
@@ -1047,7 +1047,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                         MI.getRegClassConstraint(OpIdx, TII, TRI))
                   MatchingReg = GetMatchingSubReg(RC);
                 else {
-                  if (!isRewritableImplicitDef(MI.getOpcode()))
+                  if (!isRewritableImplicitDef(MOP))
                     continue;
                   MatchingReg = GetMatchingSubReg(
                       TRI->getMinimalPhysRegClass(MOP.getReg()));
@@ -1739,7 +1739,7 @@ static bool canRenameMOP(const MachineOperand &MOP,
     // them must be known. For example, in ORRWrs the implicit-def
     // corresponds to the result register.
     if (MOP.isImplicit() && MOP.isDef()) {
-      if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
+      if (!isRewritableImplicitDef(MOP))
         return false;
       return TRI->isSuperOrSubRegisterEq(
           MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 098fc4528c91e..8c0dd4381fae8 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1385,9 +1385,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
   MachineFunction &MF = *MI->getMF();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  // Coalescing of SUBREG_TO_REG is broken when using subreg liveness tracking,
-  // we must disable it for now.
-  if (MI->isSubregToReg() && MRI.subRegLivenessEnabled())
+  if (MI->isSubregToReg() && MRI.subRegLivenessEnabled() &&
+      !MF.getSubtarget<AArch64Subtarget>().enableSRLTSubregToRegMitigation())
     return false;
 
   if (MI->isCopy() &&
diff --git a/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp b/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp
new file mode 100644
index 0000000000000..40345769a64d9
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp
@@ -0,0 +1,248 @@
+//===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds
+// extra implicit-def's to instructions that define the low N bits of
+// a GPR/FPR register to also define the top bits, because all AArch64
+// instructions that write the low bits of a GPR/FPR also implicitly zero
+// the top bits.  For example, 'mov w0, w1' writes zeroes to the top 32-bits of
+// x0, so this pass adds an `implicit-def $x0` after register allocation.
+//
+// These semantics are originally represented in the MIR using `SUBREG_TO_REG`
+// which expresses that the top bits have been defined by the preceding
+// instructions, but during register coalescing this information is lost and in
+// contrast to when SRLT is disabled, when rewriting virtual -> physical
+// registers the implicit-defs are not added to the instruction.
+//
+// There have been several attempts to fix this in the coalescer [1], but each
+// iteration has exposed new bugs and the patch had to be reverted.
+// Additionally, the concept of adding 'implicit-def' of a virtual register is
+// particularly fragile and many places don't expect it (for example in
+// `X86::commuteInstructionImpl` the code only looks at specific operands and
+// does not consider implicit-defs. Similarly in `SplitEditor::addDeadDef` where
+// it traverses operand 'defs' rather than 'all_defs').
+//
+// We want a temporary solution that doesn't impact other targets and is simpler
+// and less intrusive than the patch proposed for the register coalescer [1], so
+// that we can enable SRLT for AArch64.
+//
+// The approach here is to just add the 'implicit-def' manually after rewriting
+// virtual regs -> physical regs. This still means that during the register
+// allocation process the dependences are not accurately represented in the MIR
+// and LiveIntervals, but there are several reasons why we believe this isn't a
+// problem in practice:
+// (A) The register allocator only spills entire virtual registers.
+//     This is additionally guarded by code in
+//     AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot
+//     where it checks if a register matches the expected register class.
+// (B) Rematerialization only happens when the instruction writes the full
+//     register.
+// (C) The high bits of the AArch64 register cannot be written independently.
+// (D) Instructions that write only part of a register always take that same
+//     register as a tied input operand, to indicate it's a merging operation.
+//
+// (A) means that for two virtual registers of regclass GPR32 and GPR64, if the
+// GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would
+// be spilled/filled even if only the low 32-bits would be required for the
+// given liverange. (B) means that the top bits of a GPR64 would never be
+// overwritten by rematerialising a GPR32 sub-register for a given liverange.
+// (C-D) means that we can assume that the MIR as input to the register
+// allocator correctly expresses the instruction behaviour and dependences
+// between values, so unless the register allocator would violate (A) or (B),
+// the MIR is otherwise sound.
+//
+// Alternative approaches have also been considered, such as:
+// (1) Changing the AArch64 instruction definitions to write all bits and
+//     extract the low N bits for the result.
+// (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell
+//     the register allocator to favour the same register for the input/output.
+// (3) Adding a new coalescer guard node with a tied-operand constraint, such
+//     that when the SUBREG_TO_REG is removed, something still represents that
+//     the top bits are defined. The node would get removed before rewriting
+//     virtregs.
+// (4) Using an explicit INSERT_SUBREG into a zero value and try to optimize
+//     away the INSERT_SUBREG (this is a more explicit variant of (2) and (3))
+// (5) Adding a new MachineOperand flag that represents the top bits would be
+//     defined, but are not read nor undef.
+//
+// (1) would be the best approach but would be a significant effort as it
+// requires rewriting most/all instruction definitions and fixing MIR passes
+// that rely on the current definitions, whereas (2-4) result in sub-optimal
+// code that can't really be avoided because the explicit nodes would stop
+// rematerialization. (5) might be a way to mitigate the
+// fragility of implicit-def's of virtual registers if we want to pursue
+// landing [1], but then we'd rather choose approach (1) to avoid using
+// SUBREG_TO_REG entirely.
+//
+// [1] https://github.com/llvm/llvm-project/pull/168353
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-srlt-define-superregs"
+#define PASS_NAME "AArch64 SRLT Define Super-Regs Pass"
+
+namespace {
+
+struct AArch64SRLTDefineSuperRegs : public MachineFunctionPass {
+  inline static char ID = 0;
+
+  AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  Register getWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits,
+                             const BitVector &QHiRegUnits);
+
+  StringRef getPassName() const override { return PASS_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  MachineFunction *MF = nullptr;
+  const AArch64Subtarget *Subtarget = nullptr;
+  const AArch64RegisterInfo *TRI = nullptr;
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, false)
+
+// Returns the widest super-reg for a given reg, or NoRegister if no suitable
+// wider super-reg has been found. For example:
+//  W0    -> X0
+//  B1    -> Q1 (without SVE)
+//        -> Z1 (with SVE)
+//  W1_W2 -> X1_X2
+//  D0_D1 -> Q0_Q1 (without SVE)
+//        -> Z0_Z1 (with SVE)
+Register AArch64SRLTDefineSuperRegs::getWidestSuperReg(
+    Register R, const BitVector &RequiredBaseRegUnits,
+    const BitVector &QHiRegUnits) {
+  assert(R.isPhysical() &&
+         "Expected to be run straight after virtregrewriter!");
+
+  BitVector Units(TRI->getNumRegUnits());
+  for (MCRegUnit U : TRI->regunits(R))
+    Units.set((unsigned)U);
+
+  auto IsSuitableSuperReg = [&](Register SR) {
+    for (MCRegUnit U : TRI->regunits(SR)) {
+      // Avoid choosing z1 as super-reg of d1 if SVE is not available.
+      // Q*_HI registers are only set for SVE registers, as those consist
+      // of the Q* register for the low 128 bits and the Q*_HI (artificial)
+      // register for the top (vscale-1) * 128 bits.
+      if (QHiRegUnits.test((unsigned)U) &&
+          !Subtarget->isSVEorStreamingSVEAvailable())
+        return false;
+      // We consider a super-reg as unsuitable if any of its reg units is not
+      // artificial and not shared, as that would imply that U is a unit for a
+      // different register, which means the candidate super-reg is likely
+      // a register tuple.
+      if (!TRI->isArtificialRegUnit(U) &&
+          (!Units.test((unsigned)U) || !RequiredBaseRegUnits.test((unsigned)U)))
+        return false;
+    }
+    return true;
+  };
+
+  Register LargestSuperReg = AArch64::NoRegister;
+  for (Register SR : TRI->superregs(R))
+    if (IsSuitableSuperReg(SR) && (LargestSuperReg == AArch64::NoRegister ||
+                                   TRI->isSuperRegister(LargestSuperReg, SR)))
+      LargestSuperReg = SR;
+
+  return LargestSuperReg;
+}
+
+bool AArch64SRLTDefineSuperRegs::runOnMachineFunction(MachineFunction &MF) {
+  this->MF = &MF;
+  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+  TRI = Subtarget->getRegisterInfo();
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  if (!MRI->subRegLivenessEnabled())
+    return false;
+
+  assert(!MRI->isSSA() && "Expected to be run after breaking down SSA form!");
+
+  auto XRegs = seq_inclusive<unsigned>(AArch64::X0, AArch64::X28);
+  auto ZRegs = seq_inclusive<unsigned>(AArch64::Z0, AArch64::Z31);
+  constexpr unsigned FixedRegs[] = {AArch64::FP, AArch64::LR, AArch64::SP};
+
+  BitVector RequiredBaseRegUnits(TRI->getNumRegUnits());
+  for (Register R : concat<unsigned>(XRegs, ZRegs, FixedRegs))
+    for (MCRegUnit U : TRI->regunits(R))
+      RequiredBaseRegUnits.set((unsigned)U);
+
+  BitVector QHiRegUnits(TRI->getNumRegUnits());
+  for (Register R : seq_inclusive<unsigned>(AArch64::Q0_HI, AArch64::Q31_HI))
+    for (MCRegUnit U : TRI->regunits(R))
+      QHiRegUnits.set((unsigned)U);
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      // PATCHPOINT may have a 'def' that's not a register, avoid this.
+      if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
+        continue;
+      // For each partial register write, also add an implicit-def for top bits
+      // of the register (e.g. for w0 add a def of x0).
+      SmallSet<Register, 8> SuperRegs;
+      for (const MachineOperand &DefOp : MI.defs())
+        if (Register R = getWidestSuperReg(DefOp.getReg(), RequiredBaseRegUnits,
+                                           QHiRegUnits);
+            R != AArch64::NoRegister)
+          SuperRegs.insert(R);
+
+      if (!SuperRegs.size())
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Adding implicit-defs to: " << MI);
+      for (Register R : SuperRegs) {
+        LLVM_DEBUG(dbgs() << "  " << printReg(R, TRI) << "\n");
+        bool IsRenamable = any_of(MI.defs(), [&](const MachineOperand &MO) {
+          return MO.isRenamable() && TRI->regsOverlap(MO.getReg(), R);
+        });
+        bool IsDead = any_of(MI.defs(), [&](const MachineOperand &MO) {
+          return MO.isDead() && TRI->regsOverlap(MO.getReg(), R);
+        });
+        MachineOperand DefOp = MachineOperand::CreateReg(
+            R, /*isDef=*/true, /*isImp=*/true, /*isKill=*/false,
+            /*isDead=*/IsDead, /*isUndef=*/false, /*isEarlyClobber=*/false,
+            /*SubReg=*/0, /*isDebug=*/false, /*isInternalRead=*/false,
+            /*isRenamable=*/IsRenamable);
+        MI.addOperand(DefOp);
+      }
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64SRLTDefineSuperRegsPass() {
+  return new AArch64SRLTDefineSuperRegs();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 92a7412e83fac..a642841243be3 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -355,7 +355,8 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                    unsigned MinSVEVectorSizeInBitsOverride,
                                    unsigned MaxSVEVectorSizeInBitsOverride,
                                    bool IsStreaming, bool IsStreamingCompatible,
-                                   bool HasMinSize)
+                                   bool HasMinSize,
+                                   bool EnableSRLTSubregToRegMitigation)
     : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
       ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
@@ -367,7 +368,9 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
               ? std::optional<unsigned>(AArch64StreamingHazardSize)
               : std::nullopt),
       MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
-      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
+      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride),
+      EnableSRLTSubregToRegMitigation(EnableSRLTSubregToRegMitigation),
+      TargetTriple(TT),
       InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
       TLInfo(TM, *this) {
   if (AArch64::isX18ReservedByDefault(TT))
@@ -400,7 +403,17 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
   if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
     ReserveXRegisterForRA.set(29);
 
-  EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
+  // To benefit from SME2's strided-register multi-vector load/store
+  // instructions we'll need to enable subreg liveness. Our longer
+  // term aim is to make this the default, regardless of streaming
+  // mode, but there are still some outstanding issues, see:
+  //  https://github.com/llvm/llvm-project/pull/174188
+  // and:
+  //  https://github.com/llvm/llvm-project/pull/168353
+  if (IsStreaming)
+    EnableSubregLiveness = true;
+  else
+    EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
 }
 
 const CallLowering *AArch64Subtarget::getCallLowering() const {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index bd8a2d5234f2d..248e140b3101c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -88,6 +88,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   std::optional<unsigned> StreamingHazardSize;
   unsigned MinSVEVectorSizeInBits;
   unsigned MaxSVEVectorSizeInBits;
+  bool EnableSRLTSubregToRegMitigation;
   unsigned VScaleForTuning = 1;
   TailFoldingOpts DefaultSVETFOpts = TailFoldingOpts::Disabled;
 
@@ -128,7 +129,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
                    unsigned MinSVEVectorSizeInBitsOverride = 0,
                    unsigned MaxSVEVectorSizeInBitsOverride = 0,
                    bool IsStreaming = false, bool IsStreamingCompatible = false,
-                   bool HasMinSize = false);
+                   bool HasMinSize = false,
+                   bool EnableSRLTSubregToRegMitigation = false);
 
 // Getters for SubtargetFeatures defined in tablegen
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
@@ -467,6 +469,10 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   /// add + cnt instructions.
   bool useScalarIncVL() const;
 
+  bool enableSRLTSubregToRegMitigation() const {
+    return EnableSRLTSubregToRegMitigation;
+  }
+
   /// Choose a method of checking LR before performing a tail call.
   AArch64PAuth::AuthCheckMethod
   getAuthenticatedLRCheckMethod(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 1ec5a20cc0ce0..3aba866458830 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -227,6 +227,12 @@ static cl::opt<bool>
                             cl::desc("Enable new lowering for the SME ABI"),
                             cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableSRLTSubregToRegMitigation(
+    "aarch64-srlt-mitigate-sr2r",
+    cl::desc("Enable SUBREG_TO_REG mitigation by adding 'implicit-def' for "
+             "super-regs when using Subreg Liveness Tracking"),
+    cl::init(true), cl::Hidden);
+
 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
 LLVMInitializeAArch64Target() {
   // Register the target.
@@ -268,6 +274,7 @@ LLVMInitializeAArch64Target() {
   initializeKCFIPass(PR);
   initializeSMEABIPass(PR);
   initializeMachineSMEABIPass(PR);
+  initializeAArch64SRLTDefineSuperRegsPass(PR);
   initializeSMEPeepholeOptPass(PR);
   initializeSVEIntrinsicOptsPass(PR);
   initializeAArch64SpeculationHardeningPass(PR);
@@ -462,7 +469,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
     resetTargetOptions(F);
     I = std::make_unique<AArch64Subtarget>(
         TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize,
-        MaxSVEVectorSize, IsStreaming, Is...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/176197


More information about the llvm-branch-commits mailing list