[llvm-branch-commits] [llvm] release/22.x: [AArch64] Add new pass after VirtRegRewriter to add implicit-defs (#174188) (PR #176197)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jan 15 08:26:16 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: None (llvmbot)
<details>
<summary>Changes</summary>
Backport 9fc7c429752ed87a36f383ee47bad575fea7702a 0133247567a2e69e107bcdd4b1d72fe93b7f93f9 91f5d73b311f3622517ff1d34d21cc8ef1f52ea9
Requested by: @<!-- -->sdesmalen-arm
---
Patch is 1020.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176197.diff
120 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64.h (+2)
- (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+5-5)
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+2-3)
- (added) llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp (+248)
- (modified) llvm/lib/Target/AArch64/AArch64Subtarget.cpp (+16-3)
- (modified) llvm/lib/Target/AArch64/AArch64Subtarget.h (+7-1)
- (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+15-1)
- (modified) llvm/lib/Target/AArch64/CMakeLists.txt (+1)
- (modified) llvm/test/CodeGen/AArch64/O3-pipeline.ll (+1)
- (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+2-15)
- (modified) llvm/test/CodeGen/AArch64/arm64-addrmode.ll (+40-90)
- (modified) llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll (+3-9)
- (modified) llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll (+3-9)
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+11-53)
- (modified) llvm/test/CodeGen/AArch64/ldst-implicitop.mir (+29)
- (modified) llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll (+5-10)
- (modified) llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir (-1)
- (modified) llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll (-3)
- (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll (-1)
- (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll (-1)
- (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+10-40)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-body.ll (-2)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll (-24)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfmul.ll (-18)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-bfscale.ll (-18)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtn.ll (-4)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll (+78-78)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fclamp.ll (-18)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll (+2-98)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fscale.ll (-54)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll (+2-6)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll (+182-218)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll (+182-218)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll (+92-152)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll (+1-192)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll (-8)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll (-64)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll (-82)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll (+104-104)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sclamp.ll (-24)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-select-sme-tileslice.ll (-2)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll (+52-52)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll (-60)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll (-26)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-uclamp.ll (-24)
- (modified) llvm/test/CodeGen/AArch64/sme2p2-intrinsics-fmul.ll (-54)
- (added) llvm/test/CodeGen/AArch64/subreg-liveness-fix-subreg-to-reg-implicit-def.mir (+107)
- (modified) llvm/test/CodeGen/AArch64/subreg_to_reg_coalescing_issue.mir (+1-2)
- (modified) llvm/test/CodeGen/AArch64/sve-bf16-reductions.ll (+2-4)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll (-38)
- (modified) llvm/test/CodeGen/AArch64/sve-fmsub.ll (+2-50)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-contiguous-prefetches.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-sret-reg+imm-addr-mode.ll (+1-9)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll (+2-5)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-sqdec.ll (+2-26)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-sqinc.ll (+2-26)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll (+2-121)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-reg-addr-mode.ll (+2-65)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll (+2-83)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll (+4-16)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-to-int.ll (+3-13)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-int-to-fp.ll (+3-13)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll (+3-37)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll (+9-69)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll (+3-5)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll (+3-23)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+3-39)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll (+3-20)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll (+3-29)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll (+3-18)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll (+10-10)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll (+3-135)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll (+3-22)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll (+7-14)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll (+3-63)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll (+3-87)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll (+3-23)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+8-32)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll (+4-144)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll (+3-27)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll (+81-141)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll (+15-39)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll (+3-75)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll (+3-99)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mul.ll (+3-6)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll (+6-126)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll (+3-38)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll (+78-132)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll (+3-36)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll (+7-97)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+12-52)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll (+7-7)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll (+3-5)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll (+3-5)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll (+3-30)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll (+3-4)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll (+3-4)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll (+26-26)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll (+12-16)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll (+2-3)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll (+3-37)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll (+3-23)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll (+8-12)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll (+355-365)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll (+6-33)
- (modified) llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll (+30-61)
- (modified) llvm/test/CodeGen/AArch64/sve-vector-interleave.ll (+5-64)
- (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-luti.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-dots-partial-reduction.ll (+2-10)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-crypto.ll (+2-28)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll (+4-157)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll (+56-56)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll (+51-51)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll (+10-10)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll (+4-36)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index a8e15c338352a..40983714ddf1d 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -64,6 +64,7 @@ FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
FunctionPass *createSMEPeepholeOptPass();
FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
+FunctionPass *createAArch64SRLTDefineSuperRegsPass();
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
@@ -117,6 +118,7 @@ void initializeLDTLSCleanupPass(PassRegistry&);
void initializeSMEABIPass(PassRegistry &);
void initializeSMEPeepholeOptPass(PassRegistry &);
void initializeMachineSMEABIPass(PassRegistry &);
+void initializeAArch64SRLTDefineSuperRegsPass(PassRegistry &);
void initializeSVEIntrinsicOptsPass(PassRegistry &);
void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &);
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 45599de6a4828..3d9444c0c5426 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -833,10 +833,10 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
}
}
-static bool isRewritableImplicitDef(unsigned Opc) {
- switch (Opc) {
+static bool isRewritableImplicitDef(const MachineOperand &MO) {
+ switch (MO.getParent()->getOpcode()) {
default:
- return false;
+ return MO.isRenamable();
case AArch64::ORRWrs:
case AArch64::ADDWri:
return true;
@@ -1047,7 +1047,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MI.getRegClassConstraint(OpIdx, TII, TRI))
MatchingReg = GetMatchingSubReg(RC);
else {
- if (!isRewritableImplicitDef(MI.getOpcode()))
+ if (!isRewritableImplicitDef(MOP))
continue;
MatchingReg = GetMatchingSubReg(
TRI->getMinimalPhysRegClass(MOP.getReg()));
@@ -1739,7 +1739,7 @@ static bool canRenameMOP(const MachineOperand &MOP,
// them must be known. For example, in ORRWrs the implicit-def
// corresponds to the result register.
if (MOP.isImplicit() && MOP.isDef()) {
- if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
+ if (!isRewritableImplicitDef(MOP))
return false;
return TRI->isSuperOrSubRegisterEq(
MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 098fc4528c91e..8c0dd4381fae8 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1385,9 +1385,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
MachineFunction &MF = *MI->getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
- // Coalescing of SUBREG_TO_REG is broken when using subreg liveness tracking,
- // we must disable it for now.
- if (MI->isSubregToReg() && MRI.subRegLivenessEnabled())
+ if (MI->isSubregToReg() && MRI.subRegLivenessEnabled() &&
+ !MF.getSubtarget<AArch64Subtarget>().enableSRLTSubregToRegMitigation())
return false;
if (MI->isCopy() &&
diff --git a/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp b/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp
new file mode 100644
index 0000000000000..40345769a64d9
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SRLTDefineSuperRegs.cpp
@@ -0,0 +1,248 @@
+//===- AArch64SRLTDefineSuperRegs.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// When SubRegister Liveness Tracking (SRLT) is enabled, this pass adds
+// extra implicit-def's to instructions that define the low N bits of
+// a GPR/FPR register to also define the top bits, because all AArch64
+// instructions that write the low bits of a GPR/FPR also implicitly zero
+// the top bits. For example, 'mov w0, w1' writes zeroes to the top 32-bits of
+// x0, so this pass adds a `implicit-def $x0` after register allocation.
+//
+// These semantics are originally represented in the MIR using `SUBREG_TO_REG`
+// which expresses that the top bits have been defined by the preceding
+// instructions, but during register coalescing this information is lost and in
+// contrast to when SRLT is disabled, when rewriting virtual -> physical
+// registers the implicit-defs are not added to the instruction.
+//
+// There have been several attempts to fix this in the coalescer [1], but each
+// iteration has exposed new bugs and the patch had to be reverted.
+// Additionally, the concept of adding 'implicit-def' of a virtual register is
+// particularly fragile and many places don't expect it (for example in
+// `X86::commuteInstructionImpl` the code only looks at specific operands and
+// does not consider implicit-defs. Similarly in `SplitEditor::addDeadDef` where
+// it traverses operand 'defs' rather than 'all_defs').
+//
+// We want a temporary solution that doesn't impact other targets and is simpler
+// and less intrusive than the patch proposed for the register coalescer [1], so
+// that we can enable SRLT for AArch64.
+//
+// The approach here is to just add the 'implicit-def' manually after rewriting
+// virtual regs -> physical regs. This still means that during the register
+// allocation process the dependences are not accurately represented in the MIR
+// and LiveIntervals, but there are several reasons why we believe this isn't a
+// problem in practice:
+// (A) The register allocator only spills entire virtual registers.
+// This is additionally guarded by code in
+// AArch64InstrInfo::storeRegToStackSlot/loadRegFromStackSlot
+// where it checks if a register matches the expected register class.
+// (B) Rematerialization only happens when the instruction writes the full
+// register.
+// (C) The high bits of the AArch64 register cannot be written independently.
+// (D) Instructions that write only part of a register always take that same
+// register as a tied input operand, to indicate it's a merging operation.
+//
+// (A) means that for two virtual registers of regclass GPR32 and GPR64, if the
+// GPR32 register is coalesced into the GPR64 vreg then the full GPR64 would
+// be spilled/filled even if only the low 32-bits would be required for the
+// given liverange. (B) means that the top bits of a GPR64 would never be
+// overwritten by rematerialising a GPR32 sub-register for a given liverange.
+// (C-D) means that we can assume that the MIR as input to the register
+// allocator correctly expresses the instruction behaviour and dependences
+// between values, so unless the register allocator would violate (A) or (B),
+// the MIR is otherwise sound.
+//
+// Alternative approaches have also been considered, such as:
+// (1) Changing the AArch64 instruction definitions to write all bits and
+// extract the low N bits for the result.
+// (2) Disabling coalescing of SUBREG_TO_REG and using regalloc hints to tell
+// the register allocator to favour the same register for the input/output.
+// (3) Adding a new coalescer guard node with a tied-operand constraint, such
+// that when the SUBREG_TO_REG is removed, something still represents that
+// the top bits are defined. The node would get removed before rewriting
+// virtregs.
+// (4) Using an explicit INSERT_SUBREG into a zero value and try to optimize
+// away the INSERT_SUBREG (this is a more explicit variant of (2) and (3))
+// (5) Adding a new MachineOperand flag that represents the top bits would be
+// defined, but are not read nor undef.
+//
+// (1) would be the best approach but would be a significant effort as it
+// requires rewriting most/all instruction definitions and fixing MIR passes
+// that rely on the current definitions, whereas (2-4) result in sub-optimal
+// code that can't really be avoided because the explicit nodes would stop
+// rematerialization. (5) might be a way to mitigate the
+// fragility of implicit-def's of virtual registers if we want to pursue
+// landing [1], but then we'd rather choose approach (1) to avoid using
+// SUBREG_TO_REG entirely.
+//
+// [1] https://github.com/llvm/llvm-project/pull/168353
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-srlt-define-superregs"
+#define PASS_NAME "AArch64 SRLT Define Super-Regs Pass"
+
+namespace {
+
+struct AArch64SRLTDefineSuperRegs : public MachineFunctionPass {
+ inline static char ID = 0;
+
+ AArch64SRLTDefineSuperRegs() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ Register getWidestSuperReg(Register R, const BitVector &RequiredBaseRegUnits,
+ const BitVector &QHiRegUnits);
+
+ StringRef getPassName() const override { return PASS_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ MachineFunction *MF = nullptr;
+ const AArch64Subtarget *Subtarget = nullptr;
+ const AArch64RegisterInfo *TRI = nullptr;
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64SRLTDefineSuperRegs, DEBUG_TYPE, PASS_NAME, false, false)
+
+// Returns the widest super-reg for a given reg, or NoRegister if no suitable
+// wider super-reg has been found. For example:
+// W0 -> X0
+// B1 -> Q1 (without SVE)
+// -> Z1 (with SVE)
+// W1_W2 -> X1_X2
+// D0_D1 -> Q0_Q1 (without SVE)
+// -> Z0_Z1 (with SVE)
+Register AArch64SRLTDefineSuperRegs::getWidestSuperReg(
+ Register R, const BitVector &RequiredBaseRegUnits,
+ const BitVector &QHiRegUnits) {
+ assert(R.isPhysical() &&
+ "Expected to be run straight after virtregrewriter!");
+
+ BitVector Units(TRI->getNumRegUnits());
+ for (MCRegUnit U : TRI->regunits(R))
+ Units.set((unsigned)U);
+
+ auto IsSuitableSuperReg = [&](Register SR) {
+ for (MCRegUnit U : TRI->regunits(SR)) {
+ // Avoid choosing z1 as super-reg of d1 if SVE is not available.
+ // Q*_HI registers are only set for SVE registers, as those consist
+ // of the Q* register for the low 128 bits and the Q*_HI (artificial)
+ // register for the top (vscale-1) * 128 bits.
+ if (QHiRegUnits.test((unsigned)U) &&
+ !Subtarget->isSVEorStreamingSVEAvailable())
+ return false;
+ // We consider a super-reg as unsuitable if any of its reg units is not
+ // artificial and not shared, as that would imply that U is a unit for a
+ // different register, which means the candidate super-reg is likely
+ // a register tuple.
+ if (!TRI->isArtificialRegUnit(U) &&
+ (!Units.test((unsigned)U) || !RequiredBaseRegUnits.test((unsigned)U)))
+ return false;
+ }
+ return true;
+ };
+
+ Register LargestSuperReg = AArch64::NoRegister;
+ for (Register SR : TRI->superregs(R))
+ if (IsSuitableSuperReg(SR) && (LargestSuperReg == AArch64::NoRegister ||
+ TRI->isSuperRegister(LargestSuperReg, SR)))
+ LargestSuperReg = SR;
+
+ return LargestSuperReg;
+}
+
+bool AArch64SRLTDefineSuperRegs::runOnMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+ TRI = Subtarget->getRegisterInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+ if (!MRI->subRegLivenessEnabled())
+ return false;
+
+ assert(!MRI->isSSA() && "Expected to be run after breaking down SSA form!");
+
+ auto XRegs = seq_inclusive<unsigned>(AArch64::X0, AArch64::X28);
+ auto ZRegs = seq_inclusive<unsigned>(AArch64::Z0, AArch64::Z31);
+ constexpr unsigned FixedRegs[] = {AArch64::FP, AArch64::LR, AArch64::SP};
+
+ BitVector RequiredBaseRegUnits(TRI->getNumRegUnits());
+ for (Register R : concat<unsigned>(XRegs, ZRegs, FixedRegs))
+ for (MCRegUnit U : TRI->regunits(R))
+ RequiredBaseRegUnits.set((unsigned)U);
+
+ BitVector QHiRegUnits(TRI->getNumRegUnits());
+ for (Register R : seq_inclusive<unsigned>(AArch64::Q0_HI, AArch64::Q31_HI))
+ for (MCRegUnit U : TRI->regunits(R))
+ QHiRegUnits.set((unsigned)U);
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // PATCHPOINT may have a 'def' that's not a register, avoid this.
+ if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
+ continue;
+ // For each partial register write, also add an implicit-def for top bits
+ // of the register (e.g. for w0 add a def of x0).
+ SmallSet<Register, 8> SuperRegs;
+ for (const MachineOperand &DefOp : MI.defs())
+ if (Register R = getWidestSuperReg(DefOp.getReg(), RequiredBaseRegUnits,
+ QHiRegUnits);
+ R != AArch64::NoRegister)
+ SuperRegs.insert(R);
+
+ if (!SuperRegs.size())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Adding implicit-defs to: " << MI);
+ for (Register R : SuperRegs) {
+ LLVM_DEBUG(dbgs() << " " << printReg(R, TRI) << "\n");
+ bool IsRenamable = any_of(MI.defs(), [&](const MachineOperand &MO) {
+ return MO.isRenamable() && TRI->regsOverlap(MO.getReg(), R);
+ });
+ bool IsDead = any_of(MI.defs(), [&](const MachineOperand &MO) {
+ return MO.isDead() && TRI->regsOverlap(MO.getReg(), R);
+ });
+ MachineOperand DefOp = MachineOperand::CreateReg(
+ R, /*isDef=*/true, /*isImp=*/true, /*isKill=*/false,
+ /*isDead=*/IsDead, /*isUndef=*/false, /*isEarlyClobber=*/false,
+ /*SubReg=*/0, /*isDebug=*/false, /*isInternalRead=*/false,
+ /*isRenamable=*/IsRenamable);
+ MI.addOperand(DefOp);
+ }
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64SRLTDefineSuperRegsPass() {
+ return new AArch64SRLTDefineSuperRegs();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 92a7412e83fac..a642841243be3 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -355,7 +355,8 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
unsigned MinSVEVectorSizeInBitsOverride,
unsigned MaxSVEVectorSizeInBitsOverride,
bool IsStreaming, bool IsStreamingCompatible,
- bool HasMinSize)
+ bool HasMinSize,
+ bool EnableSRLTSubregToRegMitigation)
: AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
@@ -367,7 +368,9 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
? std::optional<unsigned>(AArch64StreamingHazardSize)
: std::nullopt),
MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
- MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
+ MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride),
+ EnableSRLTSubregToRegMitigation(EnableSRLTSubregToRegMitigation),
+ TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
TLInfo(TM, *this) {
if (AArch64::isX18ReservedByDefault(TT))
@@ -400,7 +403,17 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
ReserveXRegisterForRA.set(29);
- EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
+ // To benefit from SME2's strided-register multi-vector load/store
+ // instructions we'll need to enable subreg liveness. Our longer
+ // term aim is to make this the default, regardless of streaming
+ // mode, but there are still some outstanding issues, see:
+ // https://github.com/llvm/llvm-project/pull/174188
+ // and:
+ // https://github.com/llvm/llvm-project/pull/168353
+ if (IsStreaming)
+ EnableSubregLiveness = true;
+ else
+ EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
}
const CallLowering *AArch64Subtarget::getCallLowering() const {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index bd8a2d5234f2d..248e140b3101c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -88,6 +88,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
std::optional<unsigned> StreamingHazardSize;
unsigned MinSVEVectorSizeInBits;
unsigned MaxSVEVectorSizeInBits;
+ bool EnableSRLTSubregToRegMitigation;
unsigned VScaleForTuning = 1;
TailFoldingOpts DefaultSVETFOpts = TailFoldingOpts::Disabled;
@@ -128,7 +129,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
unsigned MinSVEVectorSizeInBitsOverride = 0,
unsigned MaxSVEVectorSizeInBitsOverride = 0,
bool IsStreaming = false, bool IsStreamingCompatible = false,
- bool HasMinSize = false);
+ bool HasMinSize = false,
+ bool EnableSRLTSubregToRegMitigation = false);
// Getters for SubtargetFeatures defined in tablegen
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
@@ -467,6 +469,10 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
/// add + cnt instructions.
bool useScalarIncVL() const;
+ bool enableSRLTSubregToRegMitigation() const {
+ return EnableSRLTSubregToRegMitigation;
+ }
+
/// Choose a method of checking LR before performing a tail call.
AArch64PAuth::AuthCheckMethod
getAuthenticatedLRCheckMethod(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 1ec5a20cc0ce0..3aba866458830 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -227,6 +227,12 @@ static cl::opt<bool>
cl::desc("Enable new lowering for the SME ABI"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableSRLTSubregToRegMitigation(
+ "aarch64-srlt-mitigate-sr2r",
+ cl::desc("Enable SUBREG_TO_REG mitigation by adding 'implicit-def' for "
+ "super-regs when using Subreg Liveness Tracking"),
+ cl::init(true), cl::Hidden);
+
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAArch64Target() {
// Register the target.
@@ -268,6 +274,7 @@ LLVMInitializeAArch64Target() {
initializeKCFIPass(PR);
initializeSMEABIPass(PR);
initializeMachineSMEABIPass(PR);
+ initializeAArch64SRLTDefineSuperRegsPass(PR);
initializeSMEPeepholeOptPass(PR);
initializeSVEIntrinsicOptsPass(PR);
initializeAArch64SpeculationHardeningPass(PR);
@@ -462,7 +469,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = std::make_unique<AArch64Subtarget>(
TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize,
- MaxSVEVectorSize, IsStreaming, Is...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/176197
More information about the llvm-branch-commits
mailing list