[llvm] [AArch64][MachineCombiner] Combine sequences of gather patterns (PR #152979)
Jonathan Cohen via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 17 23:50:49 PDT 2025
================
@@ -7412,11 +7419,335 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Check if a given MachineInstr `MIa` may alias with any of the instructions
+/// in `MemInstrs`.
+static bool mayAlias(const MachineInstr &MIa,
+                     SmallVectorImpl<const MachineInstr *> &MemInstrs,
+                     AliasAnalysis *AA) {
+  for (const MachineInstr *MIb : MemInstrs) {
+    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
+      MIb->dump();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Check if the given instruction forms a gather load pattern that can be
+/// optimized for better Memory-Level Parallelism (MLP). This function
+/// identifies chains of NEON lane load instructions that load data from
+/// different memory addresses into individual lanes of a 128-bit vector
+/// register, then attempts to split the pattern into parallel loads to break
+/// the serial dependency between instructions.
+///
+/// Pattern Matched:
+///   Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
+///   LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
+///
+/// Transformed Into:
+///   Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
+///   to combine the results, enabling better memory-level parallelism.
+///
+/// Supported Element Types:
+///   - 32-bit elements (LD1i32, 4 lanes total)
+///   - 16-bit elements (LD1i16, 8 lanes total)
+///   - 8-bit elements (LD1i8, 16 lanes total)
+static bool getGatherLanePattern(MachineInstr &Root,
+                                 SmallVectorImpl<unsigned> &Patterns,
+                                 unsigned LoadLaneOpCode, unsigned NumLanes) {
+  const MachineFunction *MF = Root.getMF();
+
+  // Early exit if optimizing for size.
+  if (MF->getFunction().hasMinSize())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
+    return false;
+
+  // Check that we have a load into all lanes except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  //    register)
+  // 2. The addressing mode only uses a single pointer operand
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
+  SmallVector<const MachineInstr *, 16> LoadInstrs = {};
+  while (!RemainingLanes.empty() && CurrInstr &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    LoadInstrs.push_back(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  // Check that we have found a match for lanes N-1 .. 1.
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence.
+  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the SUBREG_TO_REG loads an integer into the first lane.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+    return false;
+
+  // Verify that it also has a single non-debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
+
+  // If there is any chance of aliasing, do not apply the pattern.
+  // Walk backward through the MBB starting from Root.
+  // Exit early if we've encountered all load instructions or hit the search
+  // limit.
+  auto MBBItr = Root.getIterator();
+  unsigned RemainingSteps = GatherOptSearchLimit;
+  SmallSet<const MachineInstr *, 16> RemainingLoadInstrs;
+  RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
+  const MachineBasicBlock *MBB = Root.getParent();
+
+  for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
+         !RemainingLoadInstrs.empty();
+       --MBBItr, --RemainingSteps) {
+    const MachineInstr &CurrInstr = *MBBItr;
+
+    // Remove this instruction from remaining loads if it's one we're tracking.
+    RemainingLoadInstrs.erase(&CurrInstr);
+
+    // Check for potential aliasing with any of the load instructions to
+    // optimize.
+    if ((CurrInstr.mayLoadOrStore() || CurrInstr.isCall()) &&
+        mayAlias(CurrInstr, LoadInstrs, nullptr))
----------------
jcohen-apple wrote:
Ah, that makes more sense; I wasn't familiar with this method.
AA is always nullptr because I didn't want to add alias analysis as a dependency of MachineCombiner just to support a single pattern; the potential compile-time impact didn't seem worth the upside, though I haven't measured how large it actually is. I would rather use isLoadFoldBarrier pessimistically, and only add proper alias analysis if I can find a compelling reason for it.
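A minimal sketch of the pessimistic isLoadFoldBarrier-based alternative discussed above, assuming it would live in AArch64InstrInfo.cpp where a TargetInstrInfo reference is available; the helper name isGatherWalkBarrier is illustrative only and not part of the patch:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Conservatively decide whether the backward walk must stop at MI, without
// consulting alias analysis. Calls, stores, and anything TargetInstrInfo
// flags as a load-fold barrier (e.g. volatile/atomic accesses or unmodeled
// side effects) block the gather pattern.
static bool isGatherWalkBarrier(const MachineInstr &MI,
                                const TargetInstrInfo &TII) {
  if (MI.isCall() || TII.isLoadFoldBarrier(MI))
    return true;
  // Without AA, any store could clobber one of the gathered addresses.
  return MI.mayStore();
}

With a helper like this, the backward walk could bail out at the first blocking instruction instead of calling MachineInstr::mayAlias with a null AAResults for every memory access it passes.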
https://github.com/llvm/llvm-project/pull/152979