[llvm] [AArch64][MachineCombiner] Combine sequences of gather patterns (PR #152979)

Jonathan Cohen via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 17 04:54:18 PDT 2025


================
@@ -7412,11 +7413,347 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
+/// Check if there are any stores or calls between two instructions in the same
+/// basic block.
+static bool hasInterveningStoreOrCall(const MachineInstr *First,
+                                      const MachineInstr *Last) {
+  if (!First || !Last || First == Last)
+    return false;
+
+  // Both instructions must be in the same basic block.
+  if (First->getParent() != Last->getParent())
+    return false;
+
+  // Walk forward from First, looking for stores or calls before we reach Last.
+  const MachineBasicBlock *MBB = First->getParent();
+  auto InstrIt = First->getIterator();
+  auto LastIt = Last->getIterator();
+
+  for (; InstrIt != MBB->end(); ++InstrIt) {
+    if (InstrIt == LastIt)
+      break;
+
+    // Check for stores or calls that could interfere.
+    if (InstrIt->mayStore() || InstrIt->isCall())
+      return true;
+  }
+
+  // If we reached the end of the basic block, our instructions must not have
+  // been ordered correctly and the analysis is invalid.
+  assert(InstrIt != MBB->end() &&
+         "Got bad machine instructions, First should come before Last!");
+  return false;
+}
+
+/// Check if the given instruction forms a gather load pattern that can be
+/// optimized for better Memory-Level Parallelism (MLP). This function
+/// identifies chains of NEON lane load instructions that load data from
+/// different memory addresses into individual lanes of a 128-bit vector
+/// register, then attempts to split the pattern into parallel loads to break
+/// the serial dependency between instructions.
+///
+/// Pattern Matched:
+///   Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
+///   LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
+///
+/// Transformed Into:
+///   Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
+///   to combine the results, enabling better memory-level parallelism.
+///
+/// Supported Element Types:
+///   - 32-bit elements (LD1i32, 4 lanes total)
+///   - 16-bit elements (LD1i16, 8 lanes total)
+///   - 8-bit elements (LD1i8, 16 lanes total)
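+///
+/// Schematic example (illustrative only; register choices x0..x3, v0, v1 are
+/// arbitrary), for the 4 x 32-bit case. A gathered chain such as:
+///   ldr  s0, [x0]             // lane 0 via SUBREG_TO_REG
+///   ld1  { v0.s }[1], [x1]    // lane 1
+///   ld1  { v0.s }[2], [x2]    // lane 2
+///   ld1  { v0.s }[3], [x3]    // lane 3 (Root)
+/// is rewritten along the lines of:
+///   ldr  s0, [x0]             // lanes 0-1 into v0
+///   ld1  { v0.s }[1], [x1]
+///   ldr  s1, [x2]             // lanes 2-3 into v1
+///   ld1  { v1.s }[1], [x3]
+///   zip1 v0.2d, v0.2d, v1.2d  // combine the two halves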
+static bool getGatherPattern(MachineInstr &Root,
+                             SmallVectorImpl<unsigned> &Patterns,
+                             unsigned LoadLaneOpCode, unsigned NumLanes) {
+  const MachineFunction *MF = Root.getMF();
+
+  // Early exit if optimizing for size.
+  if (MF->getFunction().hasMinSize())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
+    return false;
+
+  // Check that we have loads into all lanes except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  //    register).
+  // 2. Its addressing mode uses only a single pointer operand.
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
+  SmallSet<const MachineInstr *, 16> LoadInstrs = {};
+  while (!RemainingLanes.empty() && CurrInstr &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    LoadInstrs.insert(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  // Check that we have found a match for lanes N-1 down to 1.
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence.
+  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the SUBREG_TO_REG loads an integer into the first lane.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+    return false;
+
+  // Verify that it also has a single non-debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  LoadInstrs.insert(MRI.getUniqueVRegDef(Lane0LoadReg));
+
+  // Check for intervening stores or calls between the first and last load.
+  // Sort load instructions by program order.
+  SmallVector<const MachineInstr *, 16> SortedLoads(LoadInstrs.begin(),
+                                                    LoadInstrs.end());
+  llvm::sort(SortedLoads, [](const MachineInstr *A, const MachineInstr *B) {
+    if (A->getParent() != B->getParent()) {
----------------
jcohen-apple wrote:

Good catch. I added this to get something working but forgot to get back to it.
I added a parameter to limit the number of instructions searched, and implemented an iteration that goes up the basic block starting from Root, exiting once all loads have been encountered or we have hit the limit.
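
For reference, a rough sketch of what that bounded upward scan could look like (illustrative only, not the actual updated patch; the MaxInstrsToSearch parameter and the exact signature are made up here):

```cpp
// Sketch: walk backwards from Root with a search budget, instead of sorting
// the loads and scanning forward. Stop as soon as every load in LoadInstrs
// has been seen; give up conservatively if the budget runs out or the top of
// the block is reached first.
static bool hasInterveningStoreOrCall(
    const MachineInstr &Root,
    const SmallSet<const MachineInstr *, 16> &LoadInstrs,
    unsigned MaxInstrsToSearch) {
  unsigned RemainingLoads = LoadInstrs.size();
  if (RemainingLoads == 0)
    return false;

  unsigned Visited = 0;
  for (const MachineInstr *MI = Root.getPrevNode(); MI; MI = MI->getPrevNode()) {
    if (++Visited > MaxInstrsToSearch)
      return true; // Budget exhausted, bail out conservatively.
    if (LoadInstrs.count(MI)) {
      if (--RemainingLoads == 0)
        return false; // Every load seen, nothing harmful in between.
      continue;
    }
    if (MI->mayStore() || MI->isCall())
      return true;
  }
  // Hit the start of the block before seeing all the loads.
  return true;
}
```

Exiting as soon as the last load is seen keeps the scan proportional to the distance between Root and the farthest load rather than to the block size.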

https://github.com/llvm/llvm-project/pull/152979


More information about the llvm-commits mailing list