[llvm] [AArch64][MachineCombiner] Combine sequences of gather patterns (PR #152979)

Jonathan Cohen via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 17 04:56:36 PDT 2025


================
@@ -7412,11 +7413,347 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
+/// Check if there are any stores or calls between two instructions in the same
+/// basic block.
+static bool hasInterveningStoreOrCall(const MachineInstr *First,
+                                      const MachineInstr *Last) {
+  if (!First || !Last || First == Last)
+    return false;
+
+  // Both instructions must be in the same basic block.
+  if (First->getParent() != Last->getParent())
+    return false;
+
+  // Walk forward from First; the assert after the loop verifies that First
+  // actually comes before Last.
+  const MachineBasicBlock *MBB = First->getParent();
+  auto InstrIt = First->getIterator();
+  auto LastIt = Last->getIterator();
+
+  for (; InstrIt != MBB->end(); ++InstrIt) {
+    if (InstrIt == LastIt)
+      break;
+
+    // Check for stores or calls that could interfere with the loads.
+    if (InstrIt->mayStore() || InstrIt->isCall())
+      return true;
+  }
+
+  // If we reached the end of the basic block without reaching Last, the
+  // instructions were not ordered correctly and the analysis is invalid.
+  assert(InstrIt != MBB->end() &&
+         "Got bad machine instructions, First should come before Last!");
+  return false;
+}
+
+/// Check if the given instruction forms a gather load pattern that can be
+/// optimized for better Memory-Level Parallelism (MLP). This function
+/// identifies chains of NEON lane load instructions that load data from
+/// different memory addresses into individual lanes of a 128-bit vector
+/// register, then attempts to split the pattern into parallel loads to break
+/// the serial dependency between instructions.
+///
+/// Pattern Matched:
+///   Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
+///   LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
+///
+/// Transformed Into:
+///   Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
+///   to combine the results, enabling better memory-level parallelism.
+///
+/// Supported Element Types:
+///   - 32-bit elements (LD1i32, 4 lanes total)
+///   - 16-bit elements (LD1i16, 8 lanes total)
+///   - 8-bit elements (LD1i8, 16 lanes total)
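+///
+/// For illustration, a rough MIR sketch of the matched 32-bit case (virtual
+/// register names, the scalar-load opcode, and the address operands are
+/// placeholders, not output of this pass):
+///
+///   %s0 = <scalar 32-bit load> [ptr0]   ; value for lane 0
+///   %v0 = SUBREG_TO_REG 0, %s0, ssub    ; scalar placed into lane 0
+///   %v1 = LD1i32 %v0, 1, [ptr1]         ; lane 1
+///   %v2 = LD1i32 %v1, 2, [ptr2]         ; lane 2
+///   %v3 = LD1i32 %v2, 3, [ptr3]         ; lane 3 (Root)
+///
+/// After the rewrite, lanes 0-1 and lanes 2-3 are produced by two independent
+/// load chains whose results are combined with ZIP1v2i64.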
+static bool getGatherPattern(MachineInstr &Root,
+                             SmallVectorImpl<unsigned> &Patterns,
+                             unsigned LoadLaneOpCode, unsigned NumLanes) {
+  const MachineFunction *MF = Root.getMF();
+
+  // Early exit if optimizing for minimum code size.
+  if (MF->getFunction().hasMinSize())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
+    return false;
+
+  // Check that we have a load into every lane except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  //    register).
+  // 2. Its addressing mode only uses a single pointer operand.
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
+  SmallSet<const MachineInstr *, 16> LoadInstrs;
+  while (!RemainingLanes.empty() && CurrInstr &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    LoadInstrs.insert(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  // Check that we have found loads for lanes N-2 down to 1 (the root itself
+  // covers lane N-1).
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG that places the initial scalar load into lane 0.
+  if (!CurrInstr || CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the SUBREG_TO_REG inserts a scalar of single-lane width into
+  // lane 0.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+    return false;
+
+  // Verify that the lane 0 load also has a single non-debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  LoadInstrs.insert(MRI.getUniqueVRegDef(Lane0LoadReg));
+
+  // Sort the loads by program order so we can check for intervening stores or
+  // calls between the first and last load.
+  SmallVector<const MachineInstr *, 16> SortedLoads(LoadInstrs.begin(),
+                                                    LoadInstrs.end());
+  llvm::sort(SortedLoads, [](const MachineInstr *A, const MachineInstr *B) {
+    if (A->getParent() != B->getParent()) {
+      // Gather patterns never span basic blocks, so instructions from
+      // different blocks are treated as unordered.
+      return false;
+    }
+    // Compare positions within the same basic block.
+    for (const MachineInstr &MI : *A->getParent()) {
+      if (&MI == A)
+        return true;
+      if (&MI == B)
+        return false;
+    }
+    return false;
+  });
+
+  const MachineInstr *FirstLoad = SortedLoads.front();
+  const MachineInstr *LastLoad = SortedLoads.back();
+
+  if (hasInterveningStoreOrCall(FirstLoad, LastLoad))
+    return false;
+
+  switch (NumLanes) {
+  case 4:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+    break;
+  case 8:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+    break;
+  case 16:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+    break;
+  default:
+    llvm_unreachable("Got bad number of lanes for gather pattern.");
+  }
+
+  return true;
+}
+
+/// Search for patterns of LD instructions we can optimize.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+
+  // Gather patterns search for chains of loads into single vector lanes.
+  switch (Root.getOpcode()) {
+  case AArch64::LD1i32:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+  case AArch64::LD1i16:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+  case AArch64::LD1i8:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+  default:
+    return false;
+  }
+}
+
+/// Generate optimized instruction sequence for gather load patterns to improve
+/// Memory-Level Parallelism (MLP). This function transforms a chain of
+/// sequential NEON lane loads into parallel vector loads that can execute
+/// concurrently.
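+///
+/// Sketch of the intended output shape for the 32-bit case (names are
+/// illustrative only):
+///
+///   %lo  = <chain loading lanes 0..1 into the low half of one register>
+///   %hi  = <chain loading lanes 2..3 into the low half of another register>
+///   %res = ZIP1v2i64 %lo, %hi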
+static void
+generateGatherPattern(MachineInstr &Root,
----------------
jcohen-apple wrote:

Done

https://github.com/llvm/llvm-project/pull/152979

