[llvm] [AArch64][Machine-Combiner] Split loads into lanes of neon vectors into multiple vectors when possible (PR #142941)

Jonathan Cohen via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 13 05:03:05 PDT 2025


https://github.com/jcohen-apple updated https://github.com/llvm/llvm-project/pull/142941

>From 50209a0ec70ee389abc5f4609668a775226da483 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 1 Jun 2025 11:10:48 +0300
Subject: [PATCH 1/7] Initial unit test to demonstrate current behavior
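
This test records the current behavior: each LD1i32 lane load reads and
writes the whole 128-bit register, so every load in the chain depends on
the one before it and the four loads execute serially. Roughly, in
assembly (register names and addressing modes simplified for illustration,
not the literal output):

    ldr   s0, [x0]            // lane 0, zeroes the upper lanes
    ld1   { v0.s }[1], [x1]   // must wait for s0
    ld1   { v0.s }[2], [x2]   // must wait for lane 1
    ld1   { v0.s }[3], [x3]   // must wait for lane 2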

---
 .../AArch64/aarch64-combine-split-loads.mir   | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir

diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
new file mode 100644
index 0000000000000..3188a9d556dc9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+
+---
+name:            split_loads_to_fpr128
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, killed %2
+    %8:fpr128 = LD1i32 %7, 2, killed %3
+    %9:fpr128 = LD1i32 %8, 3, killed %4
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0

>From 01933f68dcfb8892d92644552887b3d9896f34fa Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Wed, 4 Jun 2025 09:39:50 +0300
Subject: [PATCH 2/7] Apply pattern to basic case of 4 i32 loads into fpr128
 register
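
The combiner now splits the serial chain into two independent halves and
merges them with a zip. A rough sketch of the intended result (register
names and addressing modes simplified, not the literal pass output):

    ldr   s0, [x0]            // chain 0, lane 0
    ld1   { v0.s }[1], [x1]   // chain 0, lane 1
    ldr   s1, [x2]            // chain 1, lane 0
    ld1   { v1.s }[1], [x3]   // chain 1, lane 1
    zip1  v0.2d, v0.2d, v1.2d // join the low 64 bits of each half

The two ldr/ld1 pairs do not depend on each other, so the loads can issue
in parallel; the final ZIP1 rebuilds the original 128-bit vector.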

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 137 ++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   2 +
 .../AArch64/aarch64-combine-split-loads.mir   |  68 ++++++++-
 3 files changed, 199 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c1474773faa76..7a2623320f53e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -7327,6 +7329,7 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+  case AArch64MachineCombinerPattern::SPLIT_LD:
     return true;
   } // end switch (Pattern)
   return false;
@@ -7367,11 +7370,64 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
+/// Search for patterns where we use LD1i32 instructions to load into
+/// 4 separate lanes of a 128 bit Neon register. We can increase ILP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+  const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+  const TargetRegisterInfo *TRI =
+      Root.getMF()->getSubtarget().getRegisterInfo();
+  // Enable this only on Darwin targets, where it should be profitable. Other
+  // targets can remove this check if it is profitable there as well.
+  if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+    return false;
+
+  // The pattern searches for loads into single lanes.
+  if (Root.getOpcode() != AArch64::LD1i32)
+    return false;
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != 3)
+    return false;
+
+  // Check that we have loads into all lanes except lane 0.
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  SmallSet<unsigned, 4> RemainingLanes({1, 2});
+  while (RemainingLanes.begin() != RemainingLanes.end() &&
+         CurrInstr->getOpcode() == AArch64::LD1i32 &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence.
+  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the subreg to reg loads an i32 into the first lane.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
+    return false;
+
+  // Verify that it also has a single non debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+  return true;
+}
+
 CombinerObjective
 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
+  case AArch64MachineCombinerPattern::SPLIT_LD:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7401,6 +7457,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
   if (getMiscPatterns(Root, Patterns))
     return true;
 
+  // Load patterns
+  if (getLoadPatterns(Root, Patterns))
+    return true;
+
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                      DoRegPressureReduce);
 }
@@ -8656,6 +8716,83 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
     break;
   }
+  case AArch64MachineCombinerPattern::SPLIT_LD: {
+    // Gather the initial load instructions to build the pattern
+    MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+    MachineInstr *Lane1Load =
+        MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
+    MachineInstr *SubregToReg =
+        MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
+    MachineInstr *Lane0Load = 
+        MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
+    
+    const TargetRegisterClass *FPR128RegClass =
+        MRI.getRegClass(Root.getOperand(0).getReg());
+
+    auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+                                  Register SrcRegister, unsigned Lane,
+                                  Register OffsetRegister) {
+      auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+      MachineInstrBuilder LoadIndexIntoRegister =
+          BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+                  NewRegister)
+              .addReg(SrcRegister)
+              .addImm(Lane)
+              .addReg(OffsetRegister, getKillRegState(true));
+      InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+      InsInstrs.push_back(LoadIndexIntoRegister);
+      return NewRegister;
+    };
+
+    // Helper to create load instruction based on opcode
+    auto CreateLoadInstruction = [&](unsigned Opcode, Register DestReg, 
+                                    Register OffsetReg) -> MachineInstrBuilder {
+          return BuildMI(MF, MIMetadata(Root), TII->get(AArch64::LDRSui), DestReg)
+              .addReg(OffsetReg)
+              .addImm(0); // immediate offset
+    };
+
+    // Load index 1 into register 0 lane 1
+    Register Index1LoadReg =
+        LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1,
+                          Lane1Load->getOperand(3).getReg());
+    DelInstrs.push_back(Lane1Load);
+
+    // Load index 2 into register 1 lane 0
+    auto DestRegForIndex2 = MRI.createVirtualRegister(
+        MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+    
+    MachineInstrBuilder Index2LoadInstr = CreateLoadInstruction(
+        Lane0Load->getOpcode(), DestRegForIndex2, 
+        Lane2Load->getOperand(3).getReg());
+    
+    InstrIdxForVirtReg.insert(std::make_pair(DestRegForIndex2, InsInstrs.size()));
+    InsInstrs.push_back(Index2LoadInstr);
+    DelInstrs.push_back(Lane2Load);
+
+    // Convert fpr32 to fpr128 using subreg
+    auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+    auto SubRegToRegInstr = BuildMI(MF, MIMetadata(Root), 
+                                  TII->get(SubregToReg->getOpcode()), 
+                                  DestRegForSubregToReg)
+        .addImm(0)
+        .addReg(DestRegForIndex2, getKillRegState(true))
+        .addImm(AArch64::ssub);
+    InstrIdxForVirtReg.insert(std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+    InsInstrs.push_back(SubRegToRegInstr);
+
+    // Load index 3 into register 1 lane 1
+    auto Index3LoadReg = LoadLaneToRegister(&Root, DestRegForSubregToReg, 1,
+                                            Root.getOperand(3).getReg());
+
+    // Create the final zip instruction to combine the results
+    MachineInstrBuilder ZipInstr =
+        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+                Root.getOperand(0).getReg())
+            .addReg(Index1LoadReg)
+            .addReg(Index3LoadReg);
+    InsInstrs.push_back(ZipInstr);
+  }
 
   } // end switch (Pattern)
   // Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4b..c45e8e0a43a2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
   FMULv8i16_indexed_OP2,
 
   FNMADD,
+
+  SPLIT_LD,
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index 3188a9d556dc9..a9c23d0100d35 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64e-apple-darwin -verify-machineinstrs %s -o - | FileCheck %s
 
 ---
 name:            split_loads_to_fpr128
@@ -8,17 +8,19 @@ body:             |
     liveins: $x0, $x1, $x2, $x3, $x4
 
     ; CHECK-LABEL: name: split_loads_to_fpr128
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
-    ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
-    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
-    ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
-    ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
-    ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
-    ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] 
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:gpr64common = COPY $x0
     %1:gpr64common = COPY $x1
@@ -32,3 +34,53 @@ body:             |
     %9:fpr128 = LD1i32 %8, 3, killed %4
     $q0 = COPY %9
     RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_ui
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_ui
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] 
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSui %0, 0
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, killed %1
+    %8:fpr128 = LD1i32 %7, 2, killed %2
+    %9:fpr128 = LD1i32 %8, 3, killed %3
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0
+
+---
+name:            negative_pattern
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: negative_pattern
+    ; CHECK:      [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
+    ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
+  
+    %0:gpr64common = COPY $x0
+    %1:fpr128 = LDRQui $x1, 0
+    %2:fpr128 = LD1i32 %1, 3, %0
+    $q0 = COPY %2
+    RET_ReallyLR implicit $q0

>From e3f9d7dd2afd6f0efdb04c0f1a136df6d8f8cec9 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 6 Jul 2025 15:09:53 +0300
Subject: [PATCH 3/7] Support additional data types
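
The same split now applies to LD1i16 (8 lanes) and LD1i8 (16 lanes):
lanes 0 .. NumLanes/2-1 stay in the first register, lanes NumLanes/2 ..
NumLanes-1 are rebuilt in a second register starting from a plain scalar
load (LDRHui/LDRBui), and a ZIP1v2i64 joins the low 64-bit halves. A
simplified sketch for the i16 case (not the literal pass output):

    ldr   h0, [x0]            // register 0, original lane 0
    ld1   { v0.h }[1], [x1]
    ld1   { v0.h }[2], [x2]
    ld1   { v0.h }[3], [x3]
    ldr   h1, [x4]            // register 1, original lane 4
    ld1   { v1.h }[1], [x5]
    ld1   { v1.h }[2], [x6]
    ld1   { v1.h }[3], [x7]
    zip1  v0.2d, v0.2d, v1.2d // lanes 0-3 and 4-7 back in one vector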

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 304 ++++++++++++------
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   4 +-
 .../AArch64/aarch64-combine-split-loads.mir   | 184 ++++++++++-
 3 files changed, 393 insertions(+), 99 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7a2623320f53e..c00b96152aa7d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7329,7 +7329,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
-  case AArch64MachineCombinerPattern::SPLIT_LD:
+  case AArch64MachineCombinerPattern::GATHER_i32:
+  case AArch64MachineCombinerPattern::GATHER_i16:
+  case AArch64MachineCombinerPattern::GATHER_i8:
     return true;
   } // end switch (Pattern)
   return false;
@@ -7370,32 +7372,27 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
-/// Search for patterns where we use LD1i32 instructions to load into
-/// 4 separate lanes of a 128 bit Neon register. We can increase ILP
-/// by loading into 2 Neon registers instead.
-static bool getLoadPatterns(MachineInstr &Root,
-                            SmallVectorImpl<unsigned> &Patterns) {
+static bool getGatherPattern(MachineInstr &Root,
+                                SmallVectorImpl<unsigned> &Patterns,
+                                unsigned LoadLaneOpCode,
+                                unsigned NumLanes) {
   const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
   const TargetRegisterInfo *TRI =
       Root.getMF()->getSubtarget().getRegisterInfo();
-  // Enable this only on Darwin targets, where it should be profitable. Other
-  // targets can remove this check if it is profitable there as well.
-  if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
-    return false;
-
-  // The pattern searches for loads into single lanes.
-  if (Root.getOpcode() != AArch64::LD1i32)
-    return false;
 
   // The root of the pattern must load into the last lane of the vector.
-  if (Root.getOperand(2).getImm() != 3)
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
     return false;
 
   // Check that we have loads into all lanes except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual register)
+  // 2. Its addressing mode only uses a single offset register.
   auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
-  SmallSet<unsigned, 4> RemainingLanes({1, 2});
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
   while (RemainingLanes.begin() != RemainingLanes.end() &&
-         CurrInstr->getOpcode() == AArch64::LD1i32 &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
          MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
          CurrInstr->getNumOperands() == 4) {
     RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
@@ -7409,25 +7406,202 @@ static bool getLoadPatterns(MachineInstr &Root,
   if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
     return false;
 
-  // Verify that the subreg to reg loads an i32 into the first lane.
+  // Verify that the subreg to reg loads an integer into the first lane.
   auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
-  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
     return false;
 
   // Verify that it also has a single non debug use.
   if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
     return false;
 
-  Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+  switch (NumLanes) {
+    case 4:
+      Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+      break;
+    case 8:
+      Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+      break;
+    case 16:
+      Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+      break;
+    default:
+      llvm_unreachable("Got bad number of lanes for gather pattern.");
+  }
+
   return true;
 }
 
+/// Search for patterns where we use LD1 instructions to load into
+/// separate lanes of a 128 bit Neon register. We can increase MLP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+  // Enable this only on Darwin targets, where it should be profitable. Other
+  // targets can remove this check if it is profitable there as well.
+  if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+    return false;
+
+  // The pattern searches for loads into single lanes.
+  switch (Root.getOpcode()) {
+    case AArch64::LD1i32:
+      return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+    case AArch64::LD1i16:
+      return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+    case AArch64::LD1i8:
+      return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+    default:
+      return false;
+  }
+}
+
+static void generateGatherPattern(
+    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
+    unsigned NumLanes) {
+  
+  MachineFunction &MF = *Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  // Gather the initial load instructions to build the pattern
+  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+  MachineInstr *CurrInstr = &Root;
+  for (unsigned i = 0; i < NumLanes - 1; ++i) {
+    LoadToLaneInstrs.push_back(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+  
+  MachineInstr *SubregToReg = CurrInstr;
+  LoadToLaneInstrs.push_back(
+      MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+  auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+  const TargetRegisterClass *FPR128RegClass =
+      MRI.getRegClass(Root.getOperand(0).getReg());
+
+  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+                                Register SrcRegister, unsigned Lane,
+                                Register OffsetRegister) {
+    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+    MachineInstrBuilder LoadIndexIntoRegister =
+        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+                NewRegister)
+            .addReg(SrcRegister)
+            .addImm(Lane)
+            .addReg(OffsetRegister, getKillRegState(true));
+    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+    InsInstrs.push_back(LoadIndexIntoRegister);
+    return NewRegister;
+  };
+
+  // Helper to create load instruction based on opcode
+  auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, 
+                                  Register OffsetReg) -> MachineInstrBuilder {
+      unsigned Opcode;
+      switch (NumLanes) {
+        case 4:
+          Opcode = AArch64::LDRSui;
+          break;
+        case 8:
+          Opcode = AArch64::LDRHui;
+          break;
+        case 16:
+          Opcode = AArch64::LDRBui;
+          break;
+        default:
+          llvm_unreachable("Got unsupported number of lanes in machine-combiner gather pattern");
+      }
+      // Immediate offset load
+      return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+            .addReg(OffsetReg)
+            .addImm(0); // immediate offset
+  };
+
+  // Load the remaining lanes into register 0.
+  auto LanesToLoadToReg0 =
+      llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+                       LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+  auto PrevReg = SubregToReg->getOperand(0).getReg();
+  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+    DelInstrs.push_back(LoadInstr);
+  }
+  auto LastLoadReg0 = PrevReg;
+
+  // First load into register 1. Perform a LDRSui to zero out the upper lanes in a single instruction.
+  auto Lane0Load = *LoadToLaneInstrsAscending.begin();
+  auto OriginalSplitLoad = *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+  auto DestRegForMiddleIndex = MRI.createVirtualRegister(
+      MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+  
+  MachineInstrBuilder MiddleIndexLoadInstr = CreateLoadInstruction(
+      NumLanes, DestRegForMiddleIndex, 
+      OriginalSplitLoad->getOperand(3).getReg());
+  
+  InstrIdxForVirtReg.insert(std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+  InsInstrs.push_back(MiddleIndexLoadInstr);
+  DelInstrs.push_back(OriginalSplitLoad);
+
+  // Subreg To Reg instruction for register 1.
+  auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+  unsigned SubregType;
+  switch (NumLanes) {
+    case 4:
+      SubregType = AArch64::ssub;
+      break;
+    case 8:
+      SubregType = AArch64::hsub;
+      break;
+    case 16:
+      SubregType = AArch64::bsub;
+      break;
+    default:
+      llvm_unreachable("Got invalid NumLanes for machine-combiner gather pattern");
+  }
+
+  auto SubRegToRegInstr =
+      BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+              DestRegForSubregToReg)
+          .addImm(0)
+          .addReg(DestRegForMiddleIndex, getKillRegState(true))
+          .addImm(SubregType);
+  InstrIdxForVirtReg.insert(
+      std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+  InsInstrs.push_back(SubRegToRegInstr);
+
+  // Load remaining lanes into register 1.
+  auto LanesToLoadToReg1 = llvm::make_range(
+      LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, LoadToLaneInstrsAscending.end());
+  PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+    if (Index == NumLanes / 2 - 2) {
+      break;
+    }
+    DelInstrs.push_back(LoadInstr);
+  }
+  auto LastLoadReg1 = PrevReg;
+
+  // Create the final zip instruction to combine the results.
+  MachineInstrBuilder ZipInstr =
+      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+              Root.getOperand(0).getReg())
+          .addReg(LastLoadReg0)
+          .addReg(LastLoadReg1);
+  InsInstrs.push_back(ZipInstr);
+}
+
 CombinerObjective
 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
-  case AArch64MachineCombinerPattern::SPLIT_LD:
+  case AArch64MachineCombinerPattern::GATHER_i32:
+  case AArch64MachineCombinerPattern::GATHER_i16:
+  case AArch64MachineCombinerPattern::GATHER_i8:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -8716,82 +8890,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
     break;
   }
-  case AArch64MachineCombinerPattern::SPLIT_LD: {
-    // Gather the initial load instructions to build the pattern
-    MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
-    MachineInstr *Lane1Load =
-        MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
-    MachineInstr *SubregToReg =
-        MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
-    MachineInstr *Lane0Load = 
-        MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
-    
-    const TargetRegisterClass *FPR128RegClass =
-        MRI.getRegClass(Root.getOperand(0).getReg());
-
-    auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
-                                  Register SrcRegister, unsigned Lane,
-                                  Register OffsetRegister) {
-      auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
-      MachineInstrBuilder LoadIndexIntoRegister =
-          BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
-                  NewRegister)
-              .addReg(SrcRegister)
-              .addImm(Lane)
-              .addReg(OffsetRegister, getKillRegState(true));
-      InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
-      InsInstrs.push_back(LoadIndexIntoRegister);
-      return NewRegister;
-    };
-
-    // Helper to create load instruction based on opcode
-    auto CreateLoadInstruction = [&](unsigned Opcode, Register DestReg, 
-                                    Register OffsetReg) -> MachineInstrBuilder {
-          return BuildMI(MF, MIMetadata(Root), TII->get(AArch64::LDRSui), DestReg)
-              .addReg(OffsetReg)
-              .addImm(0); // immediate offset
-    };
-
-    // Load index 1 into register 0 lane 1
-    Register Index1LoadReg =
-        LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1,
-                          Lane1Load->getOperand(3).getReg());
-    DelInstrs.push_back(Lane1Load);
-
-    // Load index 2 into register 1 lane 0
-    auto DestRegForIndex2 = MRI.createVirtualRegister(
-        MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-    
-    MachineInstrBuilder Index2LoadInstr = CreateLoadInstruction(
-        Lane0Load->getOpcode(), DestRegForIndex2, 
-        Lane2Load->getOperand(3).getReg());
-    
-    InstrIdxForVirtReg.insert(std::make_pair(DestRegForIndex2, InsInstrs.size()));
-    InsInstrs.push_back(Index2LoadInstr);
-    DelInstrs.push_back(Lane2Load);
-
-    // Convert fpr32 to fpr128 using subreg
-    auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
-    auto SubRegToRegInstr = BuildMI(MF, MIMetadata(Root), 
-                                  TII->get(SubregToReg->getOpcode()), 
-                                  DestRegForSubregToReg)
-        .addImm(0)
-        .addReg(DestRegForIndex2, getKillRegState(true))
-        .addImm(AArch64::ssub);
-    InstrIdxForVirtReg.insert(std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
-    InsInstrs.push_back(SubRegToRegInstr);
-
-    // Load index 3 into register 1 lane 1
-    auto Index3LoadReg = LoadLaneToRegister(&Root, DestRegForSubregToReg, 1,
-                                            Root.getOperand(3).getReg());
-
-    // Create the final zip instruction to combine the results
-    MachineInstrBuilder ZipInstr =
-        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
-                Root.getOperand(0).getReg())
-            .addReg(Index1LoadReg)
-            .addReg(Index3LoadReg);
-    InsInstrs.push_back(ZipInstr);
+  case AArch64MachineCombinerPattern::GATHER_i32: {
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                          Pattern, 4);
+    break;
+  }
+  case AArch64MachineCombinerPattern::GATHER_i16: {
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 8);
+    break;
+  }
+  case AArch64MachineCombinerPattern::GATHER_i8: {
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 16);
+    break;
   }
 
   } // end switch (Pattern)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index c45e8e0a43a2e..3850e2cfecf4e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -173,7 +173,9 @@ enum AArch64MachineCombinerPattern : unsigned {
 
   FNMADD,
 
-  SPLIT_LD,
+  GATHER_i32,
+  GATHER_i16,
+  GATHER_i8
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index a9c23d0100d35..04cc9c4a7cfbf 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mtriple=arm64e-apple-darwin -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -verify-machineinstrs %s -o - | FileCheck %s
 
 ---
 name:            split_loads_to_fpr128
@@ -69,6 +69,188 @@ body:             |
     $q0 = COPY %9
     RET_ReallyLR implicit $q0
 
+---
+name:            split_loads_to_fpr128_i16
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_i16
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+    ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]]
+    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]]
+    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:gpr64common = COPY $x5
+    %6:gpr64common = COPY $x6
+    %7:gpr64common = COPY $x7
+    %8:gpr64common = COPY $x8
+    %9:fpr16 = LDRHroX %0, killed %1, 0, 1
+    %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
+    %11:fpr128 = LD1i16 %10, 1, killed %2
+    %12:fpr128 = LD1i16 %11, 2, killed %3
+    %13:fpr128 = LD1i16 %12, 3, killed %4
+    %14:fpr128 = LD1i16 %13, 4, killed %5
+    %15:fpr128 = LD1i16 %14, 5, killed %6
+    %16:fpr128 = LD1i16 %15, 6, killed %7
+    %17:fpr128 = LD1i16 %16, 7, killed %8
+    $q0 = COPY %17
+    RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_i16_ui
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+    ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]]
+    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]]
+    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]]
+    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:gpr64common = COPY $x5
+    %6:gpr64common = COPY $x6
+    %7:gpr64common = COPY $x7
+    %8:gpr64common = COPY $x8
+    %9:fpr16 = LDRHui %0, 0
+    %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
+    %11:fpr128 = LD1i16 %10, 1, killed %1
+    %12:fpr128 = LD1i16 %11, 2, killed %2
+    %13:fpr128 = LD1i16 %12, 3, killed %3
+    %14:fpr128 = LD1i16 %13, 4, killed %4
+    %15:fpr128 = LD1i16 %14, 5, killed %5
+    %16:fpr128 = LD1i16 %15, 6, killed %6
+    %17:fpr128 = LD1i16 %16, 7, killed %7
+    $q0 = COPY %17
+    RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_i8
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_i8
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9
+    ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10
+    ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11
+    ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12
+    ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13
+    ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14
+    ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15
+    ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16
+    ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]]
+    ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]]
+    ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]]
+    ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]]
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]]
+    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]]
+    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]]
+    ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]]
+    ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]]
+    ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]]
+    ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:gpr64common = COPY $x5
+    %6:gpr64common = COPY $x6
+    %7:gpr64common = COPY $x7
+    %8:gpr64common = COPY $x8
+    %9:gpr64common = COPY $x9
+    %10:gpr64common = COPY $x10
+    %11:gpr64common = COPY $x11
+    %12:gpr64common = COPY $x12
+    %13:gpr64common = COPY $x13
+    %14:gpr64common = COPY $x14
+    %15:gpr64common = COPY $x15
+    %16:gpr64common = COPY $x16
+    %17:fpr8 = LDRBroX %0, killed %1, 0, 0
+    %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub
+    %19:fpr128 = LD1i8 %18, 1, killed %2
+    %20:fpr128 = LD1i8 %19, 2, killed %3
+    %21:fpr128 = LD1i8 %20, 3, killed %4
+    %22:fpr128 = LD1i8 %21, 4, killed %5
+    %23:fpr128 = LD1i8 %22, 5, killed %6
+    %24:fpr128 = LD1i8 %23, 6, killed %7
+    %25:fpr128 = LD1i8 %24, 7, killed %8
+    %26:fpr128 = LD1i8 %25, 8, killed %9
+    %27:fpr128 = LD1i8 %26, 9, killed %10
+    %28:fpr128 = LD1i8 %27, 10, killed %11
+    %29:fpr128 = LD1i8 %28, 11, killed %12
+    %30:fpr128 = LD1i8 %29, 12, killed %13
+    %31:fpr128 = LD1i8 %30, 13, killed %14
+    %32:fpr128 = LD1i8 %31, 14, killed %15
+    %33:fpr128 = LD1i8 %32, 15, killed %16
+    $q0 = COPY %33
+    RET_ReallyLR implicit $q0
+
 ---
 name:            negative_pattern
 body:             |

>From 6f301b8223640c144976f97e515838fb453da535 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Tue, 8 Jul 2025 22:06:31 +0300
Subject: [PATCH 4/7] Remove check for isOSDarwin()

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c00b96152aa7d..72e5ff6010f07 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7438,10 +7438,6 @@ static bool getGatherPattern(MachineInstr &Root,
 /// by loading into 2 Neon registers instead.
 static bool getLoadPatterns(MachineInstr &Root,
                             SmallVectorImpl<unsigned> &Patterns) {
-  // Enable this only on Darwin targets, where it should be profitable. Other
-  // targets can remove this check if it is profitable there as well.
-  if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
-    return false;
 
   // The pattern searches for loads into single lanes.
   switch (Root.getOpcode()) {

>From d92972e8b21da3616cfb0cadd848e866f8b3320c Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Tue, 8 Jul 2025 22:07:48 +0300
Subject: [PATCH 5/7] Formatting changes

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 161 ++++++++++---------
 1 file changed, 86 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 72e5ff6010f07..9cf17d03f7288 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7373,9 +7373,8 @@ static bool getMiscPatterns(MachineInstr &Root,
 }
 
 static bool getGatherPattern(MachineInstr &Root,
-                                SmallVectorImpl<unsigned> &Patterns,
-                                unsigned LoadLaneOpCode,
-                                unsigned NumLanes) {
+                             SmallVectorImpl<unsigned> &Patterns,
+                             unsigned LoadLaneOpCode, unsigned NumLanes) {
   const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
   const TargetRegisterInfo *TRI =
       Root.getMF()->getSubtarget().getRegisterInfo();
@@ -7386,7 +7385,8 @@ static bool getGatherPattern(MachineInstr &Root,
 
   // Check that we have loads into all lanes except lane 0.
   // For each load we also want to check that:
-  // 1. It has a single non-debug use (since we will be replacing the virtual register)
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  // register)
   // 2. Its addressing mode only uses a single offset register.
   auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
   auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
@@ -7417,17 +7417,17 @@ static bool getGatherPattern(MachineInstr &Root,
     return false;
 
   switch (NumLanes) {
-    case 4:
-      Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
-      break;
-    case 8:
-      Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
-      break;
-    case 16:
-      Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
-      break;
-    default:
-      llvm_unreachable("Got bad number of lanes for gather pattern.");
+  case 4:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+    break;
+  case 8:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+    break;
+  case 16:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+    break;
+  default:
+    llvm_unreachable("Got bad number of lanes for gather pattern.");
   }
 
   return true;
@@ -7441,23 +7441,24 @@ static bool getLoadPatterns(MachineInstr &Root,
 
   // The pattern searches for loads into single lanes.
   switch (Root.getOpcode()) {
-    case AArch64::LD1i32:
-      return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
-    case AArch64::LD1i16:
-      return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
-    case AArch64::LD1i8:
-      return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
-    default:
-      return false;
+  case AArch64::LD1i32:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+  case AArch64::LD1i16:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+  case AArch64::LD1i8:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+  default:
+    return false;
   }
 }
 
-static void generateGatherPattern(
-    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
-    SmallVectorImpl<MachineInstr *> &DelInstrs,
-    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
-    unsigned NumLanes) {
-  
+static void
+generateGatherPattern(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+                      unsigned Pattern, unsigned NumLanes) {
+
   MachineFunction &MF = *Root.getParent()->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
@@ -7469,7 +7470,7 @@ static void generateGatherPattern(
     LoadToLaneInstrs.push_back(CurrInstr);
     CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
   }
-  
+
   MachineInstr *SubregToReg = CurrInstr;
   LoadToLaneInstrs.push_back(
       MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
@@ -7494,26 +7495,27 @@ static void generateGatherPattern(
   };
 
   // Helper to create load instruction based on opcode
-  auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, 
-                                  Register OffsetReg) -> MachineInstrBuilder {
-      unsigned Opcode;
-      switch (NumLanes) {
-        case 4:
-          Opcode = AArch64::LDRSui;
-          break;
-        case 8:
-          Opcode = AArch64::LDRHui;
-          break;
-        case 16:
-          Opcode = AArch64::LDRBui;
-          break;
-        default:
-          llvm_unreachable("Got unsupported number of lanes in machine-combiner gather pattern");
-      }
-      // Immediate offset load
-      return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
-            .addReg(OffsetReg)
-            .addImm(0); // immediate offset
+  auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+                                   Register OffsetReg) -> MachineInstrBuilder {
+    unsigned Opcode;
+    switch (NumLanes) {
+    case 4:
+      Opcode = AArch64::LDRSui;
+      break;
+    case 8:
+      Opcode = AArch64::LDRHui;
+      break;
+    case 16:
+      Opcode = AArch64::LDRBui;
+      break;
+    default:
+      llvm_unreachable(
+          "Got unsupported number of lanes in machine-combiner gather pattern");
+    }
+    // Immediate offset load
+    return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+        .addReg(OffsetReg)
+        .addImm(0); // immediate offset
   };
 
   // Load the remaining lanes into register 0.
@@ -7522,22 +7524,26 @@ static void generateGatherPattern(
                        LoadToLaneInstrsAscending.begin() + NumLanes / 2);
   auto PrevReg = SubregToReg->getOperand(0).getReg();
   for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
-    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                 LoadInstr->getOperand(3).getReg());
     DelInstrs.push_back(LoadInstr);
   }
   auto LastLoadReg0 = PrevReg;
 
-  // First load into register 1. Perform a LDRSui to zero out the upper lanes in a single instruction.
+  // First load into register 1. Perform a LDRSui to zero out the upper lanes in
+  // a single instruction.
   auto Lane0Load = *LoadToLaneInstrsAscending.begin();
-  auto OriginalSplitLoad = *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+  auto OriginalSplitLoad =
+      *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
   auto DestRegForMiddleIndex = MRI.createVirtualRegister(
       MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-  
-  MachineInstrBuilder MiddleIndexLoadInstr = CreateLoadInstruction(
-      NumLanes, DestRegForMiddleIndex, 
-      OriginalSplitLoad->getOperand(3).getReg());
-  
-  InstrIdxForVirtReg.insert(std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+
+  MachineInstrBuilder MiddleIndexLoadInstr =
+      CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+                            OriginalSplitLoad->getOperand(3).getReg());
+
+  InstrIdxForVirtReg.insert(
+      std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
   InsInstrs.push_back(MiddleIndexLoadInstr);
   DelInstrs.push_back(OriginalSplitLoad);
 
@@ -7545,17 +7551,18 @@ static void generateGatherPattern(
   auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
   unsigned SubregType;
   switch (NumLanes) {
-    case 4:
-      SubregType = AArch64::ssub;
-      break;
-    case 8:
-      SubregType = AArch64::hsub;
-      break;
-    case 16:
-      SubregType = AArch64::bsub;
-      break;
-    default:
-      llvm_unreachable("Got invalid NumLanes for machine-combiner gather pattern");
+  case 4:
+    SubregType = AArch64::ssub;
+    break;
+  case 8:
+    SubregType = AArch64::hsub;
+    break;
+  case 16:
+    SubregType = AArch64::bsub;
+    break;
+  default:
+    llvm_unreachable(
+        "Got invalid NumLanes for machine-combiner gather pattern");
   }
 
   auto SubRegToRegInstr =
@@ -7569,11 +7576,13 @@ static void generateGatherPattern(
   InsInstrs.push_back(SubRegToRegInstr);
 
   // Load remaining lanes into register 1.
-  auto LanesToLoadToReg1 = llvm::make_range(
-      LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, LoadToLaneInstrsAscending.end());
+  auto LanesToLoadToReg1 =
+      llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+                       LoadToLaneInstrsAscending.end());
   PrevReg = SubRegToRegInstr->getOperand(0).getReg();
   for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
-    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                 LoadInstr->getOperand(3).getReg());
     if (Index == NumLanes / 2 - 2) {
       break;
     }
@@ -8892,11 +8901,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     break;
   }
   case AArch64MachineCombinerPattern::GATHER_i16: {
-    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 8);
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                          Pattern, 8);
     break;
   }
   case AArch64MachineCombinerPattern::GATHER_i8: {
-    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 16);
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                          Pattern, 16);
     break;
   }
 

>From 81052e90b931c49614911810a51cf6d2bb79b6dd Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 13 Jul 2025 13:05:49 +0300
Subject: [PATCH 6/7] Code review comments

- Early exit if optimizing for size
- Fix loop condition to check if CurrInstr is not null
- Use .empty() instead of begin() != end()
- Rename pattern enum
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 34 +++++++++++---------
 llvm/lib/Target/AArch64/AArch64InstrInfo.h   |  6 ++--
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9cf17d03f7288..35c9cb34c2222 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7329,9 +7329,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
-  case AArch64MachineCombinerPattern::GATHER_i32:
-  case AArch64MachineCombinerPattern::GATHER_i16:
-  case AArch64MachineCombinerPattern::GATHER_i8:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return true;
   } // end switch (Pattern)
   return false;
@@ -7375,6 +7375,10 @@ static bool getMiscPatterns(MachineInstr &Root,
 static bool getGatherPattern(MachineInstr &Root,
                              SmallVectorImpl<unsigned> &Patterns,
                              unsigned LoadLaneOpCode, unsigned NumLanes) {
+  // Early exit if optimizing for size.
+  if (Root.getMF()->getFunction().hasMinSize())
+    return false;
+
   const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
   const TargetRegisterInfo *TRI =
       Root.getMF()->getSubtarget().getRegisterInfo();
@@ -7391,7 +7395,7 @@ static bool getGatherPattern(MachineInstr &Root,
   auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
   auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
   SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
-  while (RemainingLanes.begin() != RemainingLanes.end() &&
+  while (!RemainingLanes.empty() && CurrInstr &&
          CurrInstr->getOpcode() == LoadLaneOpCode &&
          MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
          CurrInstr->getNumOperands() == 4) {
@@ -7418,13 +7422,13 @@ static bool getGatherPattern(MachineInstr &Root,
 
   switch (NumLanes) {
   case 4:
-    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
     break;
   case 8:
-    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
     break;
   case 16:
-    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
     break;
   default:
     llvm_unreachable("Got bad number of lanes for gather pattern.");
@@ -7434,8 +7438,8 @@ static bool getGatherPattern(MachineInstr &Root,
 }
 
 /// Search for patterns where we use LD1 instructions to load into
-/// separate lanes of a 128 bit Neon register. We can increase MLP
-/// by loading into 2 Neon registers instead.
+/// separate lanes of a 128 bit Neon register. We can increase Memory Level
+/// Parallelism by loading into 2 Neon registers instead.
 static bool getLoadPatterns(MachineInstr &Root,
                             SmallVectorImpl<unsigned> &Patterns) {
 
@@ -7604,9 +7608,9 @@ AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
-  case AArch64MachineCombinerPattern::GATHER_i32:
-  case AArch64MachineCombinerPattern::GATHER_i16:
-  case AArch64MachineCombinerPattern::GATHER_i8:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -8895,17 +8899,17 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
     break;
   }
-  case AArch64MachineCombinerPattern::GATHER_i32: {
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
     generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
                           Pattern, 4);
     break;
   }
-  case AArch64MachineCombinerPattern::GATHER_i16: {
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
     generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
                           Pattern, 8);
     break;
   }
-  case AArch64MachineCombinerPattern::GATHER_i8: {
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
     generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
                           Pattern, 16);
     break;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 3850e2cfecf4e..02734866e7122 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -173,9 +173,9 @@ enum AArch64MachineCombinerPattern : unsigned {
 
   FNMADD,
 
-  GATHER_i32,
-  GATHER_i16,
-  GATHER_i8
+  GATHER_LANE_i32,
+  GATHER_LANE_i16,
+  GATHER_LANE_i8
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;

>From 8f68890cc312f28d3aa54350c62781bd603ebaac Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 13 Jul 2025 15:02:07 +0300
Subject: [PATCH 7/7] Fix unit tests

---
 llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index 04cc9c4a7cfbf..f663e215cef16 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
 
 ---
 name:            split_loads_to_fpr128


