[llvm] [AArch64][Machine-Combiner] Split loads into lanes of neon vectors into multiple vectors when possible (PR #142941)

Mon Jun 16 04:34:13 PDT 2025

https://github.com/jcohen-apple updated https://github.com/llvm/llvm-project/pull/142941

>From 36fc37860f704a4339191b2ae1695688d8e07b41 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 1 Jun 2025 11:10:48 +0300
Subject: [PATCH 1/3] Initial unit test to demonstrate current behavior

---
 .../AArch64/aarch64-combine-split-loads.mir   | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir

diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
new file mode 100644
index 0000000000000..3188a9d556dc9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+
+---
+name:            split_loads_to_fpr128
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, killed %2
+    %8:fpr128 = LD1i32 %7, 2, killed %3
+    %9:fpr128 = LD1i32 %8, 3, killed %4
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0

>From 69d4011ef8b6b7d9c74769199d6e5820c70b7d53 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Mon, 16 Jun 2025 14:33:10 +0300
Subject: [PATCH 2/3] Update Cyclone scheduling model LD behavior

---
 llvm/lib/Target/AArch64/AArch64SchedCyclone.td | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
index 48324654949c0..19198baace993 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -242,7 +242,7 @@ def : WriteRes<WriteST, [CyUnitLS]> {
 // Rt latency is latency WriteIS + WriteLD.
 // EXAMPLE: LDR Xn, Xm [, lsl 3]
 def CyWriteLDIdx : SchedWriteVariant<[
-  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
+  SchedVar<ScaledIdxPred, [WriteLD]>,          // Load from scaled register.
   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
 
@@ -635,7 +635,7 @@ def : WriteRes<WriteVST, [CyUnitLS]> {
 // same latency, this is acceptable.
 
 // Vd is read 5 cycles after issuing the vector load.
-def : ReadAdvance<ReadVLD, 5>;
+def : ReadAdvance<ReadVLD, 4>;
 
 def : InstRW<[WriteVLD],
              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
@@ -670,13 +670,10 @@ def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
 
-def : InstRW<[WriteVLDShuffle, ReadVLD],
-             (instregex "LD1i(8|16|32)$")>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
-             (instregex "LD1i(8|16|32)_POST")>;
-
-def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
-def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
+def : InstRW<[ReadVLD],
+             (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[ReadVLD],
+             (instregex "LD1i(8|16|32|64)_POST")>;
 
 def : InstRW<[WriteVLDShuffle],
              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;

>From ecceba8979a8600241bb1c32d05a9fd9e2822407 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Wed, 4 Jun 2025 09:39:50 +0300
Subject: [PATCH 3/3] Apply pattern to basic case of 4 i64 loads into fpr128
 register

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 118 ++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   2 +
 .../AArch64/aarch64-combine-split-loads.mir   |  34 +++--
 3 files changed, 146 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..0843432e3b8ef 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -7317,11 +7319,63 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
+/// Search for patterns where we use LD1i32 instructions to load into
+/// 4 separate lanes of a 128 bit Neon register. We can increase ILP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+  const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+  const TargetRegisterInfo *TRI =
+      Root.getMF()->getSubtarget().getRegisterInfo();
+  // Enable this only on Darwin targets, where it should be profitable. Other
+  // targets can remove this check if it is profitable there as well.
+  if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+    return false;
+
+  // The pattern searches for loads into single lanes.
+  if (Root.getOpcode() != AArch64::LD1i32)
+    return false;
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != 3)
+    return false;
+
+  // Check that we have load into all lanes except lane 0.
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  SmallSet<unsigned, 4> RemainingLanes({1, 2});
+  while (RemainingLanes.begin() != RemainingLanes.end() &&
+         CurrInstr->getOpcode() == AArch64::LD1i32 &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence.
+  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the subreg to reg loads an i32 into the first lane.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
+    return false;
+
+  // Verify that it also has a single non debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+  return true;
+}
+
 CombinerObjective
 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
+  case AArch64MachineCombinerPattern::SPLIT_LD:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7351,6 +7405,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
   if (getMiscPatterns(Root, Patterns))
     return true;
 
+  // Load patterns
+  if (getLoadPatterns(Root, Patterns))
+    return true;
+
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                      DoRegPressureReduce);
 }
@@ -8681,6 +8739,66 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
     break;
   }
+  case AArch64MachineCombinerPattern::SPLIT_LD: {
+    // Gather the initial load instructions, we will use them later to build the
+    // pattern.
+    MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+    MachineInstr *Lane1Load =
+        MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
+    MachineInstr *SubregToReg =
+        MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
+    const TargetRegisterClass *FPR128RegClass =
+        MRI.getRegClass(Root.getOperand(0).getReg());
+
+    auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+                                  Register SrcRegister, unsigned Lane,
+                                  Register OffsetRegister) {
+      auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+      MachineInstrBuilder LoadIndexIntoRegister =
+          BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+                  NewRegister)
+              .addReg(SrcRegister)
+              .addImm(Lane)
+              .addReg(OffsetRegister, getKillRegState(true));
+      InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+      InsInstrs.push_back(LoadIndexIntoRegister);
+      return NewRegister;
+    };
+
+    // To rewrite the pattern, we first need define a new register to
+    // load our results into.
+    auto ImplicitDefForReg1 = MRI.createVirtualRegister(FPR128RegClass);
+    auto DefInstr =
+        BuildMI(MF, MIMetadata(Root), TII->get(TargetOpcode::IMPLICIT_DEF),
+                ImplicitDefForReg1);
+    InstrIdxForVirtReg.insert(
+        std::make_pair(ImplicitDefForReg1, InsInstrs.size()));
+    InsInstrs.push_back(DefInstr);
+
+    // Load index 1 into register 1 lane 0.
+    Register Index1LoadReg = LoadLaneToRegister(
+        Lane1Load, ImplicitDefForReg1, 0, Lane1Load->getOperand(3).getReg());
+    DelInstrs.push_back(Lane1Load);
+
+    // Load index 2 into register 0 lane 1.
+    auto Index2LoadReg =
+        LoadLaneToRegister(Lane2Load, SubregToReg->getOperand(0).getReg(), 1,
+                           Lane2Load->getOperand(3).getReg());
+    DelInstrs.push_back(Lane2Load);
+
+    // Load index 3 into register 1 lane 1.
+    auto Index3LoadReg = LoadLaneToRegister(&Root, Index1LoadReg, 1,
+                                            Root.getOperand(3).getReg());
+    // Root will be deleted after this pattern is applied
+
+    // Create the zip instruction.
+    MachineInstrBuilder ZipInstr =
+        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+                Root.getOperand(0).getReg())
+            .addReg(Index2LoadReg)
+            .addReg(Index3LoadReg);
+    InsInstrs.push_back(ZipInstr);
+  }
 
   } // end switch (Pattern)
   // Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4b..c45e8e0a43a2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
   FMULv8i16_indexed_OP2,
 
   FNMADD,
+
+  SPLIT_LD,
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index 3188a9d556dc9..4de149fdc55d6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64e-apple-darwin %s -o - | FileCheck %s
 
 ---
 name:            split_loads_to_fpr128
@@ -8,17 +8,19 @@ body:             |
     liveins: $x0, $x1, $x2, $x3, $x4
 
     ; CHECK-LABEL: name: split_loads_to_fpr128
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
     ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
-    ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
-    ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
-    ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
-    ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
-    ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
-    ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]]
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 0, killed [[COPY2]] 
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1_0]], 1, killed [[COPY4]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:gpr64common = COPY $x0
     %1:gpr64common = COPY $x1
@@ -32,3 +34,19 @@ body:             |
     %9:fpr128 = LD1i32 %8, 3, killed %4
     $q0 = COPY %9
     RET_ReallyLR implicit $q0
+
+---
+name:            negative_pattern
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: negative_pattern
+    ; CHECK:      [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
+    ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
+  
+    %0:gpr64common = COPY $x0
+    %1:fpr128 = LDRQui $x1, 0
+    %2:fpr128 = LD1i32 %1, 3, %0
+    $q0 = COPY %2
+    RET_ReallyLR implicit $q0