[llvm] [AArch64][Machine-Combiner] Split loads into lanes of neon vectors into multiple vectors when possible (PR #142941)
Jonathan Cohen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 9 05:36:56 PDT 2025
https://github.com/jcohen-apple updated https://github.com/llvm/llvm-project/pull/142941
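For context, a rough source-level picture of the pattern this series targets
is shown below. The function is illustrative only -- its name, types, and
exact lowering are assumptions, not taken from the PR -- but it is the kind
of code that ends up as a serial chain of LD1i32 lane loads, where each load
must wait for the previously updated vector value:

    #include <arm_neon.h>

    // Hypothetical example: four scalar loads inserted into successive
    // lanes of a single 128-bit Neon register. Each lane insert reads the
    // vector produced by the previous one, so the loads cannot overlap.
    int32x4_t gather4(const int32_t *a, const int32_t *b,
                      const int32_t *c, const int32_t *d) {
      int32x4_t v = vdupq_n_s32(0);
      v = vld1q_lane_s32(a, v, 0); // lane 0
      v = vld1q_lane_s32(b, v, 1); // lane 1, depends on the lane 0 insert
      v = vld1q_lane_s32(c, v, 2); // lane 2, depends on the lane 1 insert
      v = vld1q_lane_s32(d, v, 3); // lane 3, depends on the lane 2 insert
      return v;
    }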
>From e5f3c18ffa91506bb43c9aeb8c45681d20bc1d42 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 1 Jun 2025 11:10:48 +0300
Subject: [PATCH 1/3] Initial unit test to demonstrate current behavior
---
.../AArch64/aarch64-combine-split-loads.mir | 34 +++++++++++++++++++
1 file changed, 34 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
new file mode 100644
index 0000000000000..3188a9d556dc9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+
+---
+name: split_loads_to_fpr128
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+ ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
+ ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
+ ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+ %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+ %7:fpr128 = LD1i32 %6, 1, killed %2
+ %8:fpr128 = LD1i32 %7, 2, killed %3
+ %9:fpr128 = LD1i32 %8, 3, killed %4
+ $q0 = COPY %9
+ RET_ReallyLR implicit $q0
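The test above captures the current behavior: all four lane loads feed one
fpr128 register, so they execute back to back. The rewrite introduced in the
next patch splits the lanes across two registers and recombines the halves
with a ZIP1 on the 64-bit lanes. A source-level sketch of that shape follows;
the lane assignment and intrinsics are illustrative and may not match the
exact register/lane mapping the patch emits:

    #include <arm_neon.h>

    // Sketch of the split form: two independent two-deep lane-load chains,
    // merged by zipping the low 64-bit halves of the two registers.
    int32x4_t gather4_split(const int32_t *a, const int32_t *b,
                            const int32_t *c, const int32_t *d) {
      int32x4_t lo = vdupq_n_s32(0);
      int32x4_t hi = vdupq_n_s32(0);
      lo = vld1q_lane_s32(a, lo, 0); // chain 1, lane 0
      lo = vld1q_lane_s32(b, lo, 1); // chain 1, lane 1
      hi = vld1q_lane_s32(c, hi, 0); // chain 2, lane 0
      hi = vld1q_lane_s32(d, hi, 1); // chain 2, lane 1
      // ZIP1 on 64-bit lanes: result = { lo.d[0], hi.d[0] } = { a, b, c, d }.
      int64x2_t z = vzip1q_s64(vreinterpretq_s64_s32(lo),
                               vreinterpretq_s64_s32(hi));
      return vreinterpretq_s32_s64(z);
    }

The two chains have no dependency on each other, so the loads can issue in
parallel and only the final zip joins them.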
>From ea2546a253dafb2c0cb6da84aa995518d80a1e67 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Wed, 4 Jun 2025 09:39:50 +0300
Subject: [PATCH 2/3] Apply pattern to basic case of 4 i32 loads into fpr128
register
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 128 ++++++++++++++++++
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 2 +
.../AArch64/aarch64-combine-split-loads.mir | 16 ++-
3 files changed, 139 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index a229b71b4b6e7..2dee628ad6286 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7317,11 +7319,63 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Search for patterns where we use LD1i32 instructions to load into
+/// 4 separate lanes of a 128-bit Neon register. We can increase ILP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+ const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+ const TargetRegisterInfo *TRI =
+ Root.getMF()->getSubtarget().getRegisterInfo();
+ // Enable this only on Darwin targets, where it should be profitable. Other
+ // targets can remove this check if the transform is profitable there as well.
+ if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+ return false;
+
+ // The pattern searches for loads into single lanes.
+ if (Root.getOpcode() != AArch64::LD1i32)
+ return false;
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != 3)
+ return false;
+
+ // Check that we have loads into all lanes except lane 0.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ SmallSet<unsigned, 4> RemainingLanes({1, 2});
+ while (RemainingLanes.begin() != RemainingLanes.end() &&
+ Root.getOpcode() == AArch64::LD1i32 &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the subreg to reg loads an i32 into the first lane.
+ auto Lane0Load = CurrInstr->getOperand(2).getReg();
+ if (TRI->getRegSizeInBits(Lane0Load, MRI) != 32)
+ return false;
+
+ // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0Load))
+ return false;
+
+ Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+ return true;
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::SPLIT_LD:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7351,6 +7405,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8681,6 +8739,76 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::SPLIT_LD: {
+ // Gather the initial load instructions; we will use them later to build
+ // the pattern.
+ MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ MachineInstr *Lane1Load =
+ MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
+ MachineInstr *SubregToReg =
+ MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
+ MachineInstr *Lane0Load =
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ // Some helpful lambdas to increase code reuse.
+ auto CreateImplicitDef = [&]() {
+ auto VirtReg = MRI.createVirtualRegister(FPR128RegClass);
+ auto DefInstr = BuildMI(MF, MIMetadata(Root),
+ TII->get(TargetOpcode::IMPLICIT_DEF), VirtReg);
+ InstrIdxForVirtReg.insert(std::make_pair(VirtReg, InsInstrs.size()));
+ InsInstrs.push_back(DefInstr);
+ return VirtReg;
+ };
+ auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+ // To rewrite the pattern, we first need to define new registers to
+ // load our results into.
+ Register ImplicitDefForReg0 = CreateImplicitDef();
+ Register ImplicitDefForReg1 = CreateImplicitDef();
+
+ // Load index 0 into register 0 lane 0.
+ Register Index0LoadReg = LoadLaneToRegister(
+ Lane0Load, ImplicitDefForReg0, 0, Lane0Load->getOperand(2).getReg());
+ DelInstrs.push_back(Lane0Load);
+ DelInstrs.push_back(SubregToReg);
+
+ // Load index 1 into register 1 lane 0.
+ Register Index1LoadReg = LoadLaneToRegister(
+ Lane1Load, ImplicitDefForReg1, 0, Lane1Load->getOperand(3).getReg());
+ DelInstrs.push_back(Lane1Load);
+
+ // Load index 2 into register 0 lane 1.
+ auto Index2LoadReg = LoadLaneToRegister(Lane2Load, Index0LoadReg, 1,
+ Lane2Load->getOperand(3).getReg());
+ DelInstrs.push_back(Lane2Load);
+
+ // Load index 3 into register 1 lane 1.
+ auto Index3LoadReg = LoadLaneToRegister(&Root, Index1LoadReg, 1,
+ Root.getOperand(3).getReg());
+ // Root will be deleted after this pattern is applied.
+
+ // Create the zip instruction.
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(Index2LoadReg)
+ .addReg(Index3LoadReg);
+ InsInstrs.push_back(ZipInstr);
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 0ffaca9af4006..65c8e4fb46e93 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ SPLIT_LD,
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index 3188a9d556dc9..f1dbeb396c5dd 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -8,17 +8,19 @@ body: |
liveins: $x0, $x1, $x2, $x3, $x4
; CHECK-LABEL: name: split_loads_to_fpr128
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
- ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
- ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
- ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
- ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
- ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[LD0_0:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 0, killed [[COPY1]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 0, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[LD0_0]], 1, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1_0]], 1, killed [[COPY4]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:gpr64common = COPY $x0
%1:gpr64common = COPY $x1
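The new SPLIT_LD entry in getCombinerObjective() asks the machine combiner to
accept the alternative sequence only when it shortens the critical path
(MustReduceDepth). Intuitively, the serial form is four dependent lane loads
deep, while the split form is two lane loads plus the final zip. A toy depth
calculation under assumed unit latencies (placeholder numbers, not values
from any real AArch64 scheduling model) makes the comparison concrete:

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumed latencies, purely illustrative.
      const int LaneLoadLat = 1, ZipLat = 1;

      // Serial form: lane0 -> lane1 -> lane2 -> lane3 on one register.
      int SerialDepth = 4 * LaneLoadLat;

      // Split form: two independent chains of two lane loads, joined by a zip.
      int SplitDepth = std::max(2 * LaneLoadLat, 2 * LaneLoadLat) + ZipLat;

      std::printf("serial=%d split=%d\n", SerialDepth, SplitDepth); // 4 vs 3
      return 0;
    }

With larger load latencies the gap widens further, since every link in the
serial chain pays the full load-to-use latency.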
>From 150988f9f8d5c28342eaaff12ba4acddfbc3de6d Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Mon, 9 Jun 2025 15:36:03 +0300
Subject: [PATCH 3/3] fixup! Apply pattern to basic case of 4 i32 loads into
fpr128 register
Fix the condition used when checking for the pattern, and add a negative test where the pattern is not applied.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2 +-
.../AArch64/aarch64-combine-split-loads.mir | 16 ++++++++++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 2dee628ad6286..97f76506425ab 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7344,7 +7344,7 @@ static bool getLoadPatterns(MachineInstr &Root,
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
SmallSet<unsigned, 4> RemainingLanes({1, 2});
while (RemainingLanes.begin() != RemainingLanes.end() &&
- Root.getOpcode() == AArch64::LD1i32 &&
+ CurrInstr->getOpcode() == AArch64::LD1i32 &&
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg())) {
RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index f1dbeb396c5dd..72a3ed669ba5c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -34,3 +34,19 @@ body: |
%9:fpr128 = LD1i32 %8, 3, killed %4
$q0 = COPY %9
RET_ReallyLR implicit $q0
+
+---
+name: negative_pattern
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: negative_pattern
+ ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
+ ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
+
+ %0:gpr64common = COPY $x0
+ %1:fpr128 = LDRQui $x1, 0
+ %2:fpr128 = LD1i32 %1, 3, %0
+ $q0 = COPY %2
+ RET_ReallyLR implicit $q0
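The negative test exercises the early-exit paths in getLoadPatterns(): the
vector here is produced by a full 128-bit LDRQui rather than by a scalar load
widened with SUBREG_TO_REG, and only lane 3 is overwritten, so the combine
must not fire. A hypothetical source-level analogue (illustrative only, not
derived from the test):

    #include <arm_neon.h>

    // The vector starts life as a full 16-byte load; a single lane is then
    // overwritten. There is no four-deep lane-load chain to split.
    int32x4_t overwrite_lane3(const int32_t *base, const int32_t *p) {
      int32x4_t v = vld1q_s32(base); // full vector load (LDRQui-like)
      v = vld1q_lane_s32(p, v, 3);   // single lane insert
      return v;
    }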