[llvm] [AArch64][Machine-Combiner] Split loads into lanes of neon vectors into multiple vectors when possible (PR #142941)
Jonathan Cohen via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 13 05:56:58 PDT 2025
https://github.com/jcohen-apple updated https://github.com/llvm/llvm-project/pull/142941
>From 50209a0ec70ee389abc5f4609668a775226da483 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 1 Jun 2025 11:10:48 +0300
Subject: [PATCH 1/8] Initial unit test to demonstrate current behavior
---
.../AArch64/aarch64-combine-split-loads.mir | 34 +++++++++++++++++++
1 file changed, 34 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
new file mode 100644
index 0000000000000..3188a9d556dc9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+
+---
+name: split_loads_to_fpr128
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+ ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
+ ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
+ ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+ %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+ %7:fpr128 = LD1i32 %6, 1, killed %2
+ %8:fpr128 = LD1i32 %7, 2, killed %3
+ %9:fpr128 = LD1i32 %8, 3, killed %4
+ $q0 = COPY %9
+ RET_ReallyLR implicit $q0
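For context on why this baseline is worth improving: each LD1i32 in the
sequence above both loads from memory and consumes the vector produced by the
previous instruction, so the four lane loads form one serial dependency chain.
A back-of-the-envelope sketch of the depths involved, assuming a uniform
4-cycle load latency and a 3-cycle zip latency (illustrative numbers only, not
taken from any scheduling model):

    #include <cstdio>

    int main() {
      const int LoadLatency = 4, ZipLatency = 3, NumLanes = 4;
      // Current codegen: the lane-0 load plus NumLanes-1 chained LD1s; the
      // latencies add up because every LD1 waits for its predecessor.
      int SerialDepth = LoadLatency * NumLanes;
      // Proposed codegen: two independent chains of NumLanes/2 loads run in
      // parallel and a single ZIP1 joins them at the end.
      int SplitDepth = LoadLatency * (NumLanes / 2) + ZipLatency;
      std::printf("serial: %d cycles, split: %d cycles\n", SerialDepth,
                  SplitDepth);
    }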
>From 01933f68dcfb8892d92644552887b3d9896f34fa Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Wed, 4 Jun 2025 09:39:50 +0300
Subject: [PATCH 2/8] Apply pattern to basic case of 4 i32 loads into fpr128
register
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 137 ++++++++++++++++++
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 2 +
.../AArch64/aarch64-combine-split-loads.mir | 68 ++++++++-
3 files changed, 199 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c1474773faa76..7a2623320f53e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -7327,6 +7329,7 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+ case AArch64MachineCombinerPattern::SPLIT_LD:
return true;
} // end switch (Pattern)
return false;
@@ -7367,11 +7370,64 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Search for patterns where we use LD1i32 instructions to load into
+/// 4 separate lanes of a 128-bit Neon register. We can increase ILP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+ const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+ const TargetRegisterInfo *TRI =
+ Root.getMF()->getSubtarget().getRegisterInfo();
+ // Enable this only on Darwin targets, where it should be profitable. Other
+ // targets can remove this check if it is profitable there as well.
+ if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+ return false;
+
+ // The pattern searches for loads into single lanes.
+ if (Root.getOpcode() != AArch64::LD1i32)
+ return false;
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != 3)
+ return false;
+
+ // Check that we have loads into all lanes except lane 0.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ SmallSet<unsigned, 4> RemainingLanes({1, 2});
+ while (RemainingLanes.begin() != RemainingLanes.end() &&
+ CurrInstr->getOpcode() == AArch64::LD1i32 &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+ CurrInstr->getNumOperands() == 4) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the subreg to reg loads an i32 into the first lane.
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
+ return false;
+
+ // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+ return false;
+
+ Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+ return true;
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::SPLIT_LD:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7401,6 +7457,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8656,6 +8716,83 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::SPLIT_LD: {
+ // Gather the initial load instructions to build the pattern
+ MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ MachineInstr *Lane1Load =
+ MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
+ MachineInstr *SubregToReg =
+ MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
+ MachineInstr *Lane0Load =
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
+
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+
+ // Helper to create load instruction based on opcode
+ auto CreateLoadInstruction = [&](unsigned Opcode, Register DestReg,
+ Register OffsetReg) -> MachineInstrBuilder {
+ return BuildMI(MF, MIMetadata(Root), TII->get(AArch64::LDRSui), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0); // immediate offset
+ };
+
+ // Load index 1 into register 0 lane 1
+ Register Index1LoadReg =
+ LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1,
+ Lane1Load->getOperand(3).getReg());
+ DelInstrs.push_back(Lane1Load);
+
+ // Load index 2 into register 1 lane 0
+ auto DestRegForIndex2 = MRI.createVirtualRegister(
+ MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+ MachineInstrBuilder Index2LoadInstr = CreateLoadInstruction(
+ Lane0Load->getOpcode(), DestRegForIndex2,
+ Lane2Load->getOperand(3).getReg());
+
+ InstrIdxForVirtReg.insert(std::make_pair(DestRegForIndex2, InsInstrs.size()));
+ InsInstrs.push_back(Index2LoadInstr);
+ DelInstrs.push_back(Lane2Load);
+
+ // Convert fpr32 to fpr128 using subreg
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+ auto SubRegToRegInstr = BuildMI(MF, MIMetadata(Root),
+ TII->get(SubregToReg->getOpcode()),
+ DestRegForSubregToReg)
+ .addImm(0)
+ .addReg(DestRegForIndex2, getKillRegState(true))
+ .addImm(AArch64::ssub);
+ InstrIdxForVirtReg.insert(std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+ InsInstrs.push_back(SubRegToRegInstr);
+
+ // Load index 3 into register 1 lane 1
+ auto Index3LoadReg = LoadLaneToRegister(&Root, DestRegForSubregToReg, 1,
+ Root.getOperand(3).getReg());
+
+ // Create the final zip instruction to combine the results
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(Index1LoadReg)
+ .addReg(Index3LoadReg);
+ InsInstrs.push_back(ZipInstr);
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4b..c45e8e0a43a2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ SPLIT_LD,
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index 3188a9d556dc9..a9c23d0100d35 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-macos-darwin %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64e-apple-darwin -verify-machineinstrs %s -o - | FileCheck %s
---
name: split_loads_to_fpr128
@@ -8,17 +8,19 @@ body: |
liveins: $x0, $x1, $x2, $x3, $x4
; CHECK-LABEL: name: split_loads_to_fpr128
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[LDRSroX:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
- ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LDRSroX]], %subreg.ssub
- ; CHECK-NEXT: [[LD1i32_:%[0-9]+]]:fpr128 = LD1i32 [[SUBREG_TO_REG]], 1, killed [[COPY2]]
- ; CHECK-NEXT: [[LD1i32_1:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_]], 2, killed [[COPY3]]
- ; CHECK-NEXT: [[LD1i32_2:%[0-9]+]]:fpr128 = LD1i32 [[LD1i32_1]], 3, killed [[COPY4]]
- ; CHECK-NEXT: $q0 = COPY [[LD1i32_2]]
+ ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:gpr64common = COPY $x0
%1:gpr64common = COPY $x1
@@ -32,3 +34,53 @@ body: |
%9:fpr128 = LD1i32 %8, 3, killed %4
$q0 = COPY %9
RET_ReallyLR implicit $q0
+
+---
+name: split_loads_to_fpr128_ui
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128_ui
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:fpr32 = LDRSui %0, 0
+ %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+ %7:fpr128 = LD1i32 %6, 1, killed %1
+ %8:fpr128 = LD1i32 %7, 2, killed %2
+ %9:fpr128 = LD1i32 %8, 3, killed %3
+ $q0 = COPY %9
+ RET_ReallyLR implicit $q0
+
+---
+name: negative_pattern
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: negative_pattern
+ ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
+ ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
+
+ %0:gpr64common = COPY $x0
+ %1:fpr128 = LDRQui $x1, 0
+ %2:fpr128 = LD1i32 %1, 3, %0
+ $q0 = COPY %2
+ RET_ReallyLR implicit $q0
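A note on how the matching in getLoadPatterns works: starting from the root
LD1i32 (which must write lane 3), it walks the use-def chain through operand 1,
erasing each visited lane index from a small set, and only accepts the pattern
if lanes 1 and 2 were each seen exactly once and the chain bottoms out in a
SUBREG_TO_REG of a 32-bit scalar load. A minimal standalone sketch of that
walk, using plain structs in place of MachineInstr and MachineRegisterInfo
(all names here are illustrative, not LLVM API):

    #include <cstdio>
    #include <set>
    #include <vector>

    struct LaneLoad {
      unsigned Lane;  // lane-index operand of the LD1
      int Prev;       // index of the defining instruction, -1 if none
      bool SingleUse; // stands in for MRI.hasOneNonDBGUse(...)
    };

    // True when every lane in [1, NumLanes-1) is written exactly once along
    // the chain feeding the root, mirroring the RemainingLanes loop.
    bool matchesGatherChain(const std::vector<LaneLoad> &Instrs, int Root,
                            unsigned NumLanes) {
      std::set<unsigned> Remaining;
      for (unsigned L = 1; L + 1 < NumLanes; ++L)
        Remaining.insert(L);
      int Curr = Instrs[Root].Prev;
      while (!Remaining.empty() && Curr != -1 && Instrs[Curr].SingleUse) {
        Remaining.erase(Instrs[Curr].Lane);
        Curr = Instrs[Curr].Prev;
      }
      return Remaining.empty();
    }

    int main() {
      // Models lane1 <- lane2 <- root(lane3), as in split_loads_to_fpr128.
      std::vector<LaneLoad> Instrs = {{1, -1, true}, {2, 0, true}, {3, 1, true}};
      std::printf("%s\n", matchesGatherChain(Instrs, 2, 4) ? "match" : "no match");
    }

The negative_pattern test above exercises the bail-out: the vector feeding the
final LD1i32 comes from a full LDRQui rather than a chain of lane loads, so
the set is never emptied and no pattern is reported.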
>From e3f9d7dd2afd6f0efdb04c0f1a136df6d8f8cec9 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 6 Jul 2025 15:09:53 +0300
Subject: [PATCH 3/8] Support additional data types
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 304 ++++++++++++------
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4 +-
.../AArch64/aarch64-combine-split-loads.mir | 184 ++++++++++-
3 files changed, 393 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7a2623320f53e..c00b96152aa7d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7329,7 +7329,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
- case AArch64MachineCombinerPattern::SPLIT_LD:
+ case AArch64MachineCombinerPattern::GATHER_i32:
+ case AArch64MachineCombinerPattern::GATHER_i16:
+ case AArch64MachineCombinerPattern::GATHER_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7370,32 +7372,27 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
-/// Search for patterns where we use LD1i32 instructions to load into
-/// 4 separate lanes of a 128-bit Neon register. We can increase ILP
-/// by loading into 2 Neon registers instead.
-static bool getLoadPatterns(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns) {
+static bool getGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns,
+ unsigned LoadLaneOpCode,
+ unsigned NumLanes) {
const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
const TargetRegisterInfo *TRI =
Root.getMF()->getSubtarget().getRegisterInfo();
- // Enable this only on Darwin targets, where it should be profitable. Other
- // targets can remove this check if it is profitable there as well.
- if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
- return false;
-
- // The pattern searches for loads into single lanes.
- if (Root.getOpcode() != AArch64::LD1i32)
- return false;
// The root of the pattern must load into the last lane of the vector.
- if (Root.getOperand(2).getImm() != 3)
+ if (Root.getOperand(2).getImm() != NumLanes - 1)
return false;
// Check that we have loads into all lanes except lane 0.
+ // For each load we also want to check that:
+ // 1. It has a single non-debug use (since we will be replacing the virtual register)
+ // 2. The addressing mode only uses a single offset register.
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- SmallSet<unsigned, 4> RemainingLanes({1, 2});
+ auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+ SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
while (RemainingLanes.begin() != RemainingLanes.end() &&
- CurrInstr->getOpcode() == AArch64::LD1i32 &&
+ CurrInstr->getOpcode() == LoadLaneOpCode &&
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
CurrInstr->getNumOperands() == 4) {
RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
@@ -7409,25 +7406,202 @@ static bool getLoadPatterns(MachineInstr &Root,
if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
return false;
- // Verify that the subreg to reg loads an i32 into the first lane.
+ // Verify that the subreg to reg loads an integer into the first lane.
auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
- if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != 32)
+ unsigned SingleLaneSizeInBits = 128 / NumLanes;
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
return false;
// Verify that it also has a single non-debug use.
if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
return false;
- Patterns.push_back(AArch64MachineCombinerPattern::SPLIT_LD);
+ switch (NumLanes) {
+ case 4:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+ break;
+ case 8:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+ break;
+ case 16:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+ break;
+ default:
+ llvm_unreachable("Got bad number of lanes for gather pattern.");
+ }
+
return true;
}
+/// Search for patterns where we use LD1 instructions to load into
+/// separate lanes of a 128-bit Neon register. We can increase MLP
+/// by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+ // Enable this only on Darwin targets, where it should be profitable. Other
+ // targets can remove this check if it is profitable there as well.
+ if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
+ return false;
+
+ // The pattern searches for loads into single lanes.
+ switch (Root.getOpcode()) {
+ case AArch64::LD1i32:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+ case AArch64::LD1i16:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+ case AArch64::LD1i8:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+ default:
+ return false;
+ }
+}
+
+static void generateGatherPattern(
+ MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
+ unsigned NumLanes) {
+
+ MachineFunction &MF = *Root.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ // Gather the initial load instructions to build the pattern
+ SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+ MachineInstr *CurrInstr = &Root;
+ for (unsigned i = 0; i < NumLanes - 1; ++i) {
+ LoadToLaneInstrs.push_back(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ MachineInstr *SubregToReg = CurrInstr;
+ LoadToLaneInstrs.push_back(
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+ auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+
+ // Helper to create load instruction based on opcode
+ auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+ Register OffsetReg) -> MachineInstrBuilder {
+ unsigned Opcode;
+ switch (NumLanes) {
+ case 4:
+ Opcode = AArch64::LDRSui;
+ break;
+ case 8:
+ Opcode = AArch64::LDRHui;
+ break;
+ case 16:
+ Opcode = AArch64::LDRBui;
+ break;
+ default:
+ llvm_unreachable("Got unsupported number of lanes in machine-combiner gather pattern");
+ }
+ // Immediate offset load
+ return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0); // immediate offset
+ };
+
+ // Load the remaining lanes into register 0.
+ auto LanesToLoadToReg0 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+ LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+ auto PrevReg = SubregToReg->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+ DelInstrs.push_back(LoadInstr);
+ }
+ auto LastLoadReg0 = PrevReg;
+
+ // First load into register 1. Perform a LDRSui to zero out the upper lanes in a single instruction.
+ auto Lane0Load = *LoadToLaneInstrsAscending.begin();
+ auto OriginalSplitLoad = *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+ auto DestRegForMiddleIndex = MRI.createVirtualRegister(
+ MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+ MachineInstrBuilder MiddleIndexLoadInstr = CreateLoadInstruction(
+ NumLanes, DestRegForMiddleIndex,
+ OriginalSplitLoad->getOperand(3).getReg());
+
+ InstrIdxForVirtReg.insert(std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+ InsInstrs.push_back(MiddleIndexLoadInstr);
+ DelInstrs.push_back(OriginalSplitLoad);
+
+ // Subreg To Reg instruction for register 1.
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+ unsigned SubregType;
+ switch (NumLanes) {
+ case 4:
+ SubregType = AArch64::ssub;
+ break;
+ case 8:
+ SubregType = AArch64::hsub;
+ break;
+ case 16:
+ SubregType = AArch64::bsub;
+ break;
+ default:
+ llvm_unreachable("Got invalid NumLanes for machine-combiner gather pattern");
+ }
+
+ auto SubRegToRegInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+ DestRegForSubregToReg)
+ .addImm(0)
+ .addReg(DestRegForMiddleIndex, getKillRegState(true))
+ .addImm(SubregType);
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+ InsInstrs.push_back(SubRegToRegInstr);
+
+ // Load remaining lanes into register 1.
+ auto LanesToLoadToReg1 = llvm::make_range(
+ LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, LoadToLaneInstrsAscending.end());
+ PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+ if (Index == NumLanes / 2 - 2) {
+ break;
+ }
+ DelInstrs.push_back(LoadInstr);
+ }
+ auto LastLoadReg1 = PrevReg;
+
+ // Create the final zip instruction to combine the results.
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(LastLoadReg0)
+ .addReg(LastLoadReg1);
+ InsInstrs.push_back(ZipInstr);
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
- case AArch64MachineCombinerPattern::SPLIT_LD:
+ case AArch64MachineCombinerPattern::GATHER_i32:
+ case AArch64MachineCombinerPattern::GATHER_i16:
+ case AArch64MachineCombinerPattern::GATHER_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -8716,82 +8890,18 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
- case AArch64MachineCombinerPattern::SPLIT_LD: {
- // Gather the initial load instructions to build the pattern
- MachineInstr *Lane2Load = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- MachineInstr *Lane1Load =
- MRI.getUniqueVRegDef(Lane2Load->getOperand(1).getReg());
- MachineInstr *SubregToReg =
- MRI.getUniqueVRegDef(Lane1Load->getOperand(1).getReg());
- MachineInstr *Lane0Load =
- MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg());
-
- const TargetRegisterClass *FPR128RegClass =
- MRI.getRegClass(Root.getOperand(0).getReg());
-
- auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
- Register SrcRegister, unsigned Lane,
- Register OffsetRegister) {
- auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
- MachineInstrBuilder LoadIndexIntoRegister =
- BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
- NewRegister)
- .addReg(SrcRegister)
- .addImm(Lane)
- .addReg(OffsetRegister, getKillRegState(true));
- InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
- InsInstrs.push_back(LoadIndexIntoRegister);
- return NewRegister;
- };
-
- // Helper to create load instruction based on opcode
- auto CreateLoadInstruction = [&](unsigned Opcode, Register DestReg,
- Register OffsetReg) -> MachineInstrBuilder {
- return BuildMI(MF, MIMetadata(Root), TII->get(AArch64::LDRSui), DestReg)
- .addReg(OffsetReg)
- .addImm(0); // immediate offset
- };
-
- // Load index 1 into register 0 lane 1
- Register Index1LoadReg =
- LoadLaneToRegister(Lane1Load, SubregToReg->getOperand(0).getReg(), 1,
- Lane1Load->getOperand(3).getReg());
- DelInstrs.push_back(Lane1Load);
-
- // Load index 2 into register 1 lane 0
- auto DestRegForIndex2 = MRI.createVirtualRegister(
- MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
- MachineInstrBuilder Index2LoadInstr = CreateLoadInstruction(
- Lane0Load->getOpcode(), DestRegForIndex2,
- Lane2Load->getOperand(3).getReg());
-
- InstrIdxForVirtReg.insert(std::make_pair(DestRegForIndex2, InsInstrs.size()));
- InsInstrs.push_back(Index2LoadInstr);
- DelInstrs.push_back(Lane2Load);
-
- // Convert fpr32 to fpr128 using subreg
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
- auto SubRegToRegInstr = BuildMI(MF, MIMetadata(Root),
- TII->get(SubregToReg->getOpcode()),
- DestRegForSubregToReg)
- .addImm(0)
- .addReg(DestRegForIndex2, getKillRegState(true))
- .addImm(AArch64::ssub);
- InstrIdxForVirtReg.insert(std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
- InsInstrs.push_back(SubRegToRegInstr);
-
- // Load index 3 into register 1 lane 1
- auto Index3LoadReg = LoadLaneToRegister(&Root, DestRegForSubregToReg, 1,
- Root.getOperand(3).getReg());
-
- // Create the final zip instruction to combine the results
- MachineInstrBuilder ZipInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
- Root.getOperand(0).getReg())
- .addReg(Index1LoadReg)
- .addReg(Index3LoadReg);
- InsInstrs.push_back(ZipInstr);
+ case AArch64MachineCombinerPattern::GATHER_i32: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 4);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_i16: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 8);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_i8: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 16);
+ break;
}
} // end switch (Pattern)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index c45e8e0a43a2e..3850e2cfecf4e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -173,7 +173,9 @@ enum AArch64MachineCombinerPattern : unsigned {
FNMADD,
- SPLIT_LD,
+ GATHER_i32,
+ GATHER_i16,
+ GATHER_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index a9c23d0100d35..04cc9c4a7cfbf 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mtriple=arm64e-apple-darwin -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -verify-machineinstrs %s -o - | FileCheck %s
---
name: split_loads_to_fpr128
@@ -69,6 +69,188 @@ body: |
$q0 = COPY %9
RET_ReallyLR implicit $q0
+---
+name: split_loads_to_fpr128_i16
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128_i16
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+ ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]]
+ ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]]
+ ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:gpr64common = COPY $x5
+ %6:gpr64common = COPY $x6
+ %7:gpr64common = COPY $x7
+ %8:gpr64common = COPY $x8
+ %9:fpr16 = LDRHroX %0, killed %1, 0, 1
+ %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
+ %11:fpr128 = LD1i16 %10, 1, killed %2
+ %12:fpr128 = LD1i16 %11, 2, killed %3
+ %13:fpr128 = LD1i16 %12, 3, killed %4
+ %14:fpr128 = LD1i16 %13, 4, killed %5
+ %15:fpr128 = LD1i16 %14, 5, killed %6
+ %16:fpr128 = LD1i16 %15, 6, killed %7
+ %17:fpr128 = LD1i16 %16, 7, killed %8
+ $q0 = COPY %17
+ RET_ReallyLR implicit $q0
+
+---
+name: split_loads_to_fpr128_i16_ui
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+ ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]]
+ ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]]
+ ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]]
+ ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:gpr64common = COPY $x5
+ %6:gpr64common = COPY $x6
+ %7:gpr64common = COPY $x7
+ %8:gpr64common = COPY $x8
+ %9:fpr16 = LDRHui %0, 0
+ %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
+ %11:fpr128 = LD1i16 %10, 1, killed %1
+ %12:fpr128 = LD1i16 %11, 2, killed %2
+ %13:fpr128 = LD1i16 %12, 3, killed %3
+ %14:fpr128 = LD1i16 %13, 4, killed %4
+ %15:fpr128 = LD1i16 %14, 5, killed %5
+ %16:fpr128 = LD1i16 %15, 6, killed %6
+ %17:fpr128 = LD1i16 %16, 7, killed %7
+ $q0 = COPY %17
+ RET_ReallyLR implicit $q0
+
+---
+name: split_loads_to_fpr128_i8
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128_i8
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16
+ ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]]
+ ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]]
+ ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]]
+ ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]]
+ ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]]
+ ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]]
+ ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]]
+ ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]]
+ ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]]
+ ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]]
+ ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]]
+ ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]]
+ ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:gpr64common = COPY $x5
+ %6:gpr64common = COPY $x6
+ %7:gpr64common = COPY $x7
+ %8:gpr64common = COPY $x8
+ %9:gpr64common = COPY $x9
+ %10:gpr64common = COPY $x10
+ %11:gpr64common = COPY $x11
+ %12:gpr64common = COPY $x12
+ %13:gpr64common = COPY $x13
+ %14:gpr64common = COPY $x14
+ %15:gpr64common = COPY $x15
+ %16:gpr64common = COPY $x16
+ %17:fpr8 = LDRBroX %0, killed %1, 0, 0
+ %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub
+ %19:fpr128 = LD1i8 %18, 1, killed %2
+ %20:fpr128 = LD1i8 %19, 2, killed %3
+ %21:fpr128 = LD1i8 %20, 3, killed %4
+ %22:fpr128 = LD1i8 %21, 4, killed %5
+ %23:fpr128 = LD1i8 %22, 5, killed %6
+ %24:fpr128 = LD1i8 %23, 6, killed %7
+ %25:fpr128 = LD1i8 %24, 7, killed %8
+ %26:fpr128 = LD1i8 %25, 8, killed %9
+ %27:fpr128 = LD1i8 %26, 9, killed %10
+ %28:fpr128 = LD1i8 %27, 10, killed %11
+ %29:fpr128 = LD1i8 %28, 11, killed %12
+ %30:fpr128 = LD1i8 %29, 12, killed %13
+ %31:fpr128 = LD1i8 %30, 13, killed %14
+ %32:fpr128 = LD1i8 %31, 14, killed %15
+ %33:fpr128 = LD1i8 %32, 15, killed %16
+ $q0 = COPY %33
+ RET_ReallyLR implicit $q0
+
---
name: negative_pattern
body: |
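The generalized rewrite keeps the same shape for all three element widths: the
first NumLanes/2 lanes stay in the original register chain, the upper half is
rebuilt in a second register starting from a plain scalar load
(LDRSui/LDRHui/LDRBui) plus SUBREG_TO_REG, and ZIP1v2i64 concatenates the two
half-filled registers. The lane bookkeeping reduces to simple index
arithmetic, sketched below (a standalone illustration, not code from the
patch):

    #include <cstdio>

    int main() {
      for (unsigned NumLanes : {4u, 8u, 16u}) {
        unsigned Half = NumLanes / 2; // lanes held by each temporary register
        std::printf("NumLanes=%u:\n", NumLanes);
        // Original lane k lands in register k / Half at lane k % Half; the
        // final ZIP1v2i64 places register 0's low 64 bits before register 1's.
        for (unsigned Lane = 0; Lane < NumLanes; ++Lane)
          std::printf("  lane %2u -> reg %u, lane %u\n", Lane, Lane / Half,
                      Lane % Half);
      }
    }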
>From 6f301b8223640c144976f97e515838fb453da535 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Tue, 8 Jul 2025 22:06:31 +0300
Subject: [PATCH 4/8] Remove check for isOsDarwin()
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c00b96152aa7d..72e5ff6010f07 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7438,10 +7438,6 @@ static bool getGatherPattern(MachineInstr &Root,
/// by loading into 2 Neon registers instead.
static bool getLoadPatterns(MachineInstr &Root,
SmallVectorImpl<unsigned> &Patterns) {
- // Enable this only on Darwin targets, where it should be profitable. Other
- // targets can remove this check if it is profitable there as well.
- if (!Root.getMF()->getTarget().getTargetTriple().isOSDarwin())
- return false;
// The pattern searches for loads into single lanes.
switch (Root.getOpcode()) {
>From d92972e8b21da3616cfb0cadd848e866f8b3320c Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Tue, 8 Jul 2025 22:07:48 +0300
Subject: [PATCH 5/8] Formatting changes
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 161 ++++++++++---------
1 file changed, 86 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 72e5ff6010f07..9cf17d03f7288 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7373,9 +7373,8 @@ static bool getMiscPatterns(MachineInstr &Root,
}
static bool getGatherPattern(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns,
- unsigned LoadLaneOpCode,
- unsigned NumLanes) {
+ SmallVectorImpl<unsigned> &Patterns,
+ unsigned LoadLaneOpCode, unsigned NumLanes) {
const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
const TargetRegisterInfo *TRI =
Root.getMF()->getSubtarget().getRegisterInfo();
@@ -7386,7 +7385,8 @@ static bool getGatherPattern(MachineInstr &Root,
// Check that we have loads into all lanes except lane 0.
// For each load we also want to check that:
- // 1. It has a single non-debug use (since we will be replacing the virtual register)
+ // 1. It has a single non-debug use (since we will be replacing the virtual
+ // register)
// 2. The addressing mode only uses a single offset register.
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
@@ -7417,17 +7417,17 @@ static bool getGatherPattern(MachineInstr &Root,
return false;
switch (NumLanes) {
- case 4:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
- break;
- case 8:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
- break;
- case 16:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
- break;
- default:
- llvm_unreachable("Got bad number of lanes for gather pattern.");
+ case 4:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+ break;
+ case 8:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+ break;
+ case 16:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+ break;
+ default:
+ llvm_unreachable("Got bad number of lanes for gather pattern.");
}
return true;
@@ -7441,23 +7441,24 @@ static bool getLoadPatterns(MachineInstr &Root,
// The pattern searches for loads into single lanes.
switch (Root.getOpcode()) {
- case AArch64::LD1i32:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
- case AArch64::LD1i16:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
- case AArch64::LD1i8:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
- default:
- return false;
+ case AArch64::LD1i32:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+ case AArch64::LD1i16:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+ case AArch64::LD1i8:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+ default:
+ return false;
}
}
-static void generateGatherPattern(
- MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
- unsigned NumLanes) {
-
+static void
+generateGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+ unsigned Pattern, unsigned NumLanes) {
+
MachineFunction &MF = *Root.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
@@ -7469,7 +7470,7 @@ static void generateGatherPattern(
LoadToLaneInstrs.push_back(CurrInstr);
CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
}
-
+
MachineInstr *SubregToReg = CurrInstr;
LoadToLaneInstrs.push_back(
MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
@@ -7494,26 +7495,27 @@ static void generateGatherPattern(
};
// Helper to create load instruction based on opcode
- auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
- Register OffsetReg) -> MachineInstrBuilder {
- unsigned Opcode;
- switch (NumLanes) {
- case 4:
- Opcode = AArch64::LDRSui;
- break;
- case 8:
- Opcode = AArch64::LDRHui;
- break;
- case 16:
- Opcode = AArch64::LDRBui;
- break;
- default:
- llvm_unreachable("Got unsupported number of lanes in machine-combiner gather pattern");
- }
- // Immediate offset load
- return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
- .addReg(OffsetReg)
- .addImm(0); // immediate offset
+ auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+ Register OffsetReg) -> MachineInstrBuilder {
+ unsigned Opcode;
+ switch (NumLanes) {
+ case 4:
+ Opcode = AArch64::LDRSui;
+ break;
+ case 8:
+ Opcode = AArch64::LDRHui;
+ break;
+ case 16:
+ Opcode = AArch64::LDRBui;
+ break;
+ default:
+ llvm_unreachable(
+ "Got unsupported number of lanes in machine-combiner gather pattern");
+ }
+ // Immediate offset load
+ return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0); // immediate offset
};
// Load the remaining lanes into register 0.
@@ -7522,22 +7524,26 @@ static void generateGatherPattern(
LoadToLaneInstrsAscending.begin() + NumLanes / 2);
auto PrevReg = SubregToReg->getOperand(0).getReg();
for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+ LoadInstr->getOperand(3).getReg());
DelInstrs.push_back(LoadInstr);
}
auto LastLoadReg0 = PrevReg;
- // First load into register 1. Perform a LDRSui to zero out the upper lanes in a single instruction.
+ // First load into register 1. Perform a LDRSui to zero out the upper lanes in
+ // a single instruction.
auto Lane0Load = *LoadToLaneInstrsAscending.begin();
- auto OriginalSplitLoad = *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+ auto OriginalSplitLoad =
+ *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
auto DestRegForMiddleIndex = MRI.createVirtualRegister(
MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
- MachineInstrBuilder MiddleIndexLoadInstr = CreateLoadInstruction(
- NumLanes, DestRegForMiddleIndex,
- OriginalSplitLoad->getOperand(3).getReg());
-
- InstrIdxForVirtReg.insert(std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+
+ MachineInstrBuilder MiddleIndexLoadInstr =
+ CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+ OriginalSplitLoad->getOperand(3).getReg());
+
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
InsInstrs.push_back(MiddleIndexLoadInstr);
DelInstrs.push_back(OriginalSplitLoad);
@@ -7545,17 +7551,18 @@ static void generateGatherPattern(
auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
unsigned SubregType;
switch (NumLanes) {
- case 4:
- SubregType = AArch64::ssub;
- break;
- case 8:
- SubregType = AArch64::hsub;
- break;
- case 16:
- SubregType = AArch64::bsub;
- break;
- default:
- llvm_unreachable("Got invalid NumLanes for machine-combiner gather pattern");
+ case 4:
+ SubregType = AArch64::ssub;
+ break;
+ case 8:
+ SubregType = AArch64::hsub;
+ break;
+ case 16:
+ SubregType = AArch64::bsub;
+ break;
+ default:
+ llvm_unreachable(
+ "Got invalid NumLanes for machine-combiner gather pattern");
}
auto SubRegToRegInstr =
@@ -7569,11 +7576,13 @@ static void generateGatherPattern(
InsInstrs.push_back(SubRegToRegInstr);
// Load remaining lanes into register 1.
- auto LanesToLoadToReg1 = llvm::make_range(
- LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, LoadToLaneInstrsAscending.end());
+ auto LanesToLoadToReg1 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+ LoadToLaneInstrsAscending.end());
PrevReg = SubRegToRegInstr->getOperand(0).getReg();
for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, LoadInstr->getOperand(3).getReg());
+ PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+ LoadInstr->getOperand(3).getReg());
if (Index == NumLanes / 2 - 2) {
break;
}
@@ -8892,11 +8901,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
break;
}
case AArch64MachineCombinerPattern::GATHER_i16: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 8);
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 8);
break;
}
case AArch64MachineCombinerPattern::GATHER_i8: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, Pattern, 16);
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 16);
break;
}
>From 81052e90b931c49614911810a51cf6d2bb79b6dd Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 13 Jul 2025 13:05:49 +0300
Subject: [PATCH 6/8] Code review comments
- Early exit if optimizing for size
- Fix loop condition to check if CurrInstr is not null
- Use .empty() instead of begin() != end()
- Rename pattern enum
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 34 +++++++++++---------
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 6 ++--
2 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9cf17d03f7288..35c9cb34c2222 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7329,9 +7329,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
- case AArch64MachineCombinerPattern::GATHER_i32:
- case AArch64MachineCombinerPattern::GATHER_i16:
- case AArch64MachineCombinerPattern::GATHER_i8:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7375,6 +7375,10 @@ static bool getMiscPatterns(MachineInstr &Root,
static bool getGatherPattern(MachineInstr &Root,
SmallVectorImpl<unsigned> &Patterns,
unsigned LoadLaneOpCode, unsigned NumLanes) {
+ // Early exit if optimizing for size.
+ if (Root.getMF()->getFunction().hasMinSize())
+ return false;
+
const MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
const TargetRegisterInfo *TRI =
Root.getMF()->getSubtarget().getRegisterInfo();
@@ -7391,7 +7395,7 @@ static bool getGatherPattern(MachineInstr &Root,
auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
- while (RemainingLanes.begin() != RemainingLanes.end() &&
+ while (!RemainingLanes.empty() && CurrInstr &&
CurrInstr->getOpcode() == LoadLaneOpCode &&
MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
CurrInstr->getNumOperands() == 4) {
@@ -7418,13 +7422,13 @@ static bool getGatherPattern(MachineInstr &Root,
switch (NumLanes) {
case 4:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i32);
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
break;
case 8:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i16);
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
break;
case 16:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_i8);
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
break;
default:
llvm_unreachable("Got bad number of lanes for gather pattern.");
@@ -7434,8 +7438,8 @@ static bool getGatherPattern(MachineInstr &Root,
}
/// Search for patterns where we use LD1 instructions to load into
-/// separate lanes of a 128-bit Neon register. We can increase MLP
-/// by loading into 2 Neon registers instead.
+/// separate lanes of a 128-bit Neon register. We can increase Memory Level
+/// Parallelism by loading into 2 Neon registers instead.
static bool getLoadPatterns(MachineInstr &Root,
SmallVectorImpl<unsigned> &Patterns) {
@@ -7604,9 +7608,9 @@ AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
- case AArch64MachineCombinerPattern::GATHER_i32:
- case AArch64MachineCombinerPattern::GATHER_i16:
- case AArch64MachineCombinerPattern::GATHER_i8:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -8895,17 +8899,17 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
- case AArch64MachineCombinerPattern::GATHER_i32: {
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
Pattern, 4);
break;
}
- case AArch64MachineCombinerPattern::GATHER_i16: {
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
Pattern, 8);
break;
}
- case AArch64MachineCombinerPattern::GATHER_i8: {
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
Pattern, 16);
break;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 3850e2cfecf4e..02734866e7122 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -173,9 +173,9 @@ enum AArch64MachineCombinerPattern : unsigned {
FNMADD,
- GATHER_i32,
- GATHER_i16,
- GATHER_i8
+ GATHER_LANE_i32,
+ GATHER_LANE_i16,
+ GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
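The new hasMinSize() early exit reflects that this is purely a
latency/parallelism trade: counting the instructions in the MIR tests above,
the original sequence is one scalar load, one SUBREG_TO_REG, and NumLanes-1
LD1s (NumLanes+1 instructions), while the rewritten sequence is two scalar
loads, two SUBREG_TO_REGs, NumLanes-2 LD1s, and a ZIP1 (NumLanes+3), i.e. two
extra instructions regardless of element width. A sketch reproducing the
count:

    #include <cstdio>

    int main() {
      for (unsigned NumLanes : {4u, 8u, 16u}) {
        unsigned Before = 1 + 1 + (NumLanes - 1);   // load, subreg, LD1 chain
        unsigned After = 2 + 2 + (NumLanes - 2) + 1; // two chains plus ZIP1
        std::printf("NumLanes=%u: %u -> %u instructions\n", NumLanes, Before,
                    After);
      }
    }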
>From 8f68890cc312f28d3aa54350c62781bd603ebaac Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 13 Jul 2025 15:02:07 +0300
Subject: [PATCH 7/8] fix unit-tests
---
llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
index 04cc9c4a7cfbf..f663e215cef16 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-split-loads.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
---
name: split_loads_to_fpr128
>From 89d2767a412c7ec035b2e0e4d9e3b918f034d449 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh at apple.com>
Date: Sun, 13 Jul 2025 15:51:15 +0300
Subject: [PATCH 8/8] regenerate checks for affected llc tests
---
.../complex-deinterleaving-uniform-cases.ll | 134 +++----
llvm/test/CodeGen/AArch64/concat-vector.ll | 5 +-
.../AArch64/fp-maximumnum-minimumnum.ll | 50 +--
llvm/test/CodeGen/AArch64/fsh.ll | 113 +++---
llvm/test/CodeGen/AArch64/llvm.frexp.ll | 14 +-
llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 345 +++++++++---------
llvm/test/CodeGen/AArch64/nontemporal.ll | 48 +--
7 files changed, 363 insertions(+), 346 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
index 7686740aec302..13434fabefa78 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -203,89 +203,93 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c)
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3
-; CHECK-NEXT: ldr s17, [sp, #40]
-; CHECK-NEXT: add x10, sp, #56
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT: ldr s17, [sp, #32]
+; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
; CHECK-NEXT: add x9, sp, #48
+; CHECK-NEXT: add x10, sp, #64
; CHECK-NEXT: mov v1.s[1], v3.s[0]
-; CHECK-NEXT: ldr s3, [sp, #32]
-; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-NEXT: ld1 { v17.s }[1], [x10]
-; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
-; CHECK-NEXT: ldr s16, [sp, #8]
; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
-; CHECK-NEXT: add x10, sp, #24
-; CHECK-NEXT: ld1 { v3.s }[1], [x9]
-; CHECK-NEXT: add x9, sp, #72
-; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7
+; CHECK-NEXT: add x11, sp, #72
+; CHECK-NEXT: ld1 { v17.s }[1], [x9]
+; CHECK-NEXT: ldr s18, [x10]
+; CHECK-NEXT: add x9, sp, #80
+; CHECK-NEXT: add x10, sp, #56
; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6
+; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7
+; CHECK-NEXT: ldr s16, [sp, #8]
+; CHECK-NEXT: ldr s3, [sp, #96]
+; CHECK-NEXT: ld1 { v18.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #88
; CHECK-NEXT: ldr s2, [sp]
-; CHECK-NEXT: ld1 { v16.s }[1], [x10]
-; CHECK-NEXT: add x10, sp, #112
-; CHECK-NEXT: ldr s20, [sp, #136]
; CHECK-NEXT: mov v1.s[2], v5.s[0]
-; CHECK-NEXT: ld1 { v17.s }[2], [x9]
-; CHECK-NEXT: add x9, sp, #64
-; CHECK-NEXT: ldr s5, [sp, #96]
-; CHECK-NEXT: ld1 { v3.s }[2], [x9]
+; CHECK-NEXT: ldr s5, [sp, #40]
; CHECK-NEXT: mov v0.s[2], v4.s[0]
-; CHECK-NEXT: add x9, sp, #88
-; CHECK-NEXT: ldr s4, [sp, #104]
-; CHECK-NEXT: ldr s19, [sp, #192]
; CHECK-NEXT: ld1 { v5.s }[1], [x10]
-; CHECK-NEXT: add x10, sp, #80
-; CHECK-NEXT: ld1 { v17.s }[3], [x9]
-; CHECK-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-NEXT: add x9, sp, #120
-; CHECK-NEXT: ld1 { v3.s }[3], [x10]
-; CHECK-NEXT: ld1 { v4.s }[1], [x9]
-; CHECK-NEXT: ldr s7, [sp, #128]
+; CHECK-NEXT: ldr s19, [x11]
; CHECK-NEXT: add x10, sp, #144
+; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d
+; CHECK-NEXT: add x11, sp, #160
+; CHECK-NEXT: ldr s18, [sp, #136]
+; CHECK-NEXT: ld1 { v19.s }[1], [x9]
; CHECK-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-NEXT: add x9, sp, #16
+; CHECK-NEXT: ldr s6, [sp, #128]
+; CHECK-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-NEXT: add x9, sp, #24
+; CHECK-NEXT: ldr s7, [sp, #104]
+; CHECK-NEXT: ld1 { v16.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #112
+; CHECK-NEXT: ld1 { v6.s }[1], [x10]
+; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d
+; CHECK-NEXT: add x10, sp, #120
+; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x10]
-; CHECK-NEXT: ld1 { v2.s }[1], [x9]
-; CHECK-NEXT: add x9, sp, #160
-; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s
-; CHECK-NEXT: fmul v18.4s, v4.4s, v16.4s
-; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s
-; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: add x10, sp, #208
-; CHECK-NEXT: ld1 { v7.s }[2], [x9]
-; CHECK-NEXT: add x9, sp, #152
-; CHECK-NEXT: ld1 { v19.s }[1], [x10]
-; CHECK-NEXT: ld1 { v20.s }[1], [x9]
+; CHECK-NEXT: ldr s17, [x11]
; CHECK-NEXT: add x9, sp, #176
-; CHECK-NEXT: add x10, sp, #184
-; CHECK-NEXT: fneg v6.4s, v6.4s
-; CHECK-NEXT: fneg v18.4s, v18.4s
-; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s
-; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s
-; CHECK-NEXT: ld1 { v7.s }[3], [x9]
-; CHECK-NEXT: add x9, sp, #168
-; CHECK-NEXT: ld1 { v20.s }[2], [x9]
-; CHECK-NEXT: ldr s4, [sp, #200]
+; CHECK-NEXT: add x10, sp, #16
+; CHECK-NEXT: add x11, sp, #168
+; CHECK-NEXT: ld1 { v17.s }[1], [x9]
+; CHECK-NEXT: ld1 { v2.s }[1], [x10]
+; CHECK-NEXT: add x9, sp, #152
+; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s
+; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s
+; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s
+; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: ld1 { v18.s }[1], [x9]
+; CHECK-NEXT: ldr s21, [x11]
+; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d
+; CHECK-NEXT: ldr s17, [sp, #192]
+; CHECK-NEXT: add x9, sp, #184
+; CHECK-NEXT: add x10, sp, #208
+; CHECK-NEXT: ld1 { v21.s }[1], [x9]
; CHECK-NEXT: add x9, sp, #216
-; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s
-; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s
-; CHECK-NEXT: ld1 { v4.s }[1], [x9]
-; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s
-; CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s
-; CHECK-NEXT: ld1 { v20.s }[3], [x10]
-; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s
-; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s
+; CHECK-NEXT: fneg v19.4s, v19.4s
+; CHECK-NEXT: fneg v20.4s, v20.4s
+; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s
+; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s
+; CHECK-NEXT: ld1 { v17.s }[1], [x10]
+; CHECK-NEXT: ldr s5, [sp, #200]
+; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d
+; CHECK-NEXT: ld1 { v5.s }[1], [x9]
+; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s
+; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s
+; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s
+; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s
+; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s
+; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s
; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12
-; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s
; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8
+; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8
; CHECK-NEXT: rev64 v4.4s, v4.4s
-; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s
-; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s
-; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT: mov v4.d[1], v2.d[0]
+; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s
+; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s
+; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8
+; CHECK-NEXT: mov v4.d[1], v3.d[0]
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: stp q4, q1, [x8, #16]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index acf15f1bd1178..e6f27b95d92c8 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1]
-; CHECK-NEXT: ld1 { v0.s }[2], [x2]
-; CHECK-NEXT: ld1 { v0.s }[3], [x3]
+; CHECK-NEXT: ldr s1, [x2]
+; CHECK-NEXT: ld1 { v1.s }[1], [x3]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%A = load <4 x i8>, ptr %ptrA
%B = load <4 x i8>, ptr %ptrB
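This hunk is the clearest illustration of the rewrite: instead of serially inserting lanes 2 and 3 into v0, the upper pair of elements is loaded into its own register and the halves are merged with a single zip1, so the two short load chains can issue in parallel. A back-of-the-envelope model shows why that shortens the critical path (the cycle counts below are placeholders, not taken from any scheduling model):

    // Toy latency model; 6/3 are made-up figures, not AArch64 scheduling
    // data.
    constexpr int LaneLoadLatency = 6; // each ld1 lane insert waits on the
                                       // previously built vector value
    constexpr int ZipLatency = 3;      // zip1 merging the two 64-bit halves

    // Before: ldr -> ld1[1] -> ld1[2] -> ld1[3], one serial four-load chain.
    constexpr int SerialDepth = 4 * LaneLoadLatency;

    // After: two independent ldr -> ld1[1] chains, then one zip1 to combine.
    constexpr int SplitDepth = 2 * LaneLoadLatency + ZipLatency;

    static_assert(SplitDepth < SerialDepth,
                  "splitting halves the load chain at the cost of one zip1");

The larger diffs below repeat the same shape, just with more lanes per half.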
diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
index c6b8e41f9bdfd..4906e2e15e51c 100644
--- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
@@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: add x9, sp, #16
; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3
; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4
+; FULLFP16-NEXT: add x10, sp, #40
; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5
; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6
; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7
@@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: ld1 { v1.h }[1], [x9]
; FULLFP16-NEXT: add x9, sp, #24
; FULLFP16-NEXT: mov v0.h[2], v2.h[0]
-; FULLFP16-NEXT: ldr h2, [sp]
; FULLFP16-NEXT: ld1 { v1.h }[2], [x9]
; FULLFP16-NEXT: add x9, sp, #32
-; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
; FULLFP16-NEXT: mov v0.h[3], v3.h[0]
; FULLFP16-NEXT: ld1 { v1.h }[3], [x9]
-; FULLFP16-NEXT: add x9, sp, #40
-; FULLFP16-NEXT: ldr h3, [sp, #72]
-; FULLFP16-NEXT: ld1 { v1.h }[4], [x9]
+; FULLFP16-NEXT: ldr h2, [x10]
; FULLFP16-NEXT: add x9, sp, #48
+; FULLFP16-NEXT: ldr h3, [sp, #72]
+; FULLFP16-NEXT: ld1 { v2.h }[1], [x9]
+; FULLFP16-NEXT: add x9, sp, #56
; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h
; FULLFP16-NEXT: mov v0.h[4], v4.h[0]
-; FULLFP16-NEXT: ld1 { v1.h }[5], [x9]
-; FULLFP16-NEXT: add x9, sp, #56
-; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h
-; FULLFP16-NEXT: mov v0.h[5], v5.h[0]
-; FULLFP16-NEXT: ld1 { v1.h }[6], [x9]
+; FULLFP16-NEXT: ld1 { v2.h }[2], [x9]
; FULLFP16-NEXT: add x9, sp, #64
-; FULLFP16-NEXT: str h2, [x8, #16]
+; FULLFP16-NEXT: mov v0.h[5], v5.h[0]
+; FULLFP16-NEXT: ld1 { v2.h }[3], [x9]
+; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d
+; FULLFP16-NEXT: ldr h2, [sp]
; FULLFP16-NEXT: mov v0.h[6], v6.h[0]
-; FULLFP16-NEXT: ld1 { v1.h }[7], [x9]
+; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h
; FULLFP16-NEXT: mov v0.h[7], v7.h[0]
+; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h
; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT: str h2, [x8, #16]
; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
; FULLFP16-NEXT: str q0, [x8]
; FULLFP16-NEXT: ret
@@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: add x9, sp, #16
; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3
; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4
+; FULLFP16-NEXT: add x10, sp, #40
; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5
; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6
; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7
@@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: ld1 { v1.h }[1], [x9]
; FULLFP16-NEXT: add x9, sp, #24
; FULLFP16-NEXT: mov v0.h[2], v2.h[0]
-; FULLFP16-NEXT: ldr h2, [sp]
; FULLFP16-NEXT: ld1 { v1.h }[2], [x9]
; FULLFP16-NEXT: add x9, sp, #32
-; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
; FULLFP16-NEXT: mov v0.h[3], v3.h[0]
; FULLFP16-NEXT: ld1 { v1.h }[3], [x9]
-; FULLFP16-NEXT: add x9, sp, #40
-; FULLFP16-NEXT: ldr h3, [sp, #72]
-; FULLFP16-NEXT: ld1 { v1.h }[4], [x9]
+; FULLFP16-NEXT: ldr h2, [x10]
; FULLFP16-NEXT: add x9, sp, #48
+; FULLFP16-NEXT: ldr h3, [sp, #72]
+; FULLFP16-NEXT: ld1 { v2.h }[1], [x9]
+; FULLFP16-NEXT: add x9, sp, #56
; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h
; FULLFP16-NEXT: mov v0.h[4], v4.h[0]
-; FULLFP16-NEXT: ld1 { v1.h }[5], [x9]
-; FULLFP16-NEXT: add x9, sp, #56
-; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h
-; FULLFP16-NEXT: mov v0.h[5], v5.h[0]
-; FULLFP16-NEXT: ld1 { v1.h }[6], [x9]
+; FULLFP16-NEXT: ld1 { v2.h }[2], [x9]
; FULLFP16-NEXT: add x9, sp, #64
-; FULLFP16-NEXT: str h2, [x8, #16]
+; FULLFP16-NEXT: mov v0.h[5], v5.h[0]
+; FULLFP16-NEXT: ld1 { v2.h }[3], [x9]
+; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d
+; FULLFP16-NEXT: ldr h2, [sp]
; FULLFP16-NEXT: mov v0.h[6], v6.h[0]
-; FULLFP16-NEXT: ld1 { v1.h }[7], [x9]
+; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h
; FULLFP16-NEXT: mov v0.h[7], v7.h[0]
+; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h
; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT: str h2, [x8, #16]
; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h
; FULLFP16-NEXT: str q0, [x8]
; FULLFP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 4c28c90824028..ae2ef2649102e 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) {
;
; CHECK-GI-LABEL: fshl_v7i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr s3, [sp, #48]
-; CHECK-GI-NEXT: ldr s20, [sp, #56]
-; CHECK-GI-NEXT: add x9, sp, #56
+; CHECK-GI-NEXT: ldr s17, [sp, #48]
+; CHECK-GI-NEXT: add x8, sp, #56
+; CHECK-GI-NEXT: add x9, sp, #64
; CHECK-GI-NEXT: ldr s4, [sp, #48]
-; CHECK-GI-NEXT: ldr s7, [sp, #80]
-; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff
-; CHECK-GI-NEXT: ldr s21, [sp, #88]
-; CHECK-GI-NEXT: mov v3.s[1], v20.s[0]
-; CHECK-GI-NEXT: fmov s20, w12
-; CHECK-GI-NEXT: ld1 { v4.s }[1], [x9]
-; CHECK-GI-NEXT: ldr s17, [sp]
-; CHECK-GI-NEXT: add x13, sp, #64
-; CHECK-GI-NEXT: mov v7.s[1], v21.s[0]
+; CHECK-GI-NEXT: ldr s21, [sp, #56]
+; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff
+; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8]
+; CHECK-GI-NEXT: ldr s20, [x9]
+; CHECK-GI-NEXT: add x8, sp, #72
+; CHECK-GI-NEXT: mov v4.s[1], v21.s[0]
; CHECK-GI-NEXT: fmov s21, w7
+; CHECK-GI-NEXT: ldr s6, [sp]
+; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8]
; CHECK-GI-NEXT: ldr s19, [sp, #64]
-; CHECK-GI-NEXT: mov w11, #31 // =0x1f
-; CHECK-GI-NEXT: mov v20.s[1], w12
+; CHECK-GI-NEXT: ldr s7, [sp, #80]
+; CHECK-GI-NEXT: ldr s22, [sp, #88]
+; CHECK-GI-NEXT: mov w9, #31 // =0x1f
+; CHECK-GI-NEXT: mov w11, #1 // =0x1
+; CHECK-GI-NEXT: mov v21.s[1], v6.s[0]
+; CHECK-GI-NEXT: fmov s6, w9
; CHECK-GI-NEXT: ldr s18, [sp, #96]
-; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13]
-; CHECK-GI-NEXT: mov w13, #1 // =0x1
-; CHECK-GI-NEXT: mov v3.s[2], v19.s[0]
-; CHECK-GI-NEXT: mov v21.s[1], v17.s[0]
-; CHECK-GI-NEXT: fmov s17, w11
-; CHECK-GI-NEXT: fmov s19, w13
+; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d
+; CHECK-GI-NEXT: fmov s20, w10
+; CHECK-GI-NEXT: mov v7.s[1], v22.s[0]
+; CHECK-GI-NEXT: mov v4.s[2], v19.s[0]
+; CHECK-GI-NEXT: fmov s19, w11
; CHECK-GI-NEXT: fmov s23, w0
-; CHECK-GI-NEXT: fmov s24, w11
-; CHECK-GI-NEXT: ldr s6, [sp, #8]
+; CHECK-GI-NEXT: mov v6.s[1], w9
+; CHECK-GI-NEXT: fmov s24, w9
+; CHECK-GI-NEXT: ldr s2, [sp, #8]
+; CHECK-GI-NEXT: mov v20.s[1], w10
; CHECK-GI-NEXT: ldr s0, [sp, #24]
; CHECK-GI-NEXT: ldr s5, [sp, #32]
+; CHECK-GI-NEXT: mov v19.s[1], w11
; CHECK-GI-NEXT: mov v7.s[2], v18.s[0]
-; CHECK-GI-NEXT: mov v17.s[1], w11
-; CHECK-GI-NEXT: mov v19.s[1], w13
-; CHECK-GI-NEXT: mov v20.s[2], w12
; CHECK-GI-NEXT: ldr s16, [sp, #72]
; CHECK-GI-NEXT: mov v23.s[1], w1
; CHECK-GI-NEXT: ldr s18, [sp, #80]
-; CHECK-GI-NEXT: mov v21.s[2], v6.s[0]
-; CHECK-GI-NEXT: mov v24.s[1], w11
+; CHECK-GI-NEXT: mov v21.s[2], v2.s[0]
+; CHECK-GI-NEXT: mov v24.s[1], w9
; CHECK-GI-NEXT: mov v0.s[1], v5.s[0]
-; CHECK-GI-NEXT: fmov s6, w4
-; CHECK-GI-NEXT: add x10, sp, #88
+; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: mov v20.s[2], w10
+; CHECK-GI-NEXT: add x8, sp, #88
; CHECK-GI-NEXT: movi v22.4s, #31
-; CHECK-GI-NEXT: mov v3.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v17.s[2], w11
-; CHECK-GI-NEXT: mov v19.s[2], w13
-; CHECK-GI-NEXT: ldr s2, [sp, #16]
-; CHECK-GI-NEXT: ldr s1, [sp, #40]
-; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10]
-; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b
+; CHECK-GI-NEXT: mov v4.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v6.s[2], w9
+; CHECK-GI-NEXT: mov v19.s[2], w11
+; CHECK-GI-NEXT: ldr s1, [sp, #16]
+; CHECK-GI-NEXT: ldr s3, [sp, #40]
+; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8]
; CHECK-GI-NEXT: mov v23.s[2], w2
-; CHECK-GI-NEXT: mov v6.s[1], w5
-; CHECK-GI-NEXT: add x8, sp, #72
-; CHECK-GI-NEXT: add x9, sp, #96
-; CHECK-GI-NEXT: mov v21.s[3], v2.s[0]
-; CHECK-GI-NEXT: mov v24.s[2], w11
-; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT: ld1 { v4.s }[3], [x8]
-; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b
-; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9]
-; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: add x8, sp, #96
+; CHECK-GI-NEXT: eor v2.16b, v7.16b, v20.16b
+; CHECK-GI-NEXT: mov v21.s[3], v1.s[0]
+; CHECK-GI-NEXT: mov v24.s[2], w9
+; CHECK-GI-NEXT: mov v0.s[2], v3.s[0]
+; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b
+; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8]
; CHECK-GI-NEXT: neg v3.4s, v19.4s
+; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b
; CHECK-GI-NEXT: mov v23.s[3], w3
-; CHECK-GI-NEXT: mov v6.s[2], w6
-; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b
-; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1
-; CHECK-GI-NEXT: neg v2.4s, v2.4s
-; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1
; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s
-; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s
-; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s
+; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b
; CHECK-GI-NEXT: mov s2, v1.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[2]
; CHECK-GI-NEXT: mov s4, v1.s[3]
+; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: mov s5, v0.s[1]
; CHECK-GI-NEXT: mov s6, v0.s[2]
-; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll
index 2213aa1429dbd..4e1876db772ed 100644
--- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll
@@ -700,13 +700,14 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
; CHECK-NEXT: ldr s1, [sp, #44]
; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: ld1 { v1.s }[1], [x19]
; CHECK-NEXT: mov v2.s[3], v0.s[0]
-; CHECK-NEXT: ld1 { v1.s }[2], [x20]
+; CHECK-NEXT: ld1 { v1.s }[1], [x19]
+; CHECK-NEXT: ldr s0, [x20]
+; CHECK-NEXT: ld1 { v0.s }[1], [x21]
; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v1.s }[3], [x21]
; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
;
@@ -872,10 +873,11 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
; CHECK-NEXT: bl frexpf
; CHECK-NEXT: ldr s0, [sp, #28]
; CHECK-NEXT: ld1 { v0.s }[1], [x19]
-; CHECK-NEXT: ld1 { v0.s }[2], [x20]
+; CHECK-NEXT: ldr s1, [x20]
+; CHECK-NEXT: ld1 { v1.s }[1], [x21]
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ld1 { v0.s }[3], [x21]
; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
; CHECK-NEXT: add sp, sp, #64
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 4f0c4080aa0ce..9443004ea434b 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -6810,195 +6810,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: .cfi_offset w29, -16
-; CHECK-SD-NEXT: ldr b5, [sp, #208]
+; CHECK-SD-NEXT: ldr b0, [sp, #208]
; CHECK-SD-NEXT: add x8, sp, #216
-; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: add x9, sp, #272
+; CHECK-SD-NEXT: ldr b2, [sp, #80]
; CHECK-SD-NEXT: ldr b4, [sp, #976]
-; CHECK-SD-NEXT: add x9, sp, #984
-; CHECK-SD-NEXT: add x12, sp, #328
-; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8]
-; CHECK-SD-NEXT: add x8, sp, #224
-; CHECK-SD-NEXT: movi v1.16b, #1
-; CHECK-SD-NEXT: mov v0.b[1], w1
-; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9]
-; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
-; CHECK-SD-NEXT: add x11, sp, #992
; CHECK-SD-NEXT: ldr b6, [sp, #720]
-; CHECK-SD-NEXT: ldr b7, [sp, #80]
-; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
+; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #224
+; CHECK-SD-NEXT: fmov s16, w0
+; CHECK-SD-NEXT: ldr b17, [sp, #848]
+; CHECK-SD-NEXT: add x10, sp, #24
+; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #232
-; CHECK-SD-NEXT: add x13, sp, #88
-; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11]
-; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13]
-; CHECK-SD-NEXT: add x13, sp, #856
-; CHECK-SD-NEXT: mov v0.b[2], w2
-; CHECK-SD-NEXT: add x14, sp, #1008
-; CHECK-SD-NEXT: add x15, sp, #872
-; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8]
+; CHECK-SD-NEXT: mov v16.b[1], w1
+; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #240
-; CHECK-SD-NEXT: add x16, sp, #888
-; CHECK-SD-NEXT: add x10, sp, #16
-; CHECK-SD-NEXT: add x9, sp, #24
-; CHECK-SD-NEXT: add x11, sp, #40
-; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
-; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8]
+; CHECK-SD-NEXT: mov v16.b[2], w2
+; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #248
-; CHECK-SD-NEXT: mov v0.b[3], w3
-; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8]
+; CHECK-SD-NEXT: mov v16.b[3], w3
+; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #256
-; CHECK-SD-NEXT: mov v0.b[4], w4
-; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8]
+; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #264
-; CHECK-SD-NEXT: mov v0.b[5], w5
-; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
-; CHECK-SD-NEXT: add x8, sp, #272
-; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8]
+; CHECK-SD-NEXT: mov v16.b[4], w4
+; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-SD-NEXT: ldr b1, [x9]
; CHECK-SD-NEXT: add x8, sp, #280
-; CHECK-SD-NEXT: mov v0.b[6], w6
-; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8]
+; CHECK-SD-NEXT: add x9, sp, #88
+; CHECK-SD-NEXT: mov v16.b[5], w5
+; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #288
-; CHECK-SD-NEXT: mov v0.b[7], w7
-; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8]
+; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #296
-; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10]
-; CHECK-SD-NEXT: add x10, sp, #128
-; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8]
+; CHECK-SD-NEXT: mov v16.b[6], w6
+; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #304
-; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9]
-; CHECK-SD-NEXT: add x9, sp, #136
-; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8]
+; CHECK-SD-NEXT: mov v16.b[7], w7
+; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #312
-; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8]
+; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #320
-; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8]
-; CHECK-SD-NEXT: add x8, sp, #32
-; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8]
-; CHECK-SD-NEXT: add x8, sp, #144
-; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12]
-; CHECK-SD-NEXT: add x12, sp, #728
-; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12]
-; CHECK-SD-NEXT: add x12, sp, #1000
-; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11]
-; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12]
-; CHECK-SD-NEXT: add x12, sp, #736
-; CHECK-SD-NEXT: add x11, sp, #920
-; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b
-; CHECK-SD-NEXT: ldr b5, [sp, #848]
-; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12]
-; CHECK-SD-NEXT: add x12, sp, #48
-; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13]
-; CHECK-SD-NEXT: add x13, sp, #744
-; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14]
-; CHECK-SD-NEXT: add x14, sp, #96
-; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12]
-; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13]
-; CHECK-SD-NEXT: add x13, sp, #864
-; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14]
-; CHECK-SD-NEXT: add x14, sp, #1016
-; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13]
-; CHECK-SD-NEXT: add x13, sp, #752
-; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14]
-; CHECK-SD-NEXT: add x14, sp, #104
-; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13]
-; CHECK-SD-NEXT: add x13, sp, #1024
-; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14]
-; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15]
-; CHECK-SD-NEXT: add x15, sp, #760
-; CHECK-SD-NEXT: add x14, sp, #112
-; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13]
-; CHECK-SD-NEXT: add x13, sp, #880
-; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15]
-; CHECK-SD-NEXT: add x15, sp, #1032
-; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14]
-; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13]
-; CHECK-SD-NEXT: add x14, sp, #768
-; CHECK-SD-NEXT: add x13, sp, #120
-; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15]
-; CHECK-SD-NEXT: add x15, sp, #1040
-; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14]
-; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13]
-; CHECK-SD-NEXT: add x13, sp, #776
-; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16]
-; CHECK-SD-NEXT: add x14, sp, #1048
-; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15]
-; CHECK-SD-NEXT: add x15, sp, #896
-; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13]
-; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
-; CHECK-SD-NEXT: add x10, sp, #784
-; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15]
-; CHECK-SD-NEXT: add x13, sp, #1056
-; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14]
-; CHECK-SD-NEXT: add x14, sp, #904
-; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10]
-; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9]
-; CHECK-SD-NEXT: add x9, sp, #792
-; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14]
-; CHECK-SD-NEXT: add x10, sp, #1064
-; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13]
-; CHECK-SD-NEXT: add x13, sp, #912
-; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9]
-; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8]
-; CHECK-SD-NEXT: add x9, sp, #800
-; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13]
+; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #328
+; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9]
+; CHECK-SD-NEXT: add x8, sp, #96
+; CHECK-SD-NEXT: add x9, sp, #144
+; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #104
+; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: movi v1.16b, #1
+; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #112
+; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #120
+; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #128
+; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #136
+; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
+; CHECK-SD-NEXT: ldr b3, [x9]
; CHECK-SD-NEXT: add x8, sp, #152
-; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10]
-; CHECK-SD-NEXT: add x10, sp, #1072
-; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9]
-; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8]
-; CHECK-SD-NEXT: add x9, sp, #808
-; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11]
-; CHECK-SD-NEXT: add x8, sp, #56
-; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10]
-; CHECK-SD-NEXT: add x10, sp, #160
-; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8]
-; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9]
-; CHECK-SD-NEXT: add x9, sp, #928
-; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10]
-; CHECK-SD-NEXT: add x10, sp, #1080
-; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9]
+; CHECK-SD-NEXT: add x9, sp, #984
+; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #160
+; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #168
+; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #176
+; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #184
+; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #192
+; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #200
+; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9]
+; CHECK-SD-NEXT: add x8, sp, #992
+; CHECK-SD-NEXT: add x9, sp, #1040
+; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1000
+; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1008
+; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1016
+; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1024
+; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1032
+; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8]
+; CHECK-SD-NEXT: ldr b5, [x9]
+; CHECK-SD-NEXT: add x8, sp, #1048
+; CHECK-SD-NEXT: add x9, sp, #728
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1056
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1064
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1072
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1080
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1088
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #1096
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9]
+; CHECK-SD-NEXT: add x8, sp, #736
+; CHECK-SD-NEXT: add x9, sp, #784
+; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #744
+; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #752
+; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b
+; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #760
+; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #768
+; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #776
+; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8]
+; CHECK-SD-NEXT: ldr b7, [x9]
+; CHECK-SD-NEXT: add x8, sp, #792
+; CHECK-SD-NEXT: add x9, sp, #856
+; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #800
+; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #808
+; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #816
-; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10]
-; CHECK-SD-NEXT: add x9, sp, #168
-; CHECK-SD-NEXT: add x10, sp, #176
-; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8]
-; CHECK-SD-NEXT: add x8, sp, #936
-; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9]
-; CHECK-SD-NEXT: add x9, sp, #1088
-; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8]
-; CHECK-SD-NEXT: add x8, sp, #64
-; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9]
-; CHECK-SD-NEXT: add x9, sp, #824
-; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8]
-; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9]
-; CHECK-SD-NEXT: add x9, sp, #944
-; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10]
-; CHECK-SD-NEXT: add x10, sp, #1096
-; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9]
+; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #824
+; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #832
-; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10]
-; CHECK-SD-NEXT: add x9, sp, #184
-; CHECK-SD-NEXT: add x10, sp, #72
-; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8]
-; CHECK-SD-NEXT: add x8, sp, #952
-; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9]
-; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8]
+; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #840
-; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10]
-; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b
-; CHECK-SD-NEXT: add x9, sp, #192
-; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8]
+; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9]
+; CHECK-SD-NEXT: add x8, sp, #864
+; CHECK-SD-NEXT: add x9, sp, #16
+; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9]
+; CHECK-SD-NEXT: add x9, sp, #912
+; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #872
+; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d
+; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10]
+; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #880
+; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #888
+; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #896
+; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #904
+; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8]
+; CHECK-SD-NEXT: ldr b18, [x9]
+; CHECK-SD-NEXT: add x8, sp, #920
+; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8]
+; CHECK-SD-NEXT: add x8, sp, #928
+; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #40
+; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8]
+; CHECK-SD-NEXT: add x8, sp, #936
+; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #48
+; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8]
+; CHECK-SD-NEXT: add x8, sp, #944
+; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #56
+; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8]
+; CHECK-SD-NEXT: add x8, sp, #952
+; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #64
+; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8]
; CHECK-SD-NEXT: add x8, sp, #960
-; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9]
-; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8]
-; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b
-; CHECK-SD-NEXT: add x8, sp, #200
-; CHECK-SD-NEXT: add x9, sp, #968
-; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b
-; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8]
-; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9]
-; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b
-; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b
-; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #72
+; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8]
+; CHECK-SD-NEXT: add x8, sp, #968
+; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8]
+; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b
+; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d
+; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b
+; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index f8ba150a0405f..f7a87ae340a73 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) {
;
; CHECK-BE-LABEL: test_stnp_v17f32:
; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4
+; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-BE-NEXT: ldr s16, [sp, #36]
+; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5
-; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-BE-NEXT: ldr s17, [sp, #4]
-; CHECK-BE-NEXT: add x8, sp, #44
-; CHECK-BE-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-BE-NEXT: add x8, sp, #12
+; CHECK-BE-NEXT: add x9, sp, #20
+; CHECK-BE-NEXT: ldr s16, [sp, #36]
; CHECK-BE-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-BE-NEXT: ldr s1, [sp, #4]
+; CHECK-BE-NEXT: mov v4.s[1], v5.s[0]
+; CHECK-BE-NEXT: add x10, sp, #52
; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6
; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7
; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3
-; CHECK-BE-NEXT: ldr s1, [sp, #68]
-; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8]
-; CHECK-BE-NEXT: add x8, sp, #12
-; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8]
-; CHECK-BE-NEXT: add x8, sp, #52
-; CHECK-BE-NEXT: str s1, [x0, #64]
-; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8]
-; CHECK-BE-NEXT: add x8, sp, #20
+; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8]
+; CHECK-BE-NEXT: ldr s5, [x9]
+; CHECK-BE-NEXT: add x8, sp, #28
+; CHECK-BE-NEXT: add x9, sp, #44
+; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8]
+; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9]
+; CHECK-BE-NEXT: ldr s17, [x10]
+; CHECK-BE-NEXT: add x8, sp, #60
; CHECK-BE-NEXT: mov v4.s[2], v6.s[0]
; CHECK-BE-NEXT: mov v0.s[2], v2.s[0]
-; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8]
-; CHECK-BE-NEXT: add x8, sp, #60
-; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8]
-; CHECK-BE-NEXT: add x8, sp, #28
-; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8]
+; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8]
+; CHECK-BE-NEXT: ldr s2, [sp, #68]
+; CHECK-BE-NEXT: add x8, x0, #32
+; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d
+; CHECK-BE-NEXT: add x9, x0, #48
+; CHECK-BE-NEXT: str s2, [x0, #64]
+; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d
; CHECK-BE-NEXT: mov v4.s[3], v7.s[0]
-; CHECK-BE-NEXT: add x8, x0, #48
; CHECK-BE-NEXT: mov v0.s[3], v3.s[0]
-; CHECK-BE-NEXT: st1 { v16.4s }, [x8]
-; CHECK-BE-NEXT: add x8, x0, #32
-; CHECK-BE-NEXT: st1 { v17.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
; CHECK-BE-NEXT: add x8, x0, #16
+; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
; CHECK-BE-NEXT: st1 { v0.4s }, [x0]
; CHECK-BE-NEXT: ret