[llvm] [AArch64][MachineCombiner] Combine sequences of gather patterns (PR #152979)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 02:09:34 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Jonathan Cohen (jcohen-apple)
<details>
<summary>Changes</summary>
Reland of #142941
Squashed with fixes for #150004, #149585
Notably, this adds a check to apply the optimization conservatively: only when there is no chance of an aliasing instruction between the LD1 instructions we combine. If there are any function calls or stores between the loads, bail out.
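For reference, the i32 flavor of the rewrite (sketched from the `split_loads_to_fpr128` MIR test added in this patch; register names are illustrative) turns a serial chain of lane loads into two independent half-vector chains combined with a ZIP1:

```
; Before: each LD1i32 depends on the previous lane load (serial chain).
%lane0:fpr32 = LDRSroX %base, %idx, 0, 1
%v0:fpr128   = SUBREG_TO_REG 0, %lane0, %subreg.ssub
%v1:fpr128   = LD1i32 %v0, 1, %ptr1
%v2:fpr128   = LD1i32 %v1, 2, %ptr2
%v3:fpr128   = LD1i32 %v2, 3, %ptr3

; After: two independent two-lane chains, combined with ZIP1v2i64.
%lane0:fpr32 = LDRSroX %base, %idx, 0, 1
%lo0:fpr128  = SUBREG_TO_REG 0, %lane0, %subreg.ssub
%lo1:fpr128  = LD1i32 %lo0, 1, %ptr1
%hi:fpr32    = LDRSui %ptr2, 0
%hi0:fpr128  = SUBREG_TO_REG 0, %hi, %subreg.ssub
%hi1:fpr128  = LD1i32 %hi0, 1, %ptr3
%res:fpr128  = ZIP1v2i64 %lo1, %hi1
```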
---
Patch is 74.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152979.diff
11 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+355)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (+4)
- (added) llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir (+45)
- (added) llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir (+400)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll (+69-65)
- (modified) llvm/test/CodeGen/AArch64/concat-vector.ll (+3-2)
- (modified) llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll (+26-24)
- (modified) llvm/test/CodeGen/AArch64/fsh.ll (+57-56)
- (modified) llvm/test/CodeGen/AArch64/llvm.frexp.ll (+8-6)
- (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+175-170)
- (modified) llvm/test/CodeGen/AArch64/nontemporal.ll (+25-23)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index fb59c9f131fb2..eb759b36086a7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -7412,11 +7413,346 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
+/// Check if there are any stores or calls between two instructions in the same
+/// basic block.
+static bool hasInterveningStoreOrCall(const MachineInstr *First, const MachineInstr *Last) {
+ if (!First || !Last || First == Last)
+ return false;
+
+ // Both instructions must be in the same basic block.
+ if (First->getParent() != Last->getParent())
+ return false;
+
+ // Sanity check that First comes before Last.
+ const MachineBasicBlock *MBB = First->getParent();
+ auto InstrIt = First->getIterator();
+ auto LastIt = Last->getIterator();
+
+ for (; InstrIt != MBB->end(); ++InstrIt) {
+ if (InstrIt == LastIt)
+ break;
+
+ // Check for stores or calls that could interfere
+ if (InstrIt->mayStore() || InstrIt->isCall())
+ return true;
+ }
+
+ // If we reached the end of the basic block without seeing Last, the
+ // instructions were not ordered correctly and the analysis is invalid.
+ assert(InstrIt != MBB->end() &&
+ "Got bad machine instructions, First should come before Last!");
+ return false;
+}
+
+/// Check if the given instruction forms a gather load pattern that can be
+/// optimized for better Memory-Level Parallelism (MLP). This function
+/// identifies chains of NEON lane load instructions that load data from
+/// different memory addresses into individual lanes of a 128-bit vector
+/// register, then attempts to split the pattern into parallel loads to break
+/// the serial dependency between instructions.
+///
+/// Pattern Matched:
+/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
+/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
+///
+/// Transformed Into:
+/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
+/// to combine the results, enabling better memory-level parallelism.
+///
+/// Supported Element Types:
+/// - 32-bit elements (LD1i32, 4 lanes total)
+/// - 16-bit elements (LD1i16, 8 lanes total)
+/// - 8-bit elements (LD1i8, 16 lanes total)
+static bool getGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns,
+ unsigned LoadLaneOpCode, unsigned NumLanes) {
+ const MachineFunction *MF = Root.getMF();
+
+ // Early exit if optimizing for minimum size (minsize).
+ if (MF->getFunction().hasMinSize())
+ return false;
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+ // The root of the pattern must load into the last lane of the vector.
+ if (Root.getOperand(2).getImm() != NumLanes - 1)
+ return false;
+
+ // Check that we have loads into all lanes except lane 0.
+ // For each load we also want to check that:
+ // 1. It has a single non-debug use (since we will be replacing the virtual
+ // register).
+ // 2. Its addressing mode uses only a single pointer operand.
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+ SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
+ SmallSet<const MachineInstr *, 16> LoadInstrs = {};
+ while (!RemainingLanes.empty() && CurrInstr &&
+ CurrInstr->getOpcode() == LoadLaneOpCode &&
+ MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+ CurrInstr->getNumOperands() == 4) {
+ RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+ LoadInstrs.insert(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ // Check that we have found a match for lanes 1 through N-1.
+ if (!RemainingLanes.empty())
+ return false;
+
+ // Match the SUBREG_TO_REG sequence.
+ if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+ return false;
+
+ // Verify that the SUBREG_TO_REG loads an integer into the first lane.
+ auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+ unsigned SingleLaneSizeInBits = 128 / NumLanes;
+ if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+ return false;
+
+ // Verify that it also has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+ return false;
+
+ LoadInstrs.insert(MRI.getUniqueVRegDef(Lane0LoadReg));
+
+ // Check for intervening stores or calls between the first and last load.
+ // Sort the load instructions by program order first.
+ SmallVector<const MachineInstr *, 16> SortedLoads(LoadInstrs.begin(),
+ LoadInstrs.end());
+ llvm::sort(SortedLoads, [](const MachineInstr *A, const MachineInstr *B) {
+ if (A->getParent() != B->getParent()) {
+ // Loads in different blocks should not occur for gather patterns.
+ return false;
+ }
+ // Compare positions within the same basic block
+ for (const MachineInstr &MI : *A->getParent()) {
+ if (&MI == A)
+ return true;
+ if (&MI == B)
+ return false;
+ }
+ return false;
+ });
+
+ const MachineInstr *FirstLoad = SortedLoads.front();
+ const MachineInstr *LastLoad = SortedLoads.back();
+
+ if (hasInterveningStoreOrCall(FirstLoad, LastLoad))
+ return false;
+
+ switch (NumLanes) {
+ case 4:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+ break;
+ case 8:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+ break;
+ case 16:
+ Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+ break;
+ default:
+ llvm_unreachable("Got bad number of lanes for gather pattern.");
+ }
+
+ return true;
+}
+
+/// Search for patterns of LD instructions we can optimize.
+static bool getLoadPatterns(MachineInstr &Root,
+ SmallVectorImpl<unsigned> &Patterns) {
+
+ // The pattern searches for loads into single lanes.
+ switch (Root.getOpcode()) {
+ case AArch64::LD1i32:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+ case AArch64::LD1i16:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+ case AArch64::LD1i8:
+ return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+ default:
+ return false;
+ }
+}
+
+/// Generate optimized instruction sequence for gather load patterns to improve
+/// Memory-Level Parallelism (MLP). This function transforms a chain of
+/// sequential NEON lane loads into parallel vector loads that can execute
+/// concurrently.
+static void
+generateGatherPattern(MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+ unsigned Pattern, unsigned NumLanes) {
+ MachineFunction &MF = *Root.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ // Gather the initial load instructions to build the pattern.
+ SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+ MachineInstr *CurrInstr = &Root;
+ for (unsigned i = 0; i < NumLanes - 1; ++i) {
+ LoadToLaneInstrs.push_back(CurrInstr);
+ CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+ }
+
+ // Sort the load instructions according to the lane.
+ llvm::sort(LoadToLaneInstrs,
+ [](const MachineInstr *A, const MachineInstr *B) {
+ return A->getOperand(2).getImm() > B->getOperand(2).getImm();
+ });
+
+ MachineInstr *SubregToReg = CurrInstr;
+ LoadToLaneInstrs.push_back(
+ MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+ auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+ const TargetRegisterClass *FPR128RegClass =
+ MRI.getRegClass(Root.getOperand(0).getReg());
+
+ // Helper lambda to create a LD1 instruction.
+ auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
+ Register SrcRegister, unsigned Lane,
+ Register OffsetRegister,
+ bool OffsetRegisterKillState) {
+ auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+ MachineInstrBuilder LoadIndexIntoRegister =
+ BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+ NewRegister)
+ .addReg(SrcRegister)
+ .addImm(Lane)
+ .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
+ InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+ InsInstrs.push_back(LoadIndexIntoRegister);
+ return NewRegister;
+ };
+
+ // Helper lambda to create a load instruction based on the NumLanes of the
+ // NEON register we are rewriting.
+ auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
+ Register OffsetReg,
+ bool KillState) -> MachineInstrBuilder {
+ unsigned Opcode;
+ switch (NumLanes) {
+ case 4:
+ Opcode = AArch64::LDRSui;
+ break;
+ case 8:
+ Opcode = AArch64::LDRHui;
+ break;
+ case 16:
+ Opcode = AArch64::LDRBui;
+ break;
+ default:
+ llvm_unreachable(
+ "Got unsupported number of lanes in machine-combiner gather pattern");
+ }
+ // Immediate offset load
+ return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+ .addReg(OffsetReg)
+ .addImm(0);
+ };
+
+ // Load the remaining lanes into register 0.
+ auto LanesToLoadToReg0 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+ LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+ Register PrevReg = SubregToReg->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+ const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
+ PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
+ OffsetRegOperand.getReg(),
+ OffsetRegOperand.isKill());
+ DelInstrs.push_back(LoadInstr);
+ }
+ Register LastLoadReg0 = PrevReg;
+
+ // First load into register 1. Perform an integer load to zero out the upper
+ // lanes in a single instruction.
+ MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
+ MachineInstr *OriginalSplitLoad =
+ *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+ Register DestRegForMiddleIndex = MRI.createVirtualRegister(
+ MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+ const MachineOperand &OriginalSplitToLoadOffsetOperand =
+ OriginalSplitLoad->getOperand(3);
+ MachineInstrBuilder MiddleIndexLoadInstr =
+ CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
+ OriginalSplitToLoadOffsetOperand.getReg(),
+ OriginalSplitToLoadOffsetOperand.isKill());
+
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+ InsInstrs.push_back(MiddleIndexLoadInstr);
+ DelInstrs.push_back(OriginalSplitLoad);
+
+ // Subreg To Reg instruction for register 1.
+ Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+ unsigned SubregType;
+ switch (NumLanes) {
+ case 4:
+ SubregType = AArch64::ssub;
+ break;
+ case 8:
+ SubregType = AArch64::hsub;
+ break;
+ case 16:
+ SubregType = AArch64::bsub;
+ break;
+ default:
+ llvm_unreachable(
+ "Got invalid NumLanes for machine-combiner gather pattern");
+ }
+
+ auto SubRegToRegInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+ DestRegForSubregToReg)
+ .addImm(0)
+ .addReg(DestRegForMiddleIndex, getKillRegState(true))
+ .addImm(SubregType);
+ InstrIdxForVirtReg.insert(
+ std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+ InsInstrs.push_back(SubRegToRegInstr);
+
+ // Load remaining lanes into register 1.
+ auto LanesToLoadToReg1 =
+ llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+ LoadToLaneInstrsAscending.end());
+ PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+ for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+ const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
+ PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
+ OffsetRegOperand.getReg(),
+ OffsetRegOperand.isKill());
+
+ // Do not add the last reg to DelInstrs - it will be removed later.
+ if (Index == NumLanes / 2 - 2) {
+ break;
+ }
+ DelInstrs.push_back(LoadInstr);
+ }
+ Register LastLoadReg1 = PrevReg;
+
+ // Create the final zip instruction to combine the results.
+ MachineInstrBuilder ZipInstr =
+ BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+ Root.getOperand(0).getReg())
+ .addReg(LastLoadReg0)
+ .addReg(LastLoadReg1);
+ InsInstrs.push_back(ZipInstr);
+}
+
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7446,6 +7782,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
+ // Load patterns
+ if (getLoadPatterns(Root, Patterns))
+ return true;
+
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8701,6 +9041,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
+ case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 4);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 8);
+ break;
+ }
+ case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
+ generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+ Pattern, 16);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4b..02734866e7122 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
+
+ GATHER_LANE_i32,
+ GATHER_LANE_i16,
+ GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir
new file mode 100644
index 0000000000000..6b338d98afb53
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir
@@ -0,0 +1,45 @@
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
+
+
+--- |
+ @external_func = external global i32
+ define void @negative_pattern_offset_reg_copied_to_physical(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) {
+ entry:
+ ret void
+ }
+...
+---
+name: negative_pattern_offset_reg_copied_to_physical
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3
+
+ ; CHECK-LABEL: name: negative_pattern_offset_reg_copied_to_physical
+ ; CHECK: [[BASE_REG:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[PTR_1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[PTR_2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[PTR_3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[BASE_REG]], killed [[PTR_1]], 0, 1
+ ; CHECK-NEXT: [[LD_LANE_0:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+ ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_0]], 1, [[PTR_2]]
+ ; CHECK-NEXT: $x0 = COPY [[PTR_2]]
+ ; CHECK-NEXT: BL @external_func, csr_aarch64_aapcs, implicit-def $lr, implicit $x0, implicit-def $x0
+ ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[PTR_2]]
+ ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[PTR_3]]
+ ; CHECK-NEXT: [[RESULT:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+ %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+ %7:fpr128 = LD1i32 %6, 1, %2
+ $x0 = COPY %2
+ BL @external_func, csr_aarch64_aapcs, implicit-def $lr, implicit $x0, implicit-def $x0
+ %8:fpr128 = LD1i32 %7, 2, killed %2
+ %9:fpr128 = LD1i32 %8, 3, killed %3
+ %10:gpr64common = COPY $x0
+ $q0 = COPY %9
+ RET_ReallyLR implicit $q0
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
new file mode 100644
index 0000000000000..a7570d2293f8a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
@@ -0,0 +1,400 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: split_loads_to_fpr128
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+ ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], [[COPY1]], 0, 1
+ ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, [[LD_i32]], %subreg.ssub
+ ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, [[COPY2]]
+ ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0
+ ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+ ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, [[COPY4]]
+ ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+ ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0
+ %0:gpr64common = COPY $x0
+ %1:gpr64common = COPY $x1
+ %2:gpr64common = COPY $x2
+ %3:gpr64common = COPY $x3
+ %4:gpr64common = COPY $x4
+ %5:fpr32 = LDRSroX %0, %1, 0, 1
+ %6:fpr128 = SUBREG_TO_REG 0, %5, %subreg.ssub
+ %7:fpr128 = LD1i32 %6, 1, %2
+ %8:fpr128 = LD1i32 %7, 2, %3
+ %9:fpr128 = LD1i32 %8, 3, %4
+ $q0 = COPY %9
+ RET_ReallyLR implicit $q0
+
+---
+name: split_loads_to_fpr128_ui
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $x2, $x3, $x4
+
+ ; CHECK-LABEL: name: split_loads_to_fpr128_ui
+ ; CHE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/152979