[llvm] [MachineCombiner] Reassociate long chains of accumulation instructions into a tree to increase ILP (PR #126060)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 6 05:04:36 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Jonathan Cohen (jcohen-apple)

<details>
<summary>Changes</summary>

This pattern shows up often in media codecs. The optimization should only kick in at O3. It currently supports only a single accumulation instruction, but can easily be extended to support additional instructions in the future.
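
For context, here is a minimal sketch of the kind of kernel that produces such chains (illustrative only, not taken from the patch; names are hypothetical). After vectorization and unrolling, each `acc +=` step typically lowers to a UABAL instruction that accumulates into the result of the previous one, forming a single long dependency chain:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Hypothetical sum-of-absolute-differences kernel, common in video codecs.
uint32_t sad_8x8(const uint8_t *a, const uint8_t *b, size_t stride) {
  uint32_t acc = 0;
  for (int row = 0; row < 8; ++row) {
    for (int col = 0; col < 8; ++col)
      acc += std::abs(int(a[col]) - int(b[col])); // serial accumulation
    a += stride;
    b += stride;
  }
  return acc;
}
```

Because every accumulation depends on the previous one, the chain executes at the accumulate instruction's latency times the chain length; splitting it into a few independent accumulators that are summed at the end exposes ILP at the cost of a couple of extra adds.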

---
Full diff: https://github.com/llvm/llvm-project/pull/126060.diff


3 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+240) 
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (+2) 
- (added) llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir (+105) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 0f2b969fba35c7..af8d6a8031c33d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -78,6 +79,18 @@ static cl::opt<unsigned>
     BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                       cl::desc("Restrict range of B instructions (DEBUG)"));
 
+static cl::opt<bool>
+    EnableAccReassociation("aarch64-acc-reassoc", cl::Hidden, cl::init(true),
+                           cl::desc("Enable reassociation of accumulation chains"));
+
+static cl::opt<unsigned>
+    MinAccumulatorDepth("aarch64-acc-min-depth", cl::Hidden, cl::init(8),
+                        cl::desc("Minimum length of an accumulator chain "
+                                 "required for the optimization to kick in"));
+
+static cl::opt<unsigned>
+    MaxAccumulatorWidth("aarch64-acc-max-width", cl::Hidden, cl::init(3),
+                        cl::desc("Maximum number of branches in the "
+                                 "accumulator tree"));
+
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                           AArch64::CATCHRET),
@@ -6674,6 +6687,118 @@ static bool getMaddPatterns(MachineInstr &Root,
   }
   return Found;
 }
+
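+// Returns true if \p Opcode is an accumulation instruction supported by the
+// chain reassociation below (currently the NEON and SVE2 UABAL variants).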
+static bool isAccumulationOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    break;
+  case AArch64::UABALB_ZZZ_D:
+  case AArch64::UABALB_ZZZ_H:
+  case AArch64::UABALB_ZZZ_S:
+  case AArch64::UABALT_ZZZ_D:
+  case AArch64::UABALT_ZZZ_H:
+  case AArch64::UABALT_ZZZ_S:
+  case AArch64::UABALv16i8_v8i16:
+  case AArch64::UABALv2i32_v2i64:
+  case AArch64::UABALv4i16_v4i32:
+  case AArch64::UABALv4i32_v2i64:
+  case AArch64::UABALv8i16_v4i32:
+  case AArch64::UABALv8i8_v8i16:
+    return true;
+  }
+
+  return false;
+}
+
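+// Maps an accumulation opcode (UABAL*) to the corresponding opcode that can
+// start a fresh chain without an accumulator input (UABDL*).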
+static unsigned getAccumulationStartOpCode(unsigned AccumulationOpcode) {
+  switch (AccumulationOpcode) {
+  default:
+    llvm_unreachable("Unknown accumulator opcode");
+  case AArch64::UABALB_ZZZ_D:
+    return AArch64::UABDLB_ZZZ_D;
+  case AArch64::UABALB_ZZZ_H:
+    return AArch64::UABDLB_ZZZ_H;
+  case AArch64::UABALB_ZZZ_S:
+    return AArch64::UABDLB_ZZZ_S;
+  case AArch64::UABALT_ZZZ_D:
+    return AArch64::UABDLT_ZZZ_D;
+  case AArch64::UABALT_ZZZ_H:
+    return AArch64::UABDLT_ZZZ_H;
+  case AArch64::UABALT_ZZZ_S:
+    return AArch64::UABDLT_ZZZ_S;
+  case AArch64::UABALv16i8_v8i16:
+    return AArch64::UABDLv16i8_v8i16;
+  case AArch64::UABALv2i32_v2i64:
+    return AArch64::UABDLv2i32_v2i64;
+  case AArch64::UABALv4i16_v4i32:
+    return AArch64::UABDLv4i16_v4i32;
+  case AArch64::UABALv4i32_v2i64:
+    return AArch64::UABDLv4i32_v2i64;
+  case AArch64::UABALv8i16_v4i32:
+    return AArch64::UABDLv8i16_v4i32;
+  case AArch64::UABALv8i8_v8i16:
+    return AArch64::UABDLv8i8_v8i16;
+  }
+}
+
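+// Collects, bottom-up, the registers defined by a linear chain of
+// accumulation instructions that ends at \p CurrentInstr.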
+static void getAccumulatorChain(MachineInstr *CurrentInstr,
+                                MachineBasicBlock &MBB,
+                                MachineRegisterInfo &MRI,
+                                SmallVectorImpl<Register> &Chain) {
+  // Walk up the chain of accumulation instructions and collect them in the
+  // vector.
+  unsigned AccumulatorOpcode = CurrentInstr->getOpcode();
+  unsigned ChainStartOpCode = getAccumulationStartOpCode(AccumulatorOpcode);
+  while (CurrentInstr &&
+         (canCombine(MBB, CurrentInstr->getOperand(1), AccumulatorOpcode) ||
+          canCombine(MBB, CurrentInstr->getOperand(1), ChainStartOpCode))) {
+    Chain.push_back(CurrentInstr->getOperand(0).getReg());
+    CurrentInstr = MRI.getUniqueVRegDef(CurrentInstr->getOperand(1).getReg());
+  }
+
+  // Add the instruction at the top of the chain.
+  if (CurrentInstr && CurrentInstr->getOpcode() == ChainStartOpCode)
+    Chain.push_back(CurrentInstr->getOperand(0).getReg());
+}
+
+/// Find chains of accumulations, likely linearized by the reassociation pass,
+/// that can be rewritten as a tree for increased ILP.
+static bool
+getAccumulatorReassociationPatterns(MachineInstr &Root,
+                                    SmallVectorImpl<unsigned> &Patterns) {
+  // Find a chain deep enough to be worth rewriting as a tree. The pattern is
+  // applied recursively in case we have an even longer chain.
+  if (!EnableAccReassociation)
+    return false;
+
+  unsigned Opc = Root.getOpcode();
+  if (!isAccumulationOpcode(Opc))
+    return false;
+
+  // Verify that this is the end of the chain.
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  if (!MRI.hasOneNonDBGUser(Root.getOperand(0).getReg()))
+    return false;
+
+  auto User = MRI.use_instr_begin(Root.getOperand(0).getReg());
+  if (User->getOpcode() == Opc)
+    return false;
+
+  // Walk up the chain of definitions and collect the reduction chain.
+  SmallVector<Register, 32> Chain;
+  getAccumulatorChain(&Root, MBB, MRI, Chain);
+
+  // Reject chains which are too short to be worth modifying.
+  if (Chain.size() < MinAccumulatorDepth)
+    return false;
+
+  // Check whether the basic block containing this instruction has any other
+  // accumulation chains; if so, don't apply the optimization.
+  SmallSet<Register, 32> ReductionChain(Chain.begin(), Chain.end());
+  for (const auto &I : MBB) {
+    if (I.getOpcode() == Opc &&
+        !ReductionChain.contains(I.getOperand(0).getReg()))
+      return false;
+  }
+
+  typedef AArch64MachineCombinerPattern MCP;
+  Patterns.push_back(MCP::ACC_CHAIN);
+  return true;
+}
+
 /// Floating-Point Support
 
 /// Find instructions that can be turned into madd.
@@ -7061,6 +7186,7 @@ AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
+  case AArch64MachineCombinerPattern::ACC_CHAIN:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7078,6 +7204,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
   // Integer patterns
   if (getMaddPatterns(Root, Patterns))
     return true;
+  if (getAccumulatorReassociationPatterns(Root, Patterns))
+    return true;
   // Floating point patterns
   if (getFMULPatterns(Root, Patterns))
     return true;
@@ -7436,6 +7564,72 @@ genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
   DelInstrs.push_back(&Root);
 }
 
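+// Returns the vector ADD opcode used to sum two partial accumulators of the
+// element type produced by \p AccumulatorOpCode.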
+static unsigned getReduceOpCodeForAccumulator(unsigned AccumulatorOpCode) {
+  switch (AccumulatorOpCode) {
+  case AArch64::UABALB_ZZZ_D:
+    return AArch64::ADD_ZZZ_D;
+  case AArch64::UABALB_ZZZ_H:
+    return AArch64::ADD_ZZZ_H;
+  case AArch64::UABALB_ZZZ_S:
+    return AArch64::ADD_ZZZ_S;
+  case AArch64::UABALT_ZZZ_D:
+    return AArch64::ADD_ZZZ_D;
+  case AArch64::UABALT_ZZZ_H:
+    return AArch64::ADD_ZZZ_H;
+  case AArch64::UABALT_ZZZ_S:
+    return AArch64::ADD_ZZZ_S;
+  case AArch64::UABALv16i8_v8i16:
+    return AArch64::ADDv8i16;
+  case AArch64::UABALv2i32_v2i64:
+    return AArch64::ADDv2i64;
+  case AArch64::UABALv4i16_v4i32:
+    return AArch64::ADDv4i32;
+  case AArch64::UABALv4i32_v2i64:
+    return AArch64::ADDv2i64;
+  case AArch64::UABALv8i16_v4i32:
+    return AArch64::ADDv4i32;
+  case AArch64::UABALv8i8_v8i16:
+    return AArch64::ADDv8i16;
+  default:
+    llvm_unreachable("Unknown accumulator opcode");
+  }
+}
+
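+// Each call performs a single level of pairwise reduction; the caller invokes
+// it in a loop until only one register remains.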
+// Reduce branches of the accumulator tree by adding them together.
+static void reduceAccumulatorTree(SmallVectorImpl<Register> &RegistersToReduce,
+                                  SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                  MachineFunction &MF, MachineInstr &Root,
+                                  MachineRegisterInfo &MRI,
+                                  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                                  Register ResultReg) {
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  SmallVector<Register, 8> NewRegs;
+  for (unsigned i = 1; i < RegistersToReduce.size(); i += 2) {
+    auto RHS = RegistersToReduce[i - 1];
+    auto LHS = RegistersToReduce[i];
+    Register Dest;
+    // If we are reducing 2 registers, reuse the original result register.
+    if (RegistersToReduce.size() == 2)
+      Dest = ResultReg;
+    // Otherwise, create a new virtual register to hold the partial sum.
+    else {
+      auto NewVR = MRI.createVirtualRegister(
+          MRI.getRegClass(Root.getOperand(0).getReg()));
+      Dest = NewVR;
+      NewRegs.push_back(Dest);
+      InstrIdxForVirtReg.insert(std::make_pair(Dest, InsInstrs.size()));
+    }
+
+    // Create the new add instruction.
+    MachineInstrBuilder MIB =
+        BuildMI(MF, MIMetadata(Root),
+                TII->get(getReduceOpCodeForAccumulator(Root.getOpcode())), Dest)
+            .addReg(RHS, getKillRegState(true))
+            .addReg(LHS, getKillRegState(true));
+    // Copy any flags needed from the original instruction.
+    MIB->setFlags(Root.getFlags());
+    InsInstrs.push_back(MIB);
+  }
+
+  // If the number of registers to reduce is odd, add the remaining register
+  // to the vector of registers to reduce.
+  if (RegistersToReduce.size() % 2 != 0)
+    NewRegs.push_back(RegistersToReduce.back());
+
+  RegistersToReduce = NewRegs;
+}
+
 /// When getMachineCombinerPatterns() finds potential patterns,
 /// this function generates the instructions that could replace the
 /// original code sequence
@@ -7671,7 +7865,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
     break;
   }
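+  // Rewrite a serial accumulation chain as several shorter, independent
+  // chains, then sum the partial results pairwise.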
+  case AArch64MachineCombinerPattern::ACC_CHAIN: {
+    SmallVector<Register, 32> ChainRegs;
+    getAccumulatorChain(&Root, MBB, MRI, ChainRegs);
+
+    unsigned Depth = ChainRegs.size();
+    assert(MaxAccumulatorWidth > 1 &&
+           "Max accumulator width set to illegal value");
+    unsigned MaxWidth = std::min<unsigned>(Log2_32(Depth), MaxAccumulatorWidth);
+
+    // Walk down the chain and rewrite it as a tree.
+    for (auto IndexedReg : llvm::enumerate(llvm::reverse(ChainRegs))) {
+      // No need to rewrite the first node; it is already the head of a chain.
+      if (IndexedReg.index() == 0)
+        continue;
+
+      MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value());
+      MachineInstrBuilder MIB;
+      Register AccReg;
+      if (IndexedReg.index() < MaxWidth) {
+        // The first MaxWidth nodes start new sub-chains: rewrite them using
+        // the opcode of the instruction at the head of the chain.
+        AccReg = Instr->getOperand(0).getReg();
+        MIB = BuildMI(MF, MIMetadata(*Instr),
+                      TII->get(MRI.getUniqueVRegDef(ChainRegs.back())->getOpcode()),
+                      AccReg)
+                  .addReg(Instr->getOperand(2).getReg(),
+                          getKillRegState(Instr->getOperand(2).isKill()))
+                  .addReg(Instr->getOperand(3).getReg(),
+                          getKillRegState(Instr->getOperand(3).isKill()));
+      } else {
+        // For the remaining nodes, use the output register of one of the
+        // newly inserted instructions as operand 1.
+        AccReg = Instr->getOperand(0).getReg() == Root.getOperand(0).getReg()
+                     ? MRI.createVirtualRegister(
+                           MRI.getRegClass(Root.getOperand(0).getReg()))
+                     : Instr->getOperand(0).getReg();
+        assert(IndexedReg.index() >= MaxWidth);
+        auto AccumulatorInput =
+            ChainRegs[Depth - (IndexedReg.index() - MaxWidth) - 1];
+        MIB = BuildMI(MF, MIMetadata(*Instr), TII->get(Instr->getOpcode()),
+                      AccReg)
+                  .addReg(AccumulatorInput, getKillRegState(true))
+                  .addReg(Instr->getOperand(2).getReg(),
+                          getKillRegState(Instr->getOperand(2).isKill()))
+                  .addReg(Instr->getOperand(3).getReg(),
+                          getKillRegState(Instr->getOperand(3).isKill()));
+      }
+
+      MIB->setFlags(Instr->getFlags());
+      InstrIdxForVirtReg.insert(std::make_pair(AccReg, InsInstrs.size()));
+      InsInstrs.push_back(MIB);
+      DelInstrs.push_back(Instr);
+    }
+
+    // Collect the roots of the sub-chains, then sum them pairwise.
+    SmallVector<Register, 8> RegistersToReduce;
+    for (unsigned i = InsInstrs.size() - MaxWidth; i < InsInstrs.size(); ++i) {
+      auto Reg = InsInstrs[i]->getOperand(0).getReg();
+      RegistersToReduce.push_back(Reg);
+    }
 
+    while (RegistersToReduce.size() > 1)
+      reduceAccumulatorTree(RegistersToReduce, InsInstrs, MF, Root, MRI,
+                            InstrIdxForVirtReg, Root.getOperand(0).getReg());
+
+    // Return rather than break: flags were copied onto each new instruction
+    // above, and Root has already been added to DelInstrs as part of the
+    // chain.
+    return;
+  }
   case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
     Opc = AArch64::MLAv8i8;
     RC = &AArch64::FPR64RegClass;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 9a0034223ab9ba..76acdc22d1c589 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,8 @@ enum AArch64MachineCombinerPattern : unsigned {
   FMULv8i16_indexed_OP2,
 
   FNMADD,
+
+  ACC_CHAIN
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
new file mode 100644
index 00000000000000..a5cc1c5af13eb2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
@@ -0,0 +1,105 @@
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -aarch64-acc-max-width=2 %s -o - | FileCheck %s --check-prefix=NARROW-TREE
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -aarch64-acc-min-depth=100 %s -o - | FileCheck %s --check-prefix=NO-TREE
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -aarch64-acc-reassoc=false %s -o - | FileCheck %s --check-prefix=NO-TREE
+
+# A chain of UABAL instructions that can be reassociated for better ILP.
+# Before the optimization, we accumulate in a single long chain.
+# CHECK-LABEL:   uabal_accumulation
+# CHECK:          [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[START2:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[START3:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# CHECK:          [[B1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START2]]
+# CHECK:          [[C1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START3]]
+# CHECK:          [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# CHECK:          [[B2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B1]]
+# CHECK:          [[C2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[C1]]
+# CHECK:          [[PARTIAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[A2]], killed [[B2]]
+# CHECK:          [[TOTAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[PARTIAL_SUM]], killed [[C2]]
+# CHECK:          [[END:%.*]]:fpr32 = ADDVv4i32v killed [[TOTAL_SUM]]
+
+# NARROW-TREE:    [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NARROW-TREE:    [[START2:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NARROW-TREE:    [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# NARROW-TREE:    [[B1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START2]]
+# NARROW-TREE:    [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# NARROW-TREE:    [[B2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B1]]
+# NARROW-TREE:    [[A3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A2]]
+# NARROW-TREE:    [[B3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B2]]
+# NARROW-TREE:    [[A4:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A3]]
+# NARROW-TREE:    [[PARTIAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[B3]], killed [[A4]]
+# NARROW-TREE:    [[END:%.*]]:fpr32 = ADDVv4i32v killed [[PARTIAL_SUM]]
+
+# NO-TREE:        [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NO-TREE:        [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# NO-TREE:        [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# NO-TREE:        [[A3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A2]]
+# NO-TREE:        [[A4:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A3]]
+# NO-TREE:        [[A5:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A4]]
+# NO-TREE:        [[A6:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A5]]
+# NO-TREE:        [[A7:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A6]]
+# NO-TREE:        [[A8:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A7]]
+# NO-TREE:        [[END:%.*]]:fpr32 = ADDVv4i32v killed [[A8]]
+
+---
+name:            uabal_accumulation
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3
+  
+    %3:gpr64 = COPY $x3
+    %2:gpr64common = COPY $x2
+    %1:gpr64 = COPY $x1
+    %0:gpr64common = COPY $x0
+    %4:fpr64 = LDRDui %0, 0 :: (load (s64))
+    %5:fpr64 = LDRDui %2, 0 :: (load (s64))
+    %6:gpr64common = ADDXrr %0, %1
+    %7:gpr64common = ADDXrr %2, %3
+    %8:fpr64 = LDRDui %6, 0 :: (load (s64))
+    %9:fpr64 = LDRDui %7, 0 :: (load (s64))
+    %10:fpr128 = UABDLv4i16_v4i32 killed %8, killed %9
+    %11:fpr128 = UABALv4i16_v4i32 killed %10, killed %4, killed %5
+    %12:gpr64common = ADDXrr %6, %1
+    %13:gpr64common = ADDXrr %7, %3
+    %14:fpr64 = LDRDui %12, 0 :: (load (s64))
+    %15:fpr64 = LDRDui %13, 0 :: (load (s64))
+    %16:fpr128 = UABALv4i16_v4i32 killed %11, killed %14, killed %15
+    %17:gpr64common = ADDXrr %12, %1
+    %18:gpr64common = ADDXrr %13, %3
+    %19:fpr64 = LDRDui %17, 0 :: (load (s64))
+    %20:fpr64 = LDRDui %18, 0 :: (load (s64))
+    %21:fpr128 = UABALv4i16_v4i32 killed %16, killed %19, killed %20
+    %22:gpr64common = ADDXrr %17, %1
+    %23:gpr64common = ADDXrr %18, %3
+    %24:fpr64 = LDRDui %22, 0 :: (load (s64))
+    %25:fpr64 = LDRDui %23, 0 :: (load (s64))
+    %26:fpr128 = UABALv4i16_v4i32 killed %21, killed %24, killed %25
+    %27:gpr64common = ADDXrr %22, %1
+    %28:gpr64common = ADDXrr %23, %3
+    %29:fpr64 = LDRDui %27, 0 :: (load (s64))
+    %30:fpr64 = LDRDui %28, 0 :: (load (s64))
+    %31:fpr128 = UABALv4i16_v4i32 killed %26, killed %29, killed %30
+    %32:gpr64common = ADDXrr %27, %1
+    %33:gpr64common = ADDXrr %28, %3
+    %34:fpr64 = LDRDui %32, 0 :: (load (s64))
+    %35:fpr64 = LDRDui %33, 0 :: (load (s64))
+    %36:fpr128 = UABALv4i16_v4i32 killed %31, killed %34, killed %35
+    %37:gpr64common = ADDXrr %32, %1
+    %38:gpr64common = ADDXrr %33, %3
+    %39:fpr64 = LDRDui %37, 0 :: (load (s64))
+    %40:fpr64 = LDRDui %38, 0 :: (load (s64))
+    %41:fpr128 = UABALv4i16_v4i32 killed %36, killed %39, killed %40
+    %42:gpr64common = ADDXrr %37, %1
+    %43:gpr64common = ADDXrr %38, %3
+    %44:fpr64 = LDRDui %42, 0 :: (load (s64))
+    %45:fpr64 = LDRDui %43, 0 :: (load (s64))
+    %46:fpr128 = UABALv4i16_v4i32 killed %41, killed %44, killed %45
+    %47:fpr32 = ADDVv4i32v killed %46
+    %48:fpr128 = IMPLICIT_DEF
+    %49:fpr128 = INSERT_SUBREG %48, killed %47, %subreg.ssub
+    %50:gpr32all = COPY %49.ssub
+    $w0 = COPY %50
+    RET_ReallyLR implicit $w0
+
+...

``````````
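
To make the shape of the rewrite concrete, here is a scalar C++ analogy of the transform (illustrative only; the pass operates on MachineInstrs, and the exact assignment of chain elements to sub-chains is simplified). It shows a depth-8 chain split into three sub-chains, matching the default `-aarch64-acc-max-width=3`, followed by the pairwise reduction that `reduceAccumulatorTree` performs:

```cpp
#include <cstdint>

// Before: one serial chain; every add waits on the previous one.
uint32_t chain(const uint32_t *v) {
  uint32_t acc = 0;
  for (int i = 0; i < 8; ++i)
    acc += v[i];
  return acc;
}

// After: three sub-chains accumulate independently, then the partial sums
// are combined pairwise.
uint32_t tree(const uint32_t *v) {
  uint32_t a = v[0] + v[3]; // sub-chain 0
  a += v[6];
  uint32_t b = v[1] + v[4]; // sub-chain 1
  b += v[7];
  uint32_t c = v[2] + v[5]; // sub-chain 2
  uint32_t ab = a + b;      // first pairwise reduction
  return ab + c;            // final reduction produces the original result
}
```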

</details>


https://github.com/llvm/llvm-project/pull/126060

