[llvm] [AArch64][MachineCombiner] Reassociate long chains of accumulation instructions into a tree to increase ILP (PR #126060)

Jonathan Cohen via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 13 04:34:33 PST 2025


https://github.com/jcohen-apple updated https://github.com/llvm/llvm-project/pull/126060

From b56e716f1e03327a75b455b3177e1b73e0dc3842 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <jcohen22 at apple.com>
Date: Mon, 3 Feb 2025 17:36:04 +0200
Subject: [PATCH 1/2] Add unit test to demonstrate behavior before the
 optimization.

In this example we accumulate into a single register, so we cannot leverage
the multiple NEON ports in the processor by computing the output with
several accumulation registers.
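
As a rough scalar analogue (a hypothetical C++ sketch for illustration
only, not the actual NEON code), the accumulation forms one serial
dependency chain:

  #include <cstdlib>

  // Every iteration depends on the previous value of `acc`, so the
  // additions cannot overlap even though the absolute-difference
  // inputs are independent of each other.
  unsigned serial_accumulate(const int *a, const int *b, int n) {
    unsigned acc = 0;
    for (int i = 0; i < n; ++i)
      acc += std::abs(a[i] - b[i]); // loop-carried dependency on acc
    return acc;
  }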
---
 ...hine-combiner-reassociate-accumulators.mir | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir

diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
new file mode 100644
index 0000000000000..39ef529d33b47
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
@@ -0,0 +1,77 @@
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown %s -o - | FileCheck %s
+
+# A chain of UABAL instructions that can be reassociated for better ILP.
+# Before the optimization, we accumulate in a single long chain.
+# CHECK-LABEL:   uabal_accumulation
+# CHECK:          [[START:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[A:%.*]]:fpr128 = UABALv4i16_v4i32 [[START]]
+# CHECK:          [[B:%.*]]:fpr128 = UABALv4i16_v4i32 [[A]]
+# CHECK:          [[C:%.*]]:fpr128 = UABALv4i16_v4i32 [[B]]
+# CHECK:          [[D:%.*]]:fpr128 = UABALv4i16_v4i32 [[C]]
+# CHECK:          [[E:%.*]]:fpr128 = UABALv4i16_v4i32 [[D]]
+# CHECK:          [[F:%.*]]:fpr128 = UABALv4i16_v4i32 [[E]]
+# CHECK:          [[G:%.*]]:fpr128 = UABALv4i16_v4i32 [[F]]
+# CHECK:          [[H:%.*]]:fpr128 = UABALv4i16_v4i32 [[G]]
+# CHECK:          [[END:%.*]]:fpr32 = ADDVv4i32v killed [[H]]
+
+---
+name:            uabal_accumulation
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3
+  
+    %3:gpr64 = COPY $x3
+    %2:gpr64common = COPY $x2
+    %1:gpr64 = COPY $x1
+    %0:gpr64common = COPY $x0
+    %4:fpr64 = LDRDui %0, 0 :: (load (s64))
+    %5:fpr64 = LDRDui %2, 0 :: (load (s64))
+    %6:gpr64common = ADDXrr %0, %1
+    %7:gpr64common = ADDXrr %2, %3
+    %8:fpr64 = LDRDui %6, 0 :: (load (s64))
+    %9:fpr64 = LDRDui %7, 0 :: (load (s64))
+    %10:fpr128 = UABDLv4i16_v4i32 killed %8, killed %9
+    %11:fpr128 = UABALv4i16_v4i32 %10, killed %4, killed %5
+    %12:gpr64common = ADDXrr %6, %1
+    %13:gpr64common = ADDXrr %7, %3
+    %14:fpr64 = LDRDui %12, 0 :: (load (s64))
+    %15:fpr64 = LDRDui %13, 0 :: (load (s64))
+    %16:fpr128 = UABALv4i16_v4i32 %11, killed %14, killed %15
+    %17:gpr64common = ADDXrr %12, %1
+    %18:gpr64common = ADDXrr %13, %3
+    %19:fpr64 = LDRDui %17, 0 :: (load (s64))
+    %20:fpr64 = LDRDui %18, 0 :: (load (s64))
+    %21:fpr128 = UABALv4i16_v4i32 %16, killed %19, killed %20
+    %22:gpr64common = ADDXrr %17, %1
+    %23:gpr64common = ADDXrr %18, %3
+    %24:fpr64 = LDRDui %22, 0 :: (load (s64))
+    %25:fpr64 = LDRDui %23, 0 :: (load (s64))
+    %26:fpr128 = UABALv4i16_v4i32 %21, killed %24, killed %25
+    %27:gpr64common = ADDXrr %22, %1
+    %28:gpr64common = ADDXrr %23, %3
+    %29:fpr64 = LDRDui %27, 0 :: (load (s64))
+    %30:fpr64 = LDRDui %28, 0 :: (load (s64))
+    %31:fpr128 = UABALv4i16_v4i32 %26, killed %29, killed %30
+    %32:gpr64common = ADDXrr %27, %1
+    %33:gpr64common = ADDXrr %28, %3
+    %34:fpr64 = LDRDui %32, 0 :: (load (s64))
+    %35:fpr64 = LDRDui %33, 0 :: (load (s64))
+    %36:fpr128 = UABALv4i16_v4i32 %31, killed %34, killed %35
+    %37:gpr64common = ADDXrr %32, %1
+    %38:gpr64common = ADDXrr %33, %3
+    %39:fpr64 = LDRDui %37, 0 :: (load (s64))
+    %40:fpr64 = LDRDui %38, 0 :: (load (s64))
+    %41:fpr128 = UABALv4i16_v4i32 %36, killed %39, killed %40
+    %42:gpr64common = ADDXrr %37, %1
+    %43:gpr64common = ADDXrr %38, %3
+    %44:fpr64 = LDRDui %42, 0 :: (load (s64))
+    %45:fpr64 = LDRDui %43, 0 :: (load (s64))
+    %46:fpr128 = UABALv4i16_v4i32 %41, killed %44, killed %45
+    %47:fpr32 = ADDVv4i32v killed %46
+    %48:fpr128 = IMPLICIT_DEF
+    %49:fpr128 = INSERT_SUBREG %48, killed %47, %subreg.ssub
+    %50:gpr32all = COPY %49.ssub
+    $w0 = COPY %50
+    RET_ReallyLR implicit $w0
+
+...

From 0214b16ffecdeec34964d5c4926752926070c002 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <jcohen22 at apple.com>
Date: Thu, 13 Feb 2025 14:22:10 +0200
Subject: [PATCH 2/2] [MachineCombiner] Add a pass to reassociate chains of
 accumulation instructions into a tree

This pass is designed to increase ILP by accumulating into multiple registers
and reducing them at the end. It currently supports the NEON and SVE
absolute-difference accumulation instructions (UABA/UABAL, SABA/SABAL, and
UABALB/UABALT), but can easily be extended to support additional instructions.
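
As a rough scalar sketch of the rewrite (hypothetical C++ for illustration
only; a width of two is assumed here, while the pass derives the actual
width from acc-max-width and the chain length):

  #include <cstdlib>

  // Two independent accumulators replace the single serial chain, so the
  // two sub-chains can issue in parallel; a final add reduces them,
  // mirroring the UABDL/UABAL/ADD structure the pass emits.
  unsigned tree_accumulate(const int *a, const int *b, int n) {
    // Assumes n is even and n >= 4, for brevity.
    unsigned acc0 = std::abs(a[0] - b[0]); // chain start (UABDL analogue)
    unsigned acc1 = std::abs(a[1] - b[1]);
    for (int i = 2; i + 1 < n; i += 2) {
      acc0 += std::abs(a[i] - b[i]);         // accumulate (UABAL analogue)
      acc1 += std::abs(a[i + 1] - b[i + 1]);
    }
    return acc0 + acc1; // final reduction (ADD analogue)
  }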
---
 .../llvm/CodeGen/MachineCombinerPattern.h     |   1 +
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  34 +++
 llvm/lib/CodeGen/TargetInstrInfo.cpp          | 269 +++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 164 ++++++++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |  17 +-
 .../aarch64-reassociate-accumulators.ll       | 146 ++++++++++
 ...hine-combiner-reassociate-accumulators.mir |  64 +++--
 7 files changed, 663 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll

diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index 3428c4dde5c7f..25fce679323ee 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -32,6 +32,7 @@ enum MachineCombinerPattern : unsigned {
   REASSOC_AX_YB,
   REASSOC_XA_BY,
   REASSOC_XA_YB,
+  ACC_CHAIN,
 
   TARGET_PATTERN_START
 };
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index c8eba71c9bb0a..8eb1cfbc1e892 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -35,6 +35,7 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <optional>
 #include <utility>
 #include <vector>
 
@@ -1275,6 +1276,39 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
+  /// Find chains of accumulations that can be rewritten as a tree for increased
+  /// ILP.
+  bool getAccumulatorReassociationPatterns(
+      MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns) const;
+
+  /// Find the chain of accumulation instructions that ends at \P CurrentInstr
+  /// and collect the registers they define in \P Chain.
+  void getAccumulatorChain(MachineInstr *CurrentInstr,
+                           SmallVectorImpl<Register> &Chain) const;
+
+  /// Return true when \P Opcode is an instruction which performs
+  /// accumulation into one of its operand registers.
+  virtual bool isAccumulationOpcode(unsigned Opcode) const { return false; }
+
+  /// Returns an opcode which defines the accumulator used by \P Opcode.
+  virtual std::optional<unsigned>
+  getAccumulationStartOpcode(unsigned Opcode) const {
+    return std::nullopt;
+  }
+
+  /// Returns the opcode that should be used to reduce two registers
+  /// accumulated by \P AccumulatorOpCode into a single register.
+  virtual std::optional<unsigned>
+  getReduceOpcodeForAccumulator(unsigned int AccumulatorOpCode) const {
+    return std::nullopt;
+  }
+
+  /// Reduces branches of the accumulator tree into a single register.
+  void reduceAccumulatorTree(SmallVectorImpl<Register> &RegistersToReduce,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             MachineFunction &MF, MachineInstr &Root,
+                             MachineRegisterInfo &MRI,
+                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                             Register ResultReg) const;
+
   /// Return the inverse operation opcode if it exists for \P Opcode (e.g. add
   /// for sub and vice versa).
   virtual std::optional<unsigned> getInverseOpcode(unsigned Opcode) const {
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 7a905b65f26e5..a453a7ffe3624 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
@@ -42,6 +43,19 @@ static cl::opt<bool> DisableHazardRecognizer(
   "disable-sched-hazard", cl::Hidden, cl::init(false),
   cl::desc("Disable hazard detection during preRA scheduling"));
 
+static cl::opt<bool> EnableAccReassociation(
+    "acc-reassoc", cl::Hidden, cl::init(true),
+    cl::desc("Enable reassociation of accumulation chains"));
+
+static cl::opt<unsigned int>
+    MinAccumulatorDepth("acc-min-depth", cl::Hidden, cl::init(8),
+                        cl::desc("Minimum length of accumulator chains "
+                                 "required for the optimization to kick in"));
+
+static cl::opt<unsigned int> MaxAccumulatorWidth(
+    "acc-max-width", cl::Hidden, cl::init(3),
+    cl::desc("Maximum number of branches in the accumulator tree"));
+
 TargetInstrInfo::~TargetInstrInfo() = default;
 
 const TargetRegisterClass*
@@ -899,6 +913,152 @@ bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst,
          hasReassociableSibling(Inst, Commuted);
 }
 
+// Utility routine that checks if \param MO is defined by an
+// \param CombineOpc instruction in the basic block \param MBB
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+                       unsigned CombineOpc) {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineInstr *MI = nullptr;
+
+  if (MO.isReg() && MO.getReg().isVirtual())
+    MI = MRI.getUniqueVRegDef(MO.getReg());
+  // And it needs to be in the trace (otherwise, it won't have a depth).
+  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
+    return false;
+  // Must only used by the user we combine with.
+  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+    return false;
+
+  return true;
+}
+
+// A chain of accumulation instructions will be selected IFF:
+//    1. All the accumulation instructions in the chain have the same opcode,
+//       besides the first that has a slightly different opcode because it does
+//       not perform the accumulation, just defines it.
+//    2. All the instructions in the chain are combinable (have a single use
+//       which itself is part of the chain).
+//    3. The chain meets the required minimum length.
+void TargetInstrInfo::getAccumulatorChain(
+    MachineInstr *CurrentInstr, SmallVectorImpl<Register> &Chain) const {
+  // Walk up the chain of accumulation instructions and collect them in the
+  // vector.
+  MachineBasicBlock &MBB = *CurrentInstr->getParent();
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  unsigned AccumulatorOpcode = CurrentInstr->getOpcode();
+  std::optional<unsigned> ChainStartOpCode =
+      getAccumulationStartOpcode(AccumulatorOpcode);
+
+  if (!ChainStartOpCode.has_value())
+    return;
+
+  while (CurrentInstr &&
+         (canCombine(MBB, CurrentInstr->getOperand(1), AccumulatorOpcode) ||
+          canCombine(MBB, CurrentInstr->getOperand(1),
+                     ChainStartOpCode.value()))) {
+    Chain.push_back(CurrentInstr->getOperand(0).getReg());
+    CurrentInstr = MRI.getUniqueVRegDef(CurrentInstr->getOperand(1).getReg());
+  }
+
+  // Add the instruction at the top of the chain.
+  if (CurrentInstr->getOpcode() == ChainStartOpCode.value())
+    Chain.push_back(CurrentInstr->getOperand(0).getReg());
+}
+
+/// Find chains of accumulations that can be rewritten as a tree for increased
+/// ILP.
+bool TargetInstrInfo::getAccumulatorReassociationPatterns(
+    MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns) const {
+  if (!EnableAccReassociation)
+    return false;
+
+  unsigned Opc = Root.getOpcode();
+  if (!isAccumulationOpcode(Opc))
+    return false;
+
+  // Verify that this is the end of the chain.
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  if (!MRI.hasOneNonDBGUser(Root.getOperand(0).getReg()))
+    return false;
+
+  auto User = MRI.use_instr_begin(Root.getOperand(0).getReg());
+  if (User->getOpcode() == Opc)
+    return false;
+
+  // Walk up the use chain and collect the reduction chain.
+  SmallVector<Register, 32> Chain;
+  getAccumulatorChain(&Root, Chain);
+
+  // Reject chains which are too short to be worth modifying.
+  if (Chain.size() < MinAccumulatorDepth)
+    return false;
+
+  // Check if the MBB this instruction is a part of contains any other chains.
+  // If so, don't apply it.
+  SmallSet<Register, 32> ReductionChain(Chain.begin(), Chain.end());
+  for (const auto &I : MBB) {
+    if (I.getOpcode() == Opc &&
+        !ReductionChain.contains(I.getOperand(0).getReg()))
+      return false;
+  }
+
+  Patterns.push_back(MachineCombinerPattern::ACC_CHAIN);
+  return true;
+}
+
+// Reduce branches of the accumulator tree by adding them together.
+void TargetInstrInfo::reduceAccumulatorTree(
+    SmallVectorImpl<Register> &RegistersToReduce,
+    SmallVectorImpl<MachineInstr *> &InsInstrs, MachineFunction &MF,
+    MachineInstr &Root, MachineRegisterInfo &MRI,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+    Register ResultReg) const {
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  SmallVector<Register, 8> NewRegs;
+
+  // Get the opcode for the reduction instruction we will need to build.
+  // If for some reason it is not defined, early exit and don't apply this.
+  std::optional<unsigned> ReduceOpCode =
+      getReduceOpcodeForAccumulator(Root.getOpcode());
+
+  if (!ReduceOpCode.has_value())
+    return;
+
+  for (unsigned int i = 1; i < RegistersToReduce.size(); i += 2) {
+    auto RHS = RegistersToReduce[i - 1];
+    auto LHS = RegistersToReduce[i];
+    Register Dest;
+    // If we are reducing 2 registers, reuse the original result register.
+    if (RegistersToReduce.size() == 2)
+      Dest = ResultReg;
+    // Otherwise, create a new virtual register to hold the partial sum.
+    else {
+      auto NewVR = MRI.createVirtualRegister(
+          MRI.getRegClass(Root.getOperand(0).getReg()));
+      Dest = NewVR;
+      NewRegs.push_back(Dest);
+      InstrIdxForVirtReg.insert(std::make_pair(Dest, InsInstrs.size()));
+    }
+
+    // Create the new reduction instruction.
+    MachineInstrBuilder MIB =
+        BuildMI(MF, MIMetadata(Root), TII->get(ReduceOpCode.value()), Dest)
+            .addReg(RHS, getKillRegState(true))
+            .addReg(LHS, getKillRegState(true));
+    // Copy any flags needed from the original instruction.
+    MIB->setFlags(Root.getFlags());
+    InsInstrs.push_back(MIB);
+  }
+
+  // If the number of registers to reduce is odd, carry the remaining register
+  // over to the next round of reduction.
+  if (RegistersToReduce.size() % 2 != 0)
+    NewRegs.push_back(RegistersToReduce.back());
+
+  RegistersToReduce = NewRegs;
+}
+
 // The concept of the reassociation pass is that these operations can benefit
 // from this kind of transformation:
 //
@@ -938,6 +1098,8 @@ bool TargetInstrInfo::getMachineCombinerPatterns(
     }
     return true;
   }
+  if (getAccumulatorReassociationPatterns(Root, Patterns))
+    return true;
 
   return false;
 }
@@ -949,7 +1111,12 @@ bool TargetInstrInfo::isThroughputPattern(unsigned Pattern) const {
 
 CombinerObjective
 TargetInstrInfo::getCombinerObjective(unsigned Pattern) const {
-  return CombinerObjective::Default;
+  switch (Pattern) {
+  case MachineCombinerPattern::ACC_CHAIN:
+    return CombinerObjective::MustReduceDepth;
+  default:
+    return CombinerObjective::Default;
+  }
 }
 
 std::pair<unsigned, unsigned>
@@ -1252,19 +1419,99 @@ void TargetInstrInfo::genAlternativeCodeSequence(
     SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
   MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
 
-  // Select the previous instruction in the sequence based on the input pattern.
-  std::array<unsigned, 5> OperandIndices;
-  getReassociateOperandIndices(Root, Pattern, OperandIndices);
-  MachineInstr *Prev =
-      MRI.getUniqueVRegDef(Root.getOperand(OperandIndices[0]).getReg());
+  switch (Pattern) {
+  case MachineCombinerPattern::REASSOC_AX_BY:
+  case MachineCombinerPattern::REASSOC_AX_YB:
+  case MachineCombinerPattern::REASSOC_XA_BY:
+  case MachineCombinerPattern::REASSOC_XA_YB: {
+    // Select the previous instruction in the sequence based on the input
+    // pattern.
+    std::array<unsigned, 5> OperandIndices;
+    getReassociateOperandIndices(Root, Pattern, OperandIndices);
+    MachineInstr *Prev =
+        MRI.getUniqueVRegDef(Root.getOperand(OperandIndices[0]).getReg());
+
+    // Don't reassociate if Prev and Root are in different blocks.
+    if (Prev->getParent() != Root.getParent())
+      return;
 
-  // Don't reassociate if Prev and Root are in different blocks.
-  if (Prev->getParent() != Root.getParent())
-    return;
+    reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, OperandIndices,
+                   InstIdxForVirtReg);
+    break;
+  }
+  case MachineCombinerPattern::ACC_CHAIN: {
+    SmallVector<Register, 32> ChainRegs;
+    getAccumulatorChain(&Root, ChainRegs);
+    unsigned int Depth = ChainRegs.size();
+    assert(MaxAccumulatorWidth > 1 &&
+           "Max accumulator width set to illegal value");
+    unsigned int MaxWidth = Log2_32(Depth) < MaxAccumulatorWidth
+                                ? Log2_32(Depth)
+                                : MaxAccumulatorWidth;
+
+    // Walk down the chain and rewrite it as a tree.
+    for (auto IndexedReg : llvm::enumerate(llvm::reverse(ChainRegs))) {
+      // No need to rewrite the first node; it is already in the right form.
+      if (IndexedReg.index() == 0)
+        continue;
+
+      MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value());
+      MachineInstrBuilder MIB;
+      Register AccReg;
+      if (IndexedReg.index() < MaxWidth) {
+        // Now we need to create new instructions for the first row.
+        AccReg = Instr->getOperand(0).getReg();
+        MIB = BuildMI(
+                  MF, MIMetadata(*Instr),
+                  TII->get(MRI.getUniqueVRegDef(ChainRegs.back())->getOpcode()),
+                  AccReg)
+                  .addReg(Instr->getOperand(2).getReg(),
+                          getKillRegState(Instr->getOperand(2).isKill()))
+                  .addReg(Instr->getOperand(3).getReg(),
+                          getKillRegState(Instr->getOperand(3).isKill()));
+      } else {
+        // For the remaining cases, we need to use an output register of one
+        // of the newly inserted instructions as operand 1.
+        AccReg = Instr->getOperand(0).getReg() == Root.getOperand(0).getReg()
+                     ? MRI.createVirtualRegister(
+                           MRI.getRegClass(Root.getOperand(0).getReg()))
+                     : Instr->getOperand(0).getReg();
+        assert(IndexedReg.index() >= MaxWidth);
+        auto AccumulatorInput =
+            ChainRegs[Depth - (IndexedReg.index() - MaxWidth) - 1];
+        MIB = BuildMI(MF, MIMetadata(*Instr), TII->get(Instr->getOpcode()),
+                      AccReg)
+                  .addReg(AccumulatorInput, getKillRegState(true))
+                  .addReg(Instr->getOperand(2).getReg(),
+                          getKillRegState(Instr->getOperand(2).isKill()))
+                  .addReg(Instr->getOperand(3).getReg(),
+                          getKillRegState(Instr->getOperand(3).isKill()));
+      }
 
-  reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, OperandIndices,
-                 InstIdxForVirtReg);
+      MIB->setFlags(Instr->getFlags());
+      InstIdxForVirtReg.insert(std::make_pair(AccReg, InsInstrs.size()));
+      InsInstrs.push_back(MIB);
+      DelInstrs.push_back(Instr);
+    }
+
+    SmallVector<Register, 8> RegistersToReduce;
+    for (unsigned i = (InsInstrs.size() - MaxWidth); i < InsInstrs.size();
+         ++i) {
+      auto Reg = InsInstrs[i]->getOperand(0).getReg();
+      RegistersToReduce.push_back(Reg);
+    }
+
+    while (RegistersToReduce.size() > 1)
+      reduceAccumulatorTree(RegistersToReduce, InsInstrs, MF, Root, MRI,
+                            InstIdxForVirtReg, Root.getOperand(0).getReg());
+
+    break;
+  }
+  }
 }
 
 MachineTraceStrategy TargetInstrInfo::getMachineCombinerTraceStrategy() const {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 0f2b969fba35c..2beb6e5a3bea5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6674,6 +6674,115 @@ static bool getMaddPatterns(MachineInstr &Root,
   }
   return Found;
 }
+
+bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    break;
+  case AArch64::UABALB_ZZZ_D:
+  case AArch64::UABALB_ZZZ_H:
+  case AArch64::UABALB_ZZZ_S:
+  case AArch64::UABALT_ZZZ_D:
+  case AArch64::UABALT_ZZZ_H:
+  case AArch64::UABALT_ZZZ_S:
+  case AArch64::UABALv16i8_v8i16:
+  case AArch64::UABALv2i32_v2i64:
+  case AArch64::UABALv4i16_v4i32:
+  case AArch64::UABALv4i32_v2i64:
+  case AArch64::UABALv8i16_v4i32:
+  case AArch64::UABALv8i8_v8i16:
+  case AArch64::UABAv16i8:
+  case AArch64::UABAv2i32:
+  case AArch64::UABAv4i16:
+  case AArch64::UABAv4i32:
+  case AArch64::UABAv8i16:
+  case AArch64::UABAv8i8:
+  case AArch64::SABALv16i8_v8i16:
+  case AArch64::SABALv2i32_v2i64:
+  case AArch64::SABALv4i16_v4i32:
+  case AArch64::SABALv4i32_v2i64:
+  case AArch64::SABALv8i16_v4i32:
+  case AArch64::SABALv8i8_v8i16:
+  case AArch64::SABAv16i8:
+  case AArch64::SABAv2i32:
+  case AArch64::SABAv4i16:
+  case AArch64::SABAv4i32:
+  case AArch64::SABAv8i16:
+  case AArch64::SABAv8i8:
+    return true;
+  }
+
+  return false;
+}
+
+std::optional<unsigned> AArch64InstrInfo::getAccumulationStartOpcode(
+    unsigned AccumulationOpcode) const {
+  switch (AccumulationOpcode) {
+  default:
+    llvm_unreachable("Unknown accumulator opcode");
+  case AArch64::UABALB_ZZZ_D:
+    return AArch64::UABDLB_ZZZ_D;
+  case AArch64::UABALB_ZZZ_H:
+    return AArch64::UABDLB_ZZZ_H;
+  case AArch64::UABALB_ZZZ_S:
+    return AArch64::UABDLB_ZZZ_S;
+  case AArch64::UABALT_ZZZ_D:
+    return AArch64::UABDLT_ZZZ_D;
+  case AArch64::UABALT_ZZZ_H:
+    return AArch64::UABDLT_ZZZ_H;
+  case AArch64::UABALT_ZZZ_S:
+    return AArch64::UABDLT_ZZZ_S;
+  case AArch64::UABALv16i8_v8i16:
+    return AArch64::UABDLv16i8_v8i16;
+  case AArch64::UABALv2i32_v2i64:
+    return AArch64::UABDLv2i32_v2i64;
+  case AArch64::UABALv4i16_v4i32:
+    return AArch64::UABDLv4i16_v4i32;
+  case AArch64::UABALv4i32_v2i64:
+    return AArch64::UABDLv4i32_v2i64;
+  case AArch64::UABALv8i16_v4i32:
+    return AArch64::UABDLv8i16_v4i32;
+  case AArch64::UABALv8i8_v8i16:
+    return AArch64::UABDLv8i8_v8i16;
+  case AArch64::UABAv16i8:
+    return AArch64::UABDv16i8;
+  case AArch64::UABAv2i32:
+    return AArch64::UABDv2i32;
+  case AArch64::UABAv4i16:
+    return AArch64::UABDv4i16;
+  case AArch64::UABAv4i32:
+    return AArch64::UABDv4i32;
+  case AArch64::UABAv8i16:
+    return AArch64::UABDv8i16;
+  case AArch64::UABAv8i8:
+    return AArch64::UABDv8i8;
+  case AArch64::SABALv16i8_v8i16:
+    return AArch64::SABDLv16i8_v8i16;
+  case AArch64::SABALv2i32_v2i64:
+    return AArch64::SABDLv2i32_v2i64;
+  case AArch64::SABALv4i16_v4i32:
+    return AArch64::SABDLv4i16_v4i32;
+  case AArch64::SABALv4i32_v2i64:
+    return AArch64::SABDLv4i32_v2i64;
+  case AArch64::SABALv8i16_v4i32:
+    return AArch64::SABDLv8i16_v4i32;
+  case AArch64::SABALv8i8_v8i16:
+    return AArch64::SABDLv8i8_v8i16;
+  case AArch64::SABAv16i8:
+    return AArch64::SABDv16i8;
+  case AArch64::SABAv2i32:
+    return AArch64::SABDv2i32;
+  case AArch64::SABAv4i16:
+    return AArch64::SABDv4i16;
+  case AArch64::SABAv4i32:
+    return AArch64::SABDv4i32;
+  case AArch64::SABAv8i16:
+    return AArch64::SABDv8i16;
+  case AArch64::SABAv8i8:
+    return AArch64::SABDv8i8;
+  }
+}
+
 /// Floating-Point Support
 
 /// Find instructions that can be turned into madd.
@@ -7436,6 +7545,60 @@ genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
   DelInstrs.push_back(&Root);
 }
 
+std::optional<unsigned> AArch64InstrInfo::getReduceOpcodeForAccumulator(
+    unsigned int AccumulatorOpCode) const {
+  switch (AccumulatorOpCode) {
+  case AArch64::UABALB_ZZZ_D:
+    return AArch64::ADD_ZZZ_D;
+  case AArch64::UABALB_ZZZ_H:
+    return AArch64::ADD_ZZZ_H;
+  case AArch64::UABALB_ZZZ_S:
+    return AArch64::ADD_ZZZ_S;
+  case AArch64::UABALT_ZZZ_D:
+    return AArch64::ADD_ZZZ_D;
+  case AArch64::UABALT_ZZZ_H:
+    return AArch64::ADD_ZZZ_H;
+  case AArch64::UABALT_ZZZ_S:
+    return AArch64::ADD_ZZZ_S;
+  case AArch64::UABALv16i8_v8i16:
+  case AArch64::SABALv8i8_v8i16:
+  case AArch64::SABAv8i16:
+  case AArch64::UABAv8i16:
+    return AArch64::ADDv8i16;
+  case AArch64::SABALv2i32_v2i64:
+  case AArch64::UABALv2i32_v2i64:
+  case AArch64::SABALv4i32_v2i64:
+    return AArch64::ADDv2i64;
+  case AArch64::UABALv4i16_v4i32:
+  case AArch64::SABALv4i16_v4i32:
+  case AArch64::SABALv8i16_v4i32:
+  case AArch64::SABAv4i32:
+  case AArch64::UABAv4i32:
+    return AArch64::ADDv4i32;
+  case AArch64::UABALv4i32_v2i64:
+    return AArch64::ADDv2i64;
+  case AArch64::UABALv8i16_v4i32:
+    return AArch64::ADDv4i32;
+  case AArch64::UABALv8i8_v8i16:
+  case AArch64::SABALv16i8_v8i16:
+    return AArch64::ADDv8i16;
+  case AArch64::UABAv16i8:
+  case AArch64::SABAv16i8:
+    return AArch64::ADDv16i8;
+  case AArch64::UABAv4i16:
+  case AArch64::SABAv4i16:
+    return AArch64::ADDv4i16;
+  case AArch64::UABAv2i32:
+  case AArch64::SABAv2i32:
+    return AArch64::ADDv2i32;
+  case AArch64::UABAv8i8:
+  case AArch64::SABAv8i8:
+    return AArch64::ADDv8i8;
+  default:
+    llvm_unreachable("Unknown accumulator opcode");
+  }
+}
+
 /// When getMachineCombinerPatterns() finds potential patterns,
 /// this function generates the instructions that could replace the
 /// original code sequence
@@ -7671,7 +7834,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
     break;
   }
-
   case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
     Opc = AArch64::MLAv8i8;
     RC = &AArch64::FPR64RegClass;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 9a0034223ab9b..cdf8187ee0ae8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -448,8 +448,21 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   /// be checked.
   bool isAssociativeAndCommutative(const MachineInstr &Inst,
                                    bool Invert) const override;
-  /// When getMachineCombinerPatterns() finds patterns, this function generates
-  /// the instructions that could replace the original code sequence
+
+  /// Returns true if \P Opcode is an instruction which performs accumulation
+  /// into a destination register.
+  bool isAccumulationOpcode(unsigned Opcode) const override;
+
+  /// Returns an opcode which defines the accumulator used by \P Opcode.
+  std::optional<unsigned>
+  getAccumulationStartOpcode(unsigned Opcode) const override;
+
+  std::optional<unsigned>
+  getReduceOpcodeForAccumulator(unsigned int AccumulatorOpCode) const override;
+
+  /// When getMachineCombinerPatterns() finds patterns, this function
+  /// generates the instructions that could replace the original code
+  /// sequence
   void genAlternativeCodeSequence(
       MachineInstr &Root, unsigned Pattern,
       SmallVectorImpl<MachineInstr *> &InsInstrs,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll
new file mode 100644
index 0000000000000..bba879269ec78
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll
@@ -0,0 +1,146 @@
+; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=arm64e-apple-darwin -o - | FileCheck %s
+
+define i64 @uabal_i32_to_i64_accumulation(i32* %ptr1, i32* %ptr2) {
+entry:
+  br label %loop
+
+loop:                                           ; Loop header
+  %i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
+  %acc_phi = phi <2 x i64> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
+
+  ; Load values from ptr1 and ptr2
+  %ptr1_i = getelementptr i32, i32* %ptr1, i32 %i
+  %ptr2_i = getelementptr i32, i32* %ptr2, i32 %i
+  %a = load <2 x i32>, <2 x i32>* %ptr1_i, align 1
+  %b = load <2 x i32>, <2 x i32>* %ptr2_i, align 1
+
+  ; Perform the intrinsic operation
+  %vabd = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
+  %vmov = zext <2 x i32> %vabd to <2 x i64>
+  %acc_next = add <2 x i64> %vmov, %acc_phi
+
+  ; Increment loop counter and check the bound
+  %next_i = add i32 %i, 2                          ; Increment by 2 (processing 2 elements per iteration)
+  %cmp = icmp slt i32 %next_i, 16                  ; Loop for 8 iterations (2 elements per iteration → 8 * 2 = 16)
+  br i1 %cmp, label %loop, label %exit
+
+exit:                                             ; Exit block
+  %reduce = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %acc_next)
+  ret i64 %reduce
+}
+
+; CHECK: uabdl.2d v1
+; CHECK: uabdl.2d v0
+; CHECK: uabdl.2d v2
+; CHECK: uabal.2d v1
+; CHECK: uabal.2d v0
+; CHECK: uabal.2d v2
+; CHECK: uabal.2d v1
+; CHECK: uabal.2d v0
+; CHECK: add.2d v1, v2, v1
+; CHECK: add.2d v0, v1, v0
+; CHECK: addp.2d 
+
+define i16 @uabal2_accumulation(i8* %ptr1, i8* %ptr2) {
+entry:
+  br label %loop
+
+loop:                                           ; Loop header
+  %i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
+  %acc_phi_hi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_hi, %loop ]
+  %acc_phi_lo = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ]
+
+  ; Load values from ptr1 and ptr2
+  %ptr1_i = getelementptr i8, i8* %ptr1, i32 %i
+  %ptr2_i = getelementptr i8, i8* %ptr2, i32 %i
+  %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1
+  %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1
+
+  ; Perform the intrinsic operation
+  %a_hi = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %b_hi = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %a_lo = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %b_lo = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vabd_hi = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a_hi, <8 x i8> %b_hi)
+  %vabd_lo = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a_lo, <8 x i8> %b_lo)
+  %vmov_hi = zext <8 x i8> %vabd_hi to <8 x i16>
+  %vmov_lo = zext <8 x i8> %vabd_lo to <8 x i16>
+  %acc_next_hi = add <8 x i16> %vmov_hi, %acc_phi_hi
+  %acc_next_lo = add <8 x i16> %vmov_lo, %acc_phi_lo
+
+  ; Increment loop counter and check the bound
+  %next_i = add i32 %i, 16                          ; Increment by 16 (processing 16 elements per iteration)
+  %cmp = icmp slt i32 %next_i, 128                  ; Loop for 8 iterations (16 elements per iteration → 8 * 16 = 128)
+  br i1 %cmp, label %loop, label %exit
+
+exit:                                             ; Exit block
+  %hi_plus_lo = add <8 x i16> %acc_next_hi, %acc_next_lo
+  %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %hi_plus_lo)
+  ret i16 %reduce
+}
+
+; CHECK:  uabdl2.8h       v4
+; CHECK:  uabdl.8h        v1 
+; CHECK:  uabdl2.8h       v24
+; CHECK:  uabdl2.8h       v25
+; CHECK:  uabal2.8h       v4 
+; CHECK:  uabal2.8h       v24
+; CHECK:  uabal2.8h       v25
+; CHECK:  uabal2.8h       v4
+; CHECK:  uabal2.8h       v24
+; CHECK:  add.8h          v4, v25, v4
+; CHECK:  add.8h          v4, v4, v24
+; CHECK:  uabdl.8h        v0
+; CHECK:  uabdl.8h        v2
+; CHECK:  uabal.8h        v1
+; CHECK:  uabal.8h        v0
+; CHECK:  uabal.8h        v2
+; CHECK:  uabal.8h        v1
+; CHECK:  uabal.8h        v0
+; CHECK:  add.8h          v1, v2, v1
+; CHECK:  add.8h          v0, v1, v0
+; CHECK:  add.8h          v0, v4, v0
+; CHECK:  addv.8h         h0, v0 
+
+define i32 @uaba_accumulation(i32* %ptr1, i32* %ptr2) {
+entry:
+  br label %loop
+
+loop:                                           ; Loop header
+  %i = phi i32 [ 0, %entry ], [ %next_i, %loop ]
+  %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
+
+  ; Load values from ptr1 and ptr2
+  %ptr1_i = getelementptr i32, i32* %ptr1, i32 %i
+  %ptr2_i = getelementptr i32, i32* %ptr2, i32 %i
+  %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1
+  %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1
+
+  ; Perform the intrinsic operation
+  %vabd = tail call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b)
+  %acc_next = add <4 x i32> %acc_phi, %vabd
+
+  ; Increment loop counter and check the bound
+  %next_i = add i32 %i, 4                          ; Increment by 4 (processing 4 elements per iteration)
+  %cmp = icmp slt i32 %next_i, 32                  ; Loop for 8 iterations (4 elements per iteration → 8 * 4 = 32)
+  br i1 %cmp, label %loop, label %exit
+
+exit:                                             ; Exit block
+  %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc_next)
+  ret i32 %reduce
+}
+
+
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+; CHECK: uabd.4s v0
+; CHECK: uabd.4s v1
+; CHECK: uabd.4s v2
+; CHECK: uaba.4s v0
+; CHECK: uaba.4s v1
+; CHECK: uaba.4s v2
+; CHECK: uaba.4s v0
+; CHECK: uaba.4s v1
+; CHECK: add.4s v0, v2, v0
+; CHECK: add.4s v0, v0, v1
+; CHECK: addv.4s 
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
index 39ef529d33b47..6c0723952c4d5 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate-accumulators.mir
@@ -1,18 +1,46 @@
 # RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown %s -o - | FileCheck %s
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -acc-max-width=2 %s -o - | FileCheck %s --check-prefix=NARROW-TREE
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -acc-min-depth=100 %s -o - | FileCheck %s --check-prefix=NO-TREE
+# RUN: llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown -acc-reassoc=false %s -o - | FileCheck %s --check-prefix=NO-TREE
 
 # A chain of UABAL instructions that can be reassociated for better ILP.
 # Before the optimization, we accumulate in a single long chain.
 # CHECK-LABEL:   uabal_accumulation
-# CHECK:          [[START:%.*]]:fpr128 = UABDLv4i16_v4i32
-# CHECK:          [[A:%.*]]:fpr128 = UABALv4i16_v4i32 [[START]]
-# CHECK:          [[B:%.*]]:fpr128 = UABALv4i16_v4i32 [[A]]
-# CHECK:          [[C:%.*]]:fpr128 = UABALv4i16_v4i32 [[B]]
-# CHECK:          [[D:%.*]]:fpr128 = UABALv4i16_v4i32 [[C]]
-# CHECK:          [[E:%.*]]:fpr128 = UABALv4i16_v4i32 [[D]]
-# CHECK:          [[F:%.*]]:fpr128 = UABALv4i16_v4i32 [[E]]
-# CHECK:          [[G:%.*]]:fpr128 = UABALv4i16_v4i32 [[F]]
-# CHECK:          [[H:%.*]]:fpr128 = UABALv4i16_v4i32 [[G]]
-# CHECK:          [[END:%.*]]:fpr32 = ADDVv4i32v killed [[H]]
+# CHECK:          [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[START2:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[START3:%.*]]:fpr128 = UABDLv4i16_v4i32
+# CHECK:          [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]] 
+# CHECK:          [[B1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START2]] 
+# CHECK:          [[C1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START3]] 
+# CHECK:          [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]] 
+# CHECK:          [[B2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B1]] 
+# CHECK:          [[C2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[C1]] 
+# CHECK:          [[PARTIAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[A2]], killed [[B2]]
+# CHECK:          [[TOTAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[PARTIAL_SUM]], killed [[C2]]
+# CHECK:          [[END:%.*]]:fpr32 = ADDVv4i32v killed [[TOTAL_SUM]]
+
+# NARROW-TREE:    [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NARROW-TREE:    [[START2:%.*]]:fpr128 = UABDLv4i16_v4i32          
+# NARROW-TREE:    [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# NARROW-TREE:    [[B1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START2]]
+# NARROW-TREE:    [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]              
+# NARROW-TREE:    [[B2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B1]]
+# NARROW-TREE:    [[A3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A2]]
+# NARROW-TREE:    [[B3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[B2]]
+# NARROW-TREE:    [[A4:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A3]]
+# NARROW-TREE:    [[PARTIAL_SUM:%.*]]:fpr128 = ADDv4i32 killed [[B3]], killed [[A4]]
+# NARROW-TREE:    [[END:%.*]]:fpr32 = ADDVv4i32v killed [[PARTIAL_SUM]]  
+
+# NO-TREE:        [[START1:%.*]]:fpr128 = UABDLv4i16_v4i32
+# NO-TREE:        [[A1:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[START1]]
+# NO-TREE:        [[A2:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A1]]
+# NO-TREE:        [[A3:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A2]]
+# NO-TREE:        [[A4:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A3]]              
+# NO-TREE:        [[A5:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A4]]
+# NO-TREE:        [[A6:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A5]]
+# NO-TREE:        [[A7:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A6]]
+# NO-TREE:        [[A8:%.*]]:fpr128 = UABALv4i16_v4i32 killed [[A7]]
+# NO-TREE:        [[END:%.*]]:fpr32 = ADDVv4i32v killed [[A8]]
 
 ---
 name:            uabal_accumulation
@@ -31,42 +59,42 @@ body:             |
     %8:fpr64 = LDRDui %6, 0 :: (load (s64))
     %9:fpr64 = LDRDui %7, 0 :: (load (s64))
     %10:fpr128 = UABDLv4i16_v4i32 killed %8, killed %9
-    %11:fpr128 = UABALv4i16_v4i32 %10, killed %4, killed %5
+    %11:fpr128 = UABALv4i16_v4i32 killed %10, killed %4, killed %5
     %12:gpr64common = ADDXrr %6, %1
     %13:gpr64common = ADDXrr %7, %3
     %14:fpr64 = LDRDui %12, 0 :: (load (s64))
     %15:fpr64 = LDRDui %13, 0 :: (load (s64))
-    %16:fpr128 = UABALv4i16_v4i32 %11, killed %14, killed %15
+    %16:fpr128 = UABALv4i16_v4i32 killed %11, killed %14, killed %15
     %17:gpr64common = ADDXrr %12, %1
     %18:gpr64common = ADDXrr %13, %3
     %19:fpr64 = LDRDui %17, 0 :: (load (s64))
     %20:fpr64 = LDRDui %18, 0 :: (load (s64))
-    %21:fpr128 = UABALv4i16_v4i32 %16, killed %19, killed %20
+    %21:fpr128 = UABALv4i16_v4i32 killed %16, killed %19, killed %20
     %22:gpr64common = ADDXrr %17, %1
     %23:gpr64common = ADDXrr %18, %3
     %24:fpr64 = LDRDui %22, 0 :: (load (s64))
     %25:fpr64 = LDRDui %23, 0 :: (load (s64))
-    %26:fpr128 = UABALv4i16_v4i32 %21, killed %24, killed %25
+    %26:fpr128 = UABALv4i16_v4i32 killed %21, killed %24, killed %25
     %27:gpr64common = ADDXrr %22, %1
     %28:gpr64common = ADDXrr %23, %3
     %29:fpr64 = LDRDui %27, 0 :: (load (s64))
     %30:fpr64 = LDRDui %28, 0 :: (load (s64))
-    %31:fpr128 = UABALv4i16_v4i32 %26, killed %29, killed %30
+    %31:fpr128 = UABALv4i16_v4i32 killed %26, killed %29, killed %30
     %32:gpr64common = ADDXrr %27, %1
     %33:gpr64common = ADDXrr %28, %3
     %34:fpr64 = LDRDui %32, 0 :: (load (s64))
     %35:fpr64 = LDRDui %33, 0 :: (load (s64))
-    %36:fpr128 = UABALv4i16_v4i32 %31, killed %34, killed %35
+    %36:fpr128 = UABALv4i16_v4i32 killed %31, killed %34, killed %35
     %37:gpr64common = ADDXrr %32, %1
     %38:gpr64common = ADDXrr %33, %3
     %39:fpr64 = LDRDui %37, 0 :: (load (s64))
     %40:fpr64 = LDRDui %38, 0 :: (load (s64))
-    %41:fpr128 = UABALv4i16_v4i32 %36, killed %39, killed %40
+    %41:fpr128 = UABALv4i16_v4i32 killed %36, killed %39, killed %40
     %42:gpr64common = ADDXrr %37, %1
     %43:gpr64common = ADDXrr %38, %3
     %44:fpr64 = LDRDui %42, 0 :: (load (s64))
     %45:fpr64 = LDRDui %43, 0 :: (load (s64))
-    %46:fpr128 = UABALv4i16_v4i32 %41, killed %44, killed %45
+    %46:fpr128 = UABALv4i16_v4i32 killed %41, killed %44, killed %45
     %47:fpr32 = ADDVv4i32v killed %46
     %48:fpr128 = IMPLICIT_DEF
     %49:fpr128 = INSERT_SUBREG %48, killed %47, %subreg.ssub
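
The knobs introduced by this patch can be exercised directly on the MIR test,
mirroring the RUN lines above (a sketch; run from llvm/test/CodeGen/AArch64):

  # Narrower tree: accumulate into at most two registers.
  llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown \
      -acc-max-width=2 machine-combiner-reassociate-accumulators.mir -o -

  # Disable the rewrite entirely.
  llc -run-pass=machine-combiner -mtriple=arm64-unknown-unknown \
      -acc-reassoc=false machine-combiner-reassociate-accumulators.mir -o -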


