[llvm] [llvm] Fix crash when complex deinterleaving operates on an unrolled loop (PR #129735)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 06:43:43 PDT 2025
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/129735
>From 4ab5fed824c61efc361e5aff6f2c3917fb13f9dd Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 4 Mar 2025 16:02:01 +0000
Subject: [PATCH 1/5] [llvm] Fix crash when complex deinterleaving operates on
an unrolled loop
---
.../lib/CodeGen/ComplexDeinterleavingPass.cpp | 48 ++++++++++++++++---
1 file changed, 42 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 92053ed561901..fdf48fa0ff306 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -61,6 +61,7 @@
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -274,6 +275,13 @@ class ComplexDeinterleavingGraph {
/// `llvm.vector.reduce.fadd` when unroll factor isn't one.
MapVector<Instruction *, std::pair<PHINode *, Instruction *>> ReductionInfo;
+ /// In the case of reductions in unrolled loops, the %OutsideUser from
+ /// ReductionInfo is an add instruction that precedes the reduction.
+ /// UnrollInfo pairs values together if they are both operands of the same
+ /// add. This pairing info is then used to add the resulting complex
+ /// operations together before the final reduction.
+ MapVector<Value *, Value *> UnrollInfo;
+
/// In the process of detecting a reduction, we consider a pair of
/// %ReductionOP, which we refer to as real and imag (or vice versa), and
/// traverse the use-tree to detect complex operations. As this is a reduction
@@ -2253,8 +2261,31 @@ void ComplexDeinterleavingGraph::processReductionSingle(
auto *FinalReduction = ReductionInfo[Real].second;
Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
- auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
+ Value *Other;
+ bool EraseFinalReductionHere = false;
+ if (match(FinalReduction, m_c_Add(m_Specific(Real), m_Value(Other)))) {
+ UnrollInfo[Real] = OperationReplacement;
+ if (!UnrollInfo.contains(Other) || !FinalReduction->hasOneUser())
+ return;
+
+ auto *User = *FinalReduction->user_begin();
+ if (!match(User, m_Intrinsic<Intrinsic::vector_reduce_add>()))
+ return;
+
+ FinalReduction = cast<Instruction>(User);
+ Builder.SetInsertPoint(FinalReduction);
+ OperationReplacement =
+ Builder.CreateAdd(OperationReplacement, UnrollInfo[Other]);
+
+ UnrollInfo.erase(Real);
+ UnrollInfo.erase(Other);
+ EraseFinalReductionHere = true;
+ }
+
+ Value *AddReduce = Builder.CreateAddReduce(OperationReplacement);
FinalReduction->replaceAllUsesWith(AddReduce);
+ if (EraseFinalReductionHere)
+ FinalReduction->eraseFromParent();
}
void ComplexDeinterleavingGraph::processReductionOperation(
@@ -2299,7 +2330,7 @@ void ComplexDeinterleavingGraph::processReductionOperation(
}
void ComplexDeinterleavingGraph::replaceNodes() {
- SmallVector<Instruction *, 16> DeadInstrRoots;
+ SmallSetVector<Instruction *, 16> DeadInstrRoots;
for (auto *RootInstruction : OrderedRoots) {
// Check if this potential root went through check process and we can
// deinterleave it
@@ -2316,20 +2347,25 @@ void ComplexDeinterleavingGraph::replaceNodes() {
auto *RootImag = cast<Instruction>(RootNode->Imag);
ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.push_back(RootReal);
- DeadInstrRoots.push_back(RootImag);
+ DeadInstrRoots.insert(RootReal);
+ DeadInstrRoots.insert(RootImag);
} else if (RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionSingle) {
auto *RootInst = cast<Instruction>(RootNode->Real);
ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
+ DeadInstrRoots.insert(ReductionInfo[RootInst].second);
} else {
assert(R && "Unable to find replacement for RootInstruction");
- DeadInstrRoots.push_back(RootInstruction);
+ DeadInstrRoots.insert(RootInstruction);
RootInstruction->replaceAllUsesWith(R);
}
}
+ assert(UnrollInfo.empty() &&
+ "UnrollInfo should be empty after replacing all nodes");
+
+ for (auto *I : DeadInstrRoots)
+ dbgs() << "Dead Instr Root: " << *I << "\n";
for (auto *I : DeadInstrRoots)
RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}
>From 0305d3b0455b13b3e7ee09f3a9e5c79e7c557969 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 4 Mar 2025 16:53:06 +0000
Subject: [PATCH 2/5] Add test
---
.../complex-deinterleaving-unrolled-cdot.ll | 181 ++++++++++++++++++
1 file changed, 181 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
new file mode 100644
index 0000000000000..e680fd883a1ac
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a0, <vscale x 32 x i8> %b0, <vscale x 32 x i8> %a1, <vscale x 32 x i8> %b1) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2: [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT: [[TMP0:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT: [[TMP1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A0]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B0]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A0]], i64 16)
+; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B0]], i64 16)
+; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP0]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP0]], i64 4)
+; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i32 0)
+; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i32 0)
+; CHECK-SVE2-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP11]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-SVE2-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A1]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B1]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A1]], i64 16)
+; CHECK-SVE2-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B1]], i64 16)
+; CHECK-SVE2-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP1]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP1]], i64 4)
+; CHECK-SVE2-NEXT: [[TMP18:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP16]], <vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]], i32 0)
+; CHECK-SVE2-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP17]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], i32 0)
+; CHECK-SVE2-NEXT: [[TMP20:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP18]], i64 0)
+; CHECK-SVE2-NEXT: [[TMP21]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP19]], i64 4)
+; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2: [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT: [[TMP22:%.*]] = add <vscale x 8 x i32> [[TMP21]], [[TMP11]]
+; CHECK-SVE2-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP22]])
+; CHECK-SVE2-NEXT: ret i32 [[TMP23]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT: [[ENTRY:.*]]:
+; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE: [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-SVE-NEXT: [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-SVE-NEXT: [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-SVE-NEXT: [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-SVE-NEXT: [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-SVE-NEXT: [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-SVE-NEXT: [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-SVE-NEXT: [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE: [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-SVE-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-SVE-NEXT: ret i32 [[TMP6]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) {
+; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE: [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT: [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT: [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-NOSVE-NEXT: [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-NOSVE-NEXT: [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-NOSVE-NEXT: [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-NOSVE-NEXT: [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-NOSVE-NEXT: [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-NOSVE-NEXT: [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-NOSVE-NEXT: [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-NOSVE-NEXT: [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-NOSVE-NEXT: [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-NOSVE-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NOSVE-NEXT: ret i32 [[TMP6]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce33, %vector.body ]
+ %vec.phi25 = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce34, %vector.body ]
+ %a0.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a0)
+ %a0.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a0.deinterleaved, 0
+ %a0.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a0.deinterleaved, 1
+ %a1.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a1)
+ %a1.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a1.deinterleaved, 0
+ %a1.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a1.deinterleaved, 1
+ %a0.real.ext = sext <vscale x 16 x i8> %a0.real to <vscale x 16 x i32>
+ %a1.real.ext = sext <vscale x 16 x i8> %a1.real to <vscale x 16 x i32>
+ %b0.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b0)
+ %b0.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b0.deinterleaved, 0
+ %b0.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b0.deinterleaved, 1
+ %b1.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b1)
+ %b1.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b1.deinterleaved, 0
+ %b1.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b1.deinterleaved, 1
+ %b0.real.ext = sext <vscale x 16 x i8> %b0.real to <vscale x 16 x i32>
+ %b1.real.ext = sext <vscale x 16 x i8> %b1.real to <vscale x 16 x i32>
+ %18 = mul nsw <vscale x 16 x i32> %b0.real.ext, %a0.real.ext
+ %19 = mul nsw <vscale x 16 x i32> %b1.real.ext, %a1.real.ext
+ %a0.imag.ext = sext <vscale x 16 x i8> %a0.imag to <vscale x 16 x i32>
+ %a1.imag.ext = sext <vscale x 16 x i8> %a1.imag to <vscale x 16 x i32>
+ %b0.imag.ext = sext <vscale x 16 x i8> %b0.imag to <vscale x 16 x i32>
+ %b1.imag.ext = sext <vscale x 16 x i8> %b1.imag to <vscale x 16 x i32>
+ %24 = mul nsw <vscale x 16 x i32> %b0.imag.ext, %a0.imag.ext
+ %25 = mul nsw <vscale x 16 x i32> %b1.imag.ext, %a1.imag.ext
+ %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %18)
+ %partial.reduce32 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi25, <vscale x 16 x i32> %19)
+ %26 = sub nsw <vscale x 16 x i32> zeroinitializer, %24
+ %27 = sub nsw <vscale x 16 x i32> zeroinitializer, %25
+ %partial.reduce33 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %partial.reduce, <vscale x 16 x i32> %26)
+ %partial.reduce34 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %partial.reduce32, <vscale x 16 x i32> %27)
+ br i1 true, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %bin.rdx = add <vscale x 4 x i32> %partial.reduce34, %partial.reduce33
+ %29 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %bin.rdx)
+ ret i32 %29
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 16 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
>From 3ed40f48e144349e18982baaae709591b48a44bb Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 4 Mar 2025 17:08:49 +0000
Subject: [PATCH 3/5] Remove debug statement
---
llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index fdf48fa0ff306..e1e0961874b1b 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2364,8 +2364,6 @@ void ComplexDeinterleavingGraph::replaceNodes() {
assert(UnrollInfo.empty() &&
"UnrollInfo should be empty after replacing all nodes");
- for (auto *I : DeadInstrRoots)
- dbgs() << "Dead Instr Root: " << *I << "\n";
for (auto *I : DeadInstrRoots)
RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}
>From df6dbbad534bebdc7e4da0d252b1de3c4daa4065 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Fri, 14 Mar 2025 17:28:32 +0000
Subject: [PATCH 4/5] Prevent ReductionSingle operations from resulting in
non-scalar values
---
.../lib/CodeGen/ComplexDeinterleavingPass.cpp | 56 +++++-------------
.../complex-deinterleaving-unrolled-cdot.ll | 58 +++++++++++--------
2 files changed, 50 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index e1e0961874b1b..70d2e95d477a5 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -61,7 +61,6 @@
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -275,13 +274,6 @@ class ComplexDeinterleavingGraph {
/// `llvm.vector.reduce.fadd` when unroll factor isn't one.
MapVector<Instruction *, std::pair<PHINode *, Instruction *>> ReductionInfo;
- /// In the case of reductions in unrolled loops, the %OutsideUser from
- /// ReductionInfo is an add instruction that precedes the reduction.
- /// UnrollInfo pairs values together if they are both operands of the same
- /// add. This pairing info is then used to add the resulting complex
- /// operations together before the final reduction.
- MapVector<Value *, Value *> UnrollInfo;
-
/// In the process of detecting a reduction, we consider a pair of
/// %ReductionOP, which we refer to as real and imag (or vice versa), and
/// traverse the use-tree to detect complex operations. As this is a reduction
@@ -1749,6 +1741,16 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
LLVM_DEBUG(
dbgs() << "Identified single reduction starting from instruction: "
<< *Real << "/" << *ReductionInfo[Real].second << "\n");
+
+ // Reducing to a single vector is not supported, only permit reducing down
+ // to scalar values.
+ // Doing this here will leave the prior node in the graph,
+ // however with no uses the node will be unreachable by the replacement
+ // process. That along with the usage outside the graph should prevent the
+ // replacement process from kicking off at all for this graph.
+ if (ReductionInfo[Real].second->getType()->isVectorTy())
+ continue;
+
Processed[i] = true;
auto RootNode = prepareCompositeNode(
ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
@@ -2261,31 +2263,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
auto *FinalReduction = ReductionInfo[Real].second;
Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
- Value *Other;
- bool EraseFinalReductionHere = false;
- if (match(FinalReduction, m_c_Add(m_Specific(Real), m_Value(Other)))) {
- UnrollInfo[Real] = OperationReplacement;
- if (!UnrollInfo.contains(Other) || !FinalReduction->hasOneUser())
- return;
-
- auto *User = *FinalReduction->user_begin();
- if (!match(User, m_Intrinsic<Intrinsic::vector_reduce_add>()))
- return;
-
- FinalReduction = cast<Instruction>(User);
- Builder.SetInsertPoint(FinalReduction);
- OperationReplacement =
- Builder.CreateAdd(OperationReplacement, UnrollInfo[Other]);
-
- UnrollInfo.erase(Real);
- UnrollInfo.erase(Other);
- EraseFinalReductionHere = true;
- }
-
- Value *AddReduce = Builder.CreateAddReduce(OperationReplacement);
+ auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
FinalReduction->replaceAllUsesWith(AddReduce);
- if (EraseFinalReductionHere)
- FinalReduction->eraseFromParent();
}
void ComplexDeinterleavingGraph::processReductionOperation(
@@ -2330,7 +2309,7 @@ void ComplexDeinterleavingGraph::processReductionOperation(
}
void ComplexDeinterleavingGraph::replaceNodes() {
- SmallSetVector<Instruction *, 16> DeadInstrRoots;
+ SmallVector<Instruction *, 16> DeadInstrRoots;
for (auto *RootInstruction : OrderedRoots) {
// Check if this potential root went through check process and we can
// deinterleave it
@@ -2347,23 +2326,20 @@ void ComplexDeinterleavingGraph::replaceNodes() {
auto *RootImag = cast<Instruction>(RootNode->Imag);
ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.insert(RootReal);
- DeadInstrRoots.insert(RootImag);
+ DeadInstrRoots.push_back(RootReal);
+ DeadInstrRoots.push_back(RootImag);
} else if (RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionSingle) {
auto *RootInst = cast<Instruction>(RootNode->Real);
ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.insert(ReductionInfo[RootInst].second);
+ DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
} else {
assert(R && "Unable to find replacement for RootInstruction");
- DeadInstrRoots.insert(RootInstruction);
+ DeadInstrRoots.push_back(RootInstruction);
RootInstruction->replaceAllUsesWith(R);
}
}
- assert(UnrollInfo.empty() &&
- "UnrollInfo should be empty after replacing all nodes");
-
for (auto *I : DeadInstrRoots)
RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
index e680fd883a1ac..faefaf9bad7b1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
@@ -12,32 +12,42 @@ define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a0, <vscale x 32 x i8> %b0, <vscal
; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A0]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B0]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A0]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B0]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP0]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP0]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP11]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP9]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A1]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B1]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A1]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B1]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP1]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP1]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP18:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP16]], <vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP17]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP20:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP18]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP21]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP19]], i64 4)
+; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT: [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT: [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-SVE2-NEXT: [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-SVE2-NEXT: [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-SVE2-NEXT: [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-SVE2-NEXT: [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-SVE2-NEXT: [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-SVE2-NEXT: [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-SVE2-NEXT: [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-SVE2-NEXT: [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-SVE2-NEXT: [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP22:%.*]] = add <vscale x 8 x i32> [[TMP21]], [[TMP11]]
-; CHECK-SVE2-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP22]])
+; CHECK-SVE2-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-SVE2-NEXT: [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; CHECK-SVE2-NEXT: ret i32 [[TMP23]]
;
; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
>From 7a7ff048f0b331076478eb0ca9a56ebc869e9fef Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 19 Mar 2025 13:42:14 +0000
Subject: [PATCH 5/5] [NFC] Add TODO
---
llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 70d2e95d477a5..4cd378f9aa595 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1748,6 +1748,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
// however with no uses the node will be unreachable by the replacement
// process. That along with the usage outside the graph should prevent the
// replacement process from kicking off at all for this graph.
+ // TODO Add support for reducing to a single vector value
if (ReductionInfo[Real].second->getType()->isVectorTy())
continue;
More information about the llvm-commits
mailing list