[llvm] [AArch64] Lower partial add reduction to udot or svdot (PR #101010)
Sam Tebbs via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 21 08:06:00 PDT 2024
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/101010
>From 0b9ce21c0019fea07188ffd142bd9cf580d09f35 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 10:46:16 +0100
Subject: [PATCH 01/19] [AArch64] Lower add partial reduction to udot
This patch introduces lowering of the partial add reduction intrinsic to
a udot or svdot instruction for AArch64.
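For reference, a minimal sketch of the transformation, taken from the tests
added below (sign extends produce the sdot form instead):

  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %mult)

now selects to a single SVE dot product:

  udot z2.s, z0.b, z1.b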
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 6 +
.../Target/AArch64/AArch64ISelLowering.cpp | 77 +++++++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
.../AArch64/AArch64TargetTransformInfo.cpp | 30 +++++
.../AArch64/AArch64TargetTransformInfo.h | 6 +
.../AArch64/partial-reduce-dot-product.ll | 109 ++++++++++++++++++
7 files changed, 236 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f4920a29..07d99aec47122a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -453,6 +453,12 @@ class TargetLoweringBase {
return true;
}
+ /// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic
+ /// should be expanded using generic code in SelectionDAGBuilder.
+ virtual bool shouldExpandPartialReductionIntrinsic(const CallInst *I) const {
+ return true;
+ }
+
/// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
/// using generic code in SelectionDAGBuilder.
virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1791f1b503379e..c70ab253c1aabc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7985,6 +7985,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::experimental_vector_partial_reduce_add: {
+
+ if (!TLI.shouldExpandPartialReductionIntrinsic(&I)) {
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
+
SDValue OpNode = getValue(I.getOperand(1));
EVT ReducedTy = EVT::getEVT(I.getType());
EVT FullTy = OpNode.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d86e52d49000ae..d1ee58668ecbd7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1971,6 +1971,57 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
return false;
}
+bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
+ const CallInst *CI) const {
+ const bool TargetLowers = false;
+ const bool GenericLowers = true;
+
+ auto *I = dyn_cast<IntrinsicInst>(CI);
+ if (!I)
+ return GenericLowers;
+
+ ScalableVectorType *RetTy = dyn_cast<ScalableVectorType>(I->getType());
+
+ if (!RetTy)
+ return GenericLowers;
+
+ ScalableVectorType *InputTy = nullptr;
+
+ auto RetScalarTy = RetTy->getScalarType();
+ if (RetScalarTy->isIntegerTy(64)) {
+ InputTy = ScalableVectorType::get(Type::getInt16Ty(I->getContext()), 8);
+ } else if (RetScalarTy->isIntegerTy(32)) {
+ InputTy = ScalableVectorType::get(Type::getInt8Ty(I->getContext()), 16);
+ }
+
+ if (!InputTy)
+ return GenericLowers;
+
+ Value *InputA;
+ Value *InputB;
+
+ auto Pattern = m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
+ m_Value(), m_OneUse(m_Mul(m_OneUse(m_ZExtOrSExt(m_Value(InputA))),
+ m_OneUse(m_ZExtOrSExt(m_Value(InputB))))));
+
+ if (!match(I, Pattern))
+ return GenericLowers;
+
+ auto Mul = cast<Instruction>(I->getOperand(1));
+
+ auto getOpcodeOfOperand = [&](unsigned Idx) {
+ return cast<Instruction>(Mul->getOperand(Idx))->getOpcode();
+ };
+
+ if (getOpcodeOfOperand(0) != getOpcodeOfOperand(1))
+ return GenericLowers;
+
+ if (InputA->getType() != InputTy || InputB->getType() != InputTy)
+ return GenericLowers;
+
+ return TargetLowers;
+}
+
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
if (!Subtarget->isSVEorStreamingSVEAvailable())
return true;
@@ -21237,6 +21288,32 @@ static SDValue performIntrinsicCombine(SDNode *N,
switch (IID) {
default:
break;
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ SDLoc DL(N);
+
+ auto NarrowOp = N->getOperand(1);
+ auto MulOp = N->getOperand(2);
+
+ auto ExtA = MulOp->getOperand(0);
+ auto ExtB = MulOp->getOperand(1);
+
+ unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
+
+ if (ExtA->getOpcode() == ISD::SIGN_EXTEND)
+ DotIntrinsicId = Intrinsic::aarch64_sve_sdot;
+ else if (ExtA->getOpcode() == ISD::ZERO_EXTEND)
+ DotIntrinsicId = Intrinsic::aarch64_sve_udot;
+
+ assert(DotIntrinsicId != Intrinsic::not_intrinsic &&
+ "Unexpected dot product case encountered.");
+
+ auto A = ExtA->getOperand(0);
+ auto B = ExtB->getOperand(0);
+
+ auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NarrowOp.getValueType(),
+ {IntrinsicId, NarrowOp, A, B});
+ }
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 81e15185f985d5..fc79d9766719bc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -991,6 +991,8 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
+ bool shouldExpandPartialReductionIntrinsic(const CallInst *I) const override;
+
bool shouldExpandCttzElements(EVT VT) const override;
/// If a change in streaming mode is required on entry to/return from a
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 45148449dfb821..792bd546019192 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3533,6 +3533,36 @@ AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
return Cost;
}
+bool AArch64TTIImpl::isPartialReductionSupported(
+ const Instruction *ReductionInstr, Type *InputType, unsigned ScaleFactor,
+ bool IsInputASignExtended, bool IsInputBSignExtended,
+ const Instruction *BinOp) const {
+ if (ReductionInstr->getOpcode() != Instruction::Add)
+ return false;
+
+ // Check that both extends are of the same type
+ if (IsInputASignExtended != IsInputBSignExtended)
+ return false;
+
+ if (!BinOp || BinOp->getOpcode() != Instruction::Mul)
+ return false;
+
+ // Dot product only supports a scale factor of 4
+ if (ScaleFactor != 4)
+ return false;
+
+ Type *ReductionType = ReductionInstr->getType();
+ if (ReductionType->isIntegerTy(32)) {
+ if (!InputType->isIntegerTy(8))
+ return false;
+ } else if (ReductionType->isIntegerTy(64)) {
+ if (!InputType->isIntegerTy(16))
+ return false;
+ }
+
+ return true;
+}
+
unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a9189fd53f40bb..592b452134e778 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -155,6 +155,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return VF.getKnownMinValue() * ST->getVScaleForTuning();
}
+ bool isPartialReductionSupported(const Instruction *ReductionInstr,
+ Type *InputType, unsigned ScaleFactor,
+ bool IsInputASignExtended,
+ bool IsInputBSignExtended,
+ const Instruction *BinOp = nullptr) const;
+
unsigned getMaxInterleaveFactor(ElementCount VF);
bool prefersVectorizedAddressing() const;
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
new file mode 100644
index 00000000000000..23b39387fb7a0c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -O3 %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define <vscale x 4 x i32> @dotp(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: dotp:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: udot z2.s, z0.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+ %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
+ %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %mult)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i64> @dotp_wide(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: dotp_wide:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: udot z2.d, z0.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+ %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
+ %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> zeroinitializer, <vscale x 8 x i64> %mult)
+ ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i32> @dotp_sext(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: dotp_sext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: sdot z2.s, z0.b, z1.b
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+ %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
+ %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %mult)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i64> @dotp_wide_sext(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: dotp_wide_sext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: sdot z2.d, z0.h, z1.h
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+ %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
+ %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> zeroinitializer, <vscale x 8 x i64> %mult)
+ ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i32> @not_dotp(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) #0 {
+; CHECK-LABEL: not_dotp:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: and z1.h, z1.h, #0xff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpkhi z2.s, z0.h
+; CHECK-NEXT: uunpkhi z3.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: mul z2.s, z2.s, z3.s
+; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
+ %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
+ %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> zeroinitializer, <vscale x 8 x i32> %mult)
+ ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i64> @not_dotp_wide(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) #0 {
+; CHECK-LABEL: not_dotp_wide:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: and z1.s, z1.s, #0xffff
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mul z2.d, z2.d, z3.d
+; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
+ %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
+ %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> zeroinitializer, <vscale x 4 x i64> %mult)
+ ret <vscale x 2 x i64> %partial.reduce
+}
+
+attributes #0 = { "target-features"="+sve2" }
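Note that the not_dotp cases above use inputs with only twice as many
elements as the result. Dot instructions accumulate groups of four source
elements into each result element (a scale factor of 4), so those
reductions keep the generic expansion.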
>From 7a5661702155198e6e4f9eed4d83bbc2cd0cee6e Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 18:08:52 +0100
Subject: [PATCH 02/19] Remove TargetLowers and GenericLowers
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d1ee58668ecbd7..b936e4cb4dccf3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1973,17 +1973,15 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
const CallInst *CI) const {
- const bool TargetLowers = false;
- const bool GenericLowers = true;
auto *I = dyn_cast<IntrinsicInst>(CI);
if (!I)
- return GenericLowers;
+ return true;
ScalableVectorType *RetTy = dyn_cast<ScalableVectorType>(I->getType());
if (!RetTy)
- return GenericLowers;
+ return true;
ScalableVectorType *InputTy = nullptr;
@@ -1995,7 +1993,7 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
}
if (!InputTy)
- return GenericLowers;
+ return true;
Value *InputA;
Value *InputB;
@@ -2005,7 +2003,7 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
m_OneUse(m_ZExtOrSExt(m_Value(InputB))))));
if (!match(I, Pattern))
- return GenericLowers;
+ return true;
auto Mul = cast<Instruction>(I->getOperand(1));
@@ -2014,12 +2012,12 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
};
if (getOpcodeOfOperand(0) != getOpcodeOfOperand(1))
- return GenericLowers;
+ return true;
if (InputA->getType() != InputTy || InputB->getType() != InputTy)
- return GenericLowers;
+ return true;
- return TargetLowers;
+ return false;
}
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
>From 7bddd3b71c858d21ea626eb7f453126ad1a1b65b Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 18:16:16 +0100
Subject: [PATCH 03/19] Assert that shouldExpandPartialReductionIntrinsic sees
an intrinsic
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b936e4cb4dccf3..fdcce7a9d124b1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1975,8 +1975,7 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
const CallInst *CI) const {
auto *I = dyn_cast<IntrinsicInst>(CI);
- if (!I)
- return true;
+ assert(I && "shouldExpandPartialReductionIntrinsic expects an intrinisc");
ScalableVectorType *RetTy = dyn_cast<ScalableVectorType>(I->getType());
>From 48798c1e4ec770f6a47c69e841c048a83bb9bff6 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 18:31:24 +0100
Subject: [PATCH 04/19] Allow non-scalable vector types
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fdcce7a9d124b1..28e13e2e2841cf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1977,18 +1977,17 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
auto *I = dyn_cast<IntrinsicInst>(CI);
assert(I && "shouldExpandPartialReductionIntrinsic expects an intrinisc");
- ScalableVectorType *RetTy = dyn_cast<ScalableVectorType>(I->getType());
-
+ VectorType *RetTy = dyn_cast<VectorType>(I->getType());
if (!RetTy)
return true;
- ScalableVectorType *InputTy = nullptr;
+ VectorType *InputTy = nullptr;
auto RetScalarTy = RetTy->getScalarType();
if (RetScalarTy->isIntegerTy(64)) {
- InputTy = ScalableVectorType::get(Type::getInt16Ty(I->getContext()), 8);
+ InputTy = VectorType::get(Type::getInt16Ty(I->getContext()), 8, RetTy->isScalableTy());
} else if (RetScalarTy->isIntegerTy(32)) {
- InputTy = ScalableVectorType::get(Type::getInt8Ty(I->getContext()), 16);
+ InputTy = VectorType::get(Type::getInt8Ty(I->getContext()), 16, RetTy->isScalableTy());
}
if (!InputTy)
>From 955c84e7aa1f811f2a78585d9dbf985672d3e21e Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 29 Jul 2024 19:10:08 +0100
Subject: [PATCH 05/19] Clean up type checking
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 28e13e2e2841cf..8cf997cf0a3f29 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1984,13 +1984,11 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
VectorType *InputTy = nullptr;
auto RetScalarTy = RetTy->getScalarType();
- if (RetScalarTy->isIntegerTy(64)) {
+ if (RetScalarTy->isIntegerTy(64))
InputTy = VectorType::get(Type::getInt16Ty(I->getContext()), 8, RetTy->isScalableTy());
- } else if (RetScalarTy->isIntegerTy(32)) {
+ else if (RetScalarTy->isIntegerTy(32))
InputTy = VectorType::get(Type::getInt8Ty(I->getContext()), 16, RetTy->isScalableTy());
- }
-
- if (!InputTy)
+ else
return true;
Value *InputA;
@@ -2004,7 +2002,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
return true;
auto Mul = cast<Instruction>(I->getOperand(1));
-
auto getOpcodeOfOperand = [&](unsigned Idx) {
return cast<Instruction>(Mul->getOperand(Idx))->getOpcode();
};
>From 7acadbc84967f045e803ce0d9e9008ab1554cdf8 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 1 Aug 2024 11:04:37 +0100
Subject: [PATCH 06/19] Restrict to scalable vector types and clean up type
checking
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
.../lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 11 +++--------
2 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8cf997cf0a3f29..510a92dc8d7345 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1978,7 +1978,7 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
assert(I && "shouldExpandPartialReductionIntrinsic expects an intrinisc");
VectorType *RetTy = dyn_cast<VectorType>(I->getType());
- if (!RetTy)
+ if (!RetTy || !RetTy->isScalableTy())
return true;
VectorType *InputTy = nullptr;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 792bd546019192..afa9acf5c8de3f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3552,15 +3552,10 @@ bool AArch64TTIImpl::isPartialReductionSupported(
return false;
Type *ReductionType = ReductionInstr->getType();
- if (ReductionType->isIntegerTy(32)) {
- if (!InputType->isIntegerTy(8))
- return false;
- } else if (ReductionType->isIntegerTy(64)) {
- if (!InputType->isIntegerTy(16))
- return false;
- }
- return true;
+ return ((ReductionType->isIntegerTy(32) && InputType->isIntegerTy(8)) ||
+ (ReductionType->isIntegerTy(64) && InputType->isIntegerTy(16))) &&
+ ReductionType->isScalableTy();
}
unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
>From 0213f5d0be6f27888b150e2f3b8af8d6ae64ccee Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 1 Aug 2024 11:36:50 +0100
Subject: [PATCH 07/19] Simplify instruction matching in
shouldExpandPartialReduction
---
.../Target/AArch64/AArch64ISelLowering.cpp | 56 +++++++++----------
1 file changed, 27 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 510a92dc8d7345..4fbe3231170716 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1981,38 +1981,36 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
if (!RetTy || !RetTy->isScalableTy())
return true;
- VectorType *InputTy = nullptr;
-
- auto RetScalarTy = RetTy->getScalarType();
- if (RetScalarTy->isIntegerTy(64))
- InputTy = VectorType::get(Type::getInt16Ty(I->getContext()), 8, RetTy->isScalableTy());
- else if (RetScalarTy->isIntegerTy(32))
- InputTy = VectorType::get(Type::getInt8Ty(I->getContext()), 16, RetTy->isScalableTy());
- else
- return true;
-
Value *InputA;
Value *InputB;
+ if (match(I, m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
+ m_Value(),
+ m_OneUse(m_Mul(m_OneUse(m_ZExtOrSExt(m_Value(InputA))),
+ m_OneUse(m_ZExtOrSExt(m_Value(InputB)))))))) {
+ VectorType *InputAType = dyn_cast<VectorType>(InputA->getType());
+ VectorType *InputBType = dyn_cast<VectorType>(InputB->getType());
+ if (!InputAType || !InputBType)
+ return true;
+ ElementCount ExpectedCount8 = ElementCount::get(8, RetTy->isScalableTy());
+ ElementCount ExpectedCount16 = ElementCount::get(16, RetTy->isScalableTy());
+ if ((RetTy->getScalarType()->isIntegerTy(64) &&
+ InputAType->getElementType()->isIntegerTy(16) &&
+ InputAType->getElementCount() == ExpectedCount8 &&
+ InputAType == InputBType) ||
+
+ (RetTy->getScalarType()->isIntegerTy(32) &&
+ InputAType->getElementType()->isIntegerTy(8) &&
+ InputAType->getElementCount() == ExpectedCount16 &&
+ InputAType == InputBType)) {
+ auto *Mul = cast<Instruction>(I->getOperand(1));
+ auto *Mul0 = cast<Instruction>(Mul->getOperand(0));
+ auto *Mul1 = cast<Instruction>(Mul->getOperand(1));
+ if (Mul0->getOpcode() == Mul1->getOpcode())
+ return false;
+ }
+ }
- auto Pattern = m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
- m_Value(), m_OneUse(m_Mul(m_OneUse(m_ZExtOrSExt(m_Value(InputA))),
- m_OneUse(m_ZExtOrSExt(m_Value(InputB))))));
-
- if (!match(I, Pattern))
- return true;
-
- auto Mul = cast<Instruction>(I->getOperand(1));
- auto getOpcodeOfOperand = [&](unsigned Idx) {
- return cast<Instruction>(Mul->getOperand(Idx))->getOpcode();
- };
-
- if (getOpcodeOfOperand(0) != getOpcodeOfOperand(1))
- return true;
-
- if (InputA->getType() != InputTy || InputB->getType() != InputTy)
- return true;
-
- return false;
+ return true;
}
bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
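As an illustration of a shape the match now explicitly rejects (a
hypothetical example, not one of the tests in this PR), mixing the two
extend kinds has no single dot-product equivalent, so it falls back to the
generic expansion:

  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul <vscale x 16 x i32> %a.wide, %b.wide
  ; Mul0/Mul1 opcodes differ (sext vs. zext), so the hook returns true.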
>From 82769ae7bbacda12114c5a389bab9ddd13900e46 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 9 Aug 2024 16:38:22 +0100
Subject: [PATCH 08/19] Add fallback in case the nodes aren't as we expect at
lowering time
---
.../Target/AArch64/AArch64ISelLowering.cpp | 67 ++++++++++++++++---
1 file changed, 59 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4fbe3231170716..2ddcb5686042da 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21282,28 +21282,79 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::experimental_vector_partial_reduce_add: {
SDLoc DL(N);
+ bool IsValidDotProduct = false;
+
auto NarrowOp = N->getOperand(1);
auto MulOp = N->getOperand(2);
+ if (MulOp->getOpcode() == ISD::MUL)
+ IsValidDotProduct = true;
auto ExtA = MulOp->getOperand(0);
auto ExtB = MulOp->getOperand(1);
+ bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
+ bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
+ if (ExtA->getOpcode() == ExtB->getOpcode() && (IsSExt || IsZExt))
+ IsValidDotProduct = true;
unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
- if (ExtA->getOpcode() == ISD::SIGN_EXTEND)
+ if (IsSExt && IsValidDotProduct)
DotIntrinsicId = Intrinsic::aarch64_sve_sdot;
- else if (ExtA->getOpcode() == ISD::ZERO_EXTEND)
+ else if (IsZExt && IsValidDotProduct)
DotIntrinsicId = Intrinsic::aarch64_sve_udot;
- assert(DotIntrinsicId != Intrinsic::not_intrinsic &&
+ assert((!IsValidDotProduct || DotIntrinsicId != Intrinsic::not_intrinsic) &&
"Unexpected dot product case encountered.");
- auto A = ExtA->getOperand(0);
- auto B = ExtB->getOperand(0);
+ if (IsValidDotProduct) {
+ auto A = ExtA->getOperand(0);
+ auto B = ExtB->getOperand(0);
+
+ auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NarrowOp.getValueType(),
+ {IntrinsicId, NarrowOp, A, B});
+ } else {
+ // If the node doesn't match a dot product, lower to a series of ADDs
+ // instead.
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT Type0 = Op0->getValueType(0);
+ EVT Type1 = Op1->getValueType(0);
+
+ // Canonicalise so that Op1 has the larger type
+ if (Type1.getVectorNumElements() > Type0.getVectorNumElements()) {
+ std::swap(Op0, Op1);
+ std::swap(Type0, Type1);
+ }
- auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NarrowOp.getValueType(),
- {IntrinsicId, NarrowOp, A, B});
+ auto Type0Elements = Type0.getVectorNumElements();
+ auto Type1Elements = Type1.getVectorNumElements();
+ auto Type0ElementSize =
+ Type0.getVectorElementType().getScalarSizeInBits();
+ auto Type1ElementSize =
+ Type1.getVectorElementType().getScalarSizeInBits();
+
+ // If the types are equal then a single ADD is fine
+ if (Type0 == Type1)
+ return DAG.getNode(ISD::ADD, DL, Type0, {Op0, Op1});
+
+ // Otherwise, we need to add each subvector together so that the output is
+ // the intrinsic's return type. For example, <4 x i32>
+ // partial.reduction(<4 x i32> a, <16 x i32> b) becomes a + b[0..3] +
+ // b[4..7] + b[8..11] + b[12..15]
+ SDValue Add = Op0;
+ for (unsigned i = 0; i < Type1Elements / Type0Elements; i++) {
+ SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Type0, Op1,
+ DAG.getConstant(i, DL, MVT::i64));
+
+ if (Type1ElementSize < Type0ElementSize)
+ Subvec = DAG.getNode(ISD::ANY_EXTEND, DL, Type0, Subvec);
+ else if (Type1ElementSize > Type0ElementSize)
+ Subvec = DAG.getNode(ISD::TRUNCATE, DL, Type0, Subvec);
+ Add = DAG.getNode(ISD::ADD, DL, Type0, {Add, Subvec});
+ }
+ return Add;
+ }
}
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
>From f8ee528af87e5fa2bd88538961addc1d3ea92afb Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 12 Aug 2024 11:02:28 +0100
Subject: [PATCH 09/19] Fix logic error with fallback case
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2ddcb5686042da..d39e0a512f9d6c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21282,19 +21282,19 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::experimental_vector_partial_reduce_add: {
SDLoc DL(N);
- bool IsValidDotProduct = false;
+ bool IsValidDotProduct = true;
auto NarrowOp = N->getOperand(1);
auto MulOp = N->getOperand(2);
- if (MulOp->getOpcode() == ISD::MUL)
- IsValidDotProduct = true;
+ if (MulOp->getOpcode() != ISD::MUL)
+ IsValidDotProduct = false;
auto ExtA = MulOp->getOperand(0);
auto ExtB = MulOp->getOperand(1);
bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
- if (ExtA->getOpcode() == ExtB->getOpcode() && (IsSExt || IsZExt))
- IsValidDotProduct = true;
+ if (ExtA->getOpcode() != ExtB->getOpcode() || (!IsSExt && !IsZExt))
+ IsValidDotProduct = false;
unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
@@ -21316,8 +21316,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
} else {
// If the node doesn't match a dot product, lower to a series of ADDs
// instead.
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
+ SDValue Op0 = N->getOperand(1);
+ SDValue Op1 = N->getOperand(2);
EVT Type0 = Op0->getValueType(0);
EVT Type1 = Op1->getValueType(0);
>From a3df2e98a02b6fa4e568288f1ac8607eec86aab7 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 13 Aug 2024 14:18:53 +0100
Subject: [PATCH 10/19] Pass IntrinsicInst to
shouldExpandPartialReductionIntrinsic
---
llvm/include/llvm/CodeGen/TargetLowering.h | 3 ++-
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 +----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 ++-
4 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 07d99aec47122a..b20c6be16f9e8a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -455,7 +455,8 @@ class TargetLoweringBase {
/// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic
/// should be expanded using generic code in SelectionDAGBuilder.
- virtual bool shouldExpandPartialReductionIntrinsic(const CallInst *I) const {
+ virtual bool
+ shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const {
return true;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c70ab253c1aabc..7211c00240bb27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7986,7 +7986,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
case Intrinsic::experimental_vector_partial_reduce_add: {
- if (!TLI.shouldExpandPartialReductionIntrinsic(&I)) {
+ if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
visitTargetIntrinsic(I, Intrinsic);
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d39e0a512f9d6c..7957451173c012 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1972,10 +1972,7 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
}
bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
- const CallInst *CI) const {
-
- auto *I = dyn_cast<IntrinsicInst>(CI);
- assert(I && "shouldExpandPartialReductionIntrinsic expects an intrinisc");
+ const IntrinsicInst *I) const {
VectorType *RetTy = dyn_cast<VectorType>(I->getType());
if (!RetTy || !RetTy->isScalableTy())
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index fc79d9766719bc..a870fb5f551209 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -991,7 +991,8 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
- bool shouldExpandPartialReductionIntrinsic(const CallInst *I) const override;
+ bool
+ shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
bool shouldExpandCttzElements(EVT VT) const override;
>From e47670e6897218917519b7648a31930787949b6e Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 13 Aug 2024 14:31:22 +0100
Subject: [PATCH 11/19] Remove one-use restriction
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7957451173c012..362faad4d925cb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1980,10 +1980,10 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
Value *InputA;
Value *InputB;
- if (match(I, m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
- m_Value(),
- m_OneUse(m_Mul(m_OneUse(m_ZExtOrSExt(m_Value(InputA))),
- m_OneUse(m_ZExtOrSExt(m_Value(InputB)))))))) {
+ if (match(I,
+ m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
+ m_Value(), m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(InputA)),
+ m_ZExtOrSExt(m_Value(InputB))))))) {
VectorType *InputAType = dyn_cast<VectorType>(InputA->getType());
VectorType *InputBType = dyn_cast<VectorType>(InputB->getType());
if (!InputAType || !InputBType)
>From ff5f96a8588cf9c9f359bcf092df2791cebc70cb Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 13 Aug 2024 14:32:10 +0100
Subject: [PATCH 12/19] Remove new line
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 362faad4d925cb..20417aab02c8e4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1994,7 +1994,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
InputAType->getElementType()->isIntegerTy(16) &&
InputAType->getElementCount() == ExpectedCount8 &&
InputAType == InputBType) ||
-
(RetTy->getScalarType()->isIntegerTy(32) &&
InputAType->getElementType()->isIntegerTy(8) &&
InputAType->getElementCount() == ExpectedCount16 &&
>From 3ff122c183431b472a85105500287e422c7861f3 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 13 Aug 2024 20:21:43 +0100
Subject: [PATCH 13/19] Remove extending/truncating for fallback case
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 --------
1 file changed, 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 20417aab02c8e4..4b7a9cab8a57e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21325,10 +21325,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
auto Type0Elements = Type0.getVectorNumElements();
auto Type1Elements = Type1.getVectorNumElements();
- auto Type0ElementSize =
- Type0.getVectorElementType().getScalarSizeInBits();
- auto Type1ElementSize =
- Type1.getVectorElementType().getScalarSizeInBits();
// If the types are equal then a single ADD is fine
if (Type0 == Type1)
@@ -21343,10 +21339,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Type0, Op1,
DAG.getConstant(i, DL, MVT::i64));
- if (Type1ElementSize < Type0ElementSize)
- Subvec = DAG.getNode(ISD::ANY_EXTEND, DL, Type0, Subvec);
- else if (Type1ElementSize > Type0ElementSize)
- Subvec = DAG.getNode(ISD::TRUNCATE, DL, Type0, Subvec);
Add = DAG.getNode(ISD::ADD, DL, Type0, {Add, Subvec});
}
return Add;
>From 81d5b0c7036af75e6a452e14901eca1adc66608a Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 13 Aug 2024 20:27:04 +0100
Subject: [PATCH 14/19] Clean up test target
---
llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
index 23b39387fb7a0c..0facb2049135f6 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
@@ -1,8 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -O3 %s -o - | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-none-unknown-elf"
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s
define <vscale x 4 x i32> @dotp(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: dotp:
>From 127bfc4def806e5ed5260d77aac0cbd226965606 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 14 Aug 2024 09:42:32 +0100
Subject: [PATCH 15/19] Remove #0 attribute from test
---
.../CodeGen/AArch64/partial-reduce-dot-product.ll | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
index 0facb2049135f6..16ef219a93c9bf 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s
-define <vscale x 4 x i32> @dotp(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+define <vscale x 4 x i32> @dotp(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: dotp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z2.s, #0 // =0x0
@@ -16,7 +16,7 @@ entry:
ret <vscale x 4 x i32> %partial.reduce
}
-define <vscale x 2 x i64> @dotp_wide(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+define <vscale x 2 x i64> @dotp_wide(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: dotp_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z2.d, #0 // =0x0
@@ -31,7 +31,7 @@ entry:
ret <vscale x 2 x i64> %partial.reduce
}
-define <vscale x 4 x i32> @dotp_sext(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+define <vscale x 4 x i32> @dotp_sext(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: dotp_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z2.s, #0 // =0x0
@@ -46,7 +46,7 @@ entry:
ret <vscale x 4 x i32> %partial.reduce
}
-define <vscale x 2 x i64> @dotp_wide_sext(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+define <vscale x 2 x i64> @dotp_wide_sext(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: dotp_wide_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z2.d, #0 // =0x0
@@ -61,7 +61,7 @@ entry:
ret <vscale x 2 x i64> %partial.reduce
}
-define <vscale x 4 x i32> @not_dotp(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) #0 {
+define <vscale x 4 x i32> @not_dotp(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_dotp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z0.h, z0.h, #0xff
@@ -82,7 +82,7 @@ entry:
ret <vscale x 4 x i32> %partial.reduce
}
-define <vscale x 2 x i64> @not_dotp_wide(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) #0 {
+define <vscale x 2 x i64> @not_dotp_wide(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-LABEL: not_dotp_wide:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z0.s, z0.s, #0xffff
@@ -102,5 +102,3 @@ entry:
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> zeroinitializer, <vscale x 4 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
-
-attributes #0 = { "target-features"="+sve2" }
>From 9f791a1f4b79e2bc58c9efc3a277c9bf3e9292ad Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 14 Aug 2024 10:55:12 +0100
Subject: [PATCH 16/19] Allow i8 to i64 dot products
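There is no direct i8 -> i64 dot instruction, so this case is lowered as a
32-bit dot product over a zero accumulator, whose result is then extended
and added to the real accumulator. Roughly, in IR terms (a sketch using
the SVE dot intrinsic; the actual transformation is done on SelectionDAG
nodes, and the sext/sdot form is analogous):

  %dot = call <vscale x 4 x i32> @llvm.aarch64.sve.udot.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %wide = zext <vscale x 4 x i32> %dot to <vscale x 4 x i64>
  %res = add <vscale x 4 x i64> %acc, %wide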
---
.../Target/AArch64/AArch64ISelLowering.cpp | 34 ++++++++-
.../AArch64/partial-reduce-dot-product.ll | 72 +++++++++++++++++++
2 files changed, 103 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4b7a9cab8a57e5..d084fc3f969f34 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1990,11 +1990,15 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
return true;
ElementCount ExpectedCount8 = ElementCount::get(8, RetTy->isScalableTy());
ElementCount ExpectedCount16 = ElementCount::get(16, RetTy->isScalableTy());
+ // Check that the input type is 4 times smaller than the output type. If the
+ // output type is 64 bit then we can accept 8 bit inputs if we do a 32 bit
+ // dot product and add a zext/sext.
if ((RetTy->getScalarType()->isIntegerTy(64) &&
InputAType->getElementType()->isIntegerTy(16) &&
InputAType->getElementCount() == ExpectedCount8 &&
InputAType == InputBType) ||
- (RetTy->getScalarType()->isIntegerTy(32) &&
+ ((RetTy->getScalarType()->isIntegerTy(32) ||
+ RetTy->getScalarType()->isIntegerTy(64)) &&
InputAType->getElementType()->isIntegerTy(8) &&
InputAType->getElementCount() == ExpectedCount16 &&
InputAType == InputBType)) {
@@ -21305,10 +21309,34 @@ static SDValue performIntrinsicCombine(SDNode *N,
if (IsValidDotProduct) {
auto A = ExtA->getOperand(0);
auto B = ExtB->getOperand(0);
+ EVT Type = NarrowOp.getValueType();
+
+ // 8 bit input to 64 bit output can be done by doing a 32 bit dot product
+ // and extending the output
+ bool Extend = A->getValueType(0).getScalarSizeInBits() == 8 &&
+ Type.getScalarSizeInBits() == 64;
+ SDValue Accumulator = NarrowOp;
+ if (Extend) {
+ Type = Type.changeVectorElementType(
+ EVT::getIntegerVT(*DAG.getContext(), 32));
+ // The accumulator is of the wider type so we insert a 0 accumulator and
+ // add the proper one after extending
+ Accumulator = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i32,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NarrowOp.getValueType(),
- {IntrinsicId, NarrowOp, A, B});
+ auto DotProduct = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Type,
+ {IntrinsicId, Accumulator, A, B});
+ if (Extend) {
+ auto Extended =
+ DAG.getNode(IsZExt ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL,
+ NarrowOp.getValueType(), {DotProduct});
+ auto AccAdd = DAG.getNode(ISD::ADD, DL, NarrowOp.getValueType(),
+ {NarrowOp, Extended});
+ DotProduct = AccAdd;
+ }
+ return DotProduct;
} else {
// If the node doesn't match a dot product, lower to a series of ADDs
// instead.
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
index 16ef219a93c9bf..c1cf9026d693ce 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
@@ -61,6 +61,78 @@ entry:
ret <vscale x 2 x i64> %partial.reduce
}
+define <vscale x 4 x i64> @dotp_8to64(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: dotp_8to64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: udot z2.s, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z2.s
+; CHECK-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+ %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
+ %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
+ <vscale x 4 x i64> zeroinitializer, <vscale x 16 x i64> %mult)
+ ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @dotp_sext_8to64(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: dotp_sext_8to64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: sdot z2.s, z0.b, z1.b
+; CHECK-NEXT: sunpklo z0.d, z2.s
+; CHECK-NEXT: sunpkhi z1.d, z2.s
+; CHECK-NEXT: ret
+entry:
+ %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+ %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
+ %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
+ <vscale x 4 x i64> zeroinitializer, <vscale x 16 x i64> %mult)
+ ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @dotp_8to64_accumulator(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i64> %acc) {
+; CHECK-LABEL: dotp_8to64_accumulator:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEXT: udot z4.s, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.d, z4.s
+; CHECK-NEXT: uunpkhi z1.d, z4.s
+; CHECK-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEXT: add z1.d, z3.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+ %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
+ %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
+ <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
+ ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @dotp_sext_8to64_accumulator(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i64> %acc) {
+; CHECK-LABEL: dotp_sext_8to64_accumulator:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEXT: sdot z4.s, z0.b, z1.b
+; CHECK-NEXT: sunpklo z0.d, z4.s
+; CHECK-NEXT: sunpkhi z1.d, z4.s
+; CHECK-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEXT: add z1.d, z3.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+ %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
+ %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
+ <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
+ ret <vscale x 4 x i64> %partial.reduce
+}
+
define <vscale x 4 x i32> @not_dotp(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_dotp:
; CHECK: // %bb.0: // %entry
>From 0f2dfedb57c64c49cdbbc71187e94338ee5b49d8 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 20 Aug 2024 13:53:11 +0100
Subject: [PATCH 17/19] Remove isPartialReductionSupported
---
.../AArch64/AArch64TargetTransformInfo.cpp | 25 -------------------
.../AArch64/AArch64TargetTransformInfo.h | 6 -----
2 files changed, 31 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index afa9acf5c8de3f..45148449dfb821 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3533,31 +3533,6 @@ AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
return Cost;
}
-bool AArch64TTIImpl::isPartialReductionSupported(
- const Instruction *ReductionInstr, Type *InputType, unsigned ScaleFactor,
- bool IsInputASignExtended, bool IsInputBSignExtended,
- const Instruction *BinOp) const {
- if (ReductionInstr->getOpcode() != Instruction::Add)
- return false;
-
- // Check that both extends are of the same type
- if (IsInputASignExtended != IsInputBSignExtended)
- return false;
-
- if (!BinOp || BinOp->getOpcode() != Instruction::Mul)
- return false;
-
- // Dot product only supports a scale factor of 4
- if (ScaleFactor != 4)
- return false;
-
- Type *ReductionType = ReductionInstr->getType();
-
- return ((ReductionType->isIntegerTy(32) && InputType->isIntegerTy(8)) ||
- (ReductionType->isIntegerTy(64) && InputType->isIntegerTy(16))) &&
- ReductionType->isScalableTy();
-}
-
unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 592b452134e778..a9189fd53f40bb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -155,12 +155,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return VF.getKnownMinValue() * ST->getVScaleForTuning();
}
- bool isPartialReductionSupported(const Instruction *ReductionInstr,
- Type *InputType, unsigned ScaleFactor,
- bool IsInputASignExtended,
- bool IsInputBSignExtended,
- const Instruction *BinOp = nullptr) const;
-
unsigned getMaxInterleaveFactor(ElementCount VF);
bool prefersVectorizedAddressing() const;
>From 2cf3fa5390f57cca1b263cabffa3165bc989d96a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 21 Aug 2024 15:02:17 +0100
Subject: [PATCH 18/19] Share expansion code in SelectionDAG
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 4 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 30 +++
.../SelectionDAG/SelectionDAGBuilder.cpp | 29 +--
.../Target/AArch64/AArch64ISelLowering.cpp | 217 ++++++++----------
4 files changed, 130 insertions(+), 150 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 24eab7b4086752..66bf5550a8a8cc 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1590,6 +1590,10 @@ class SelectionDAG {
/// the target's desired shift amount type.
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
+ /// Expand a partial reduction intrinsic call.
+ /// Op1 and Op2 are its operands and ReducedTy is the intrinsic's return type.
+ SDValue expandPartialReductionIntrinsic(EVT ReducedTy, SDValue Op1, SDValue Op2, SDLoc DL);
+
/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
SDValue expandVAArg(SDNode *Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbc44a47164059..c947e44e7d1a9a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -74,6 +74,7 @@
#include <cassert>
#include <cstdint>
#include <cstdlib>
+#include <deque>
#include <limits>
#include <optional>
#include <set>
@@ -2412,6 +2413,35 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
}
+SDValue SelectionDAG::expandPartialReductionIntrinsic(EVT ReducedTy, SDValue Op1, SDValue Op2, SDLoc DL) {
+ EVT FullTy = Op2.getValueType();
+
+ unsigned Stride = ReducedTy.getVectorMinNumElements();
+ unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
+
+ // Collect all of the subvectors
+ std::deque<SDValue> Subvectors = {Op1};
+ for (unsigned I = 0; I < ScaleFactor; I++) {
+ auto SourceIndex = getVectorIdxConstant(I * Stride, DL);
+ Subvectors.push_back(getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy,
+ {Op2, SourceIndex}));
+ }
+
+ // Flatten the subvector tree
+ while (Subvectors.size() > 1) {
+ Subvectors.push_back(getNode(ISD::ADD, DL, ReducedTy,
+ {Subvectors[0], Subvectors[1]}));
+ Subvectors.pop_front();
+ Subvectors.pop_front();
+ }
+
+ assert(Subvectors.size() == 1 &&
+ "There should only be one subvector after tree flattening");
+
+ return Subvectors[0];
+
+}
+
SDValue SelectionDAG::expandVAArg(SDNode *Node) {
SDLoc dl(Node);
const TargetLowering &TLI = getTargetLoweringInfo();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7211c00240bb27..209f444ca5e341 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7991,34 +7991,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
- SDValue OpNode = getValue(I.getOperand(1));
- EVT ReducedTy = EVT::getEVT(I.getType());
- EVT FullTy = OpNode.getValueType();
-
- unsigned Stride = ReducedTy.getVectorMinNumElements();
- unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
-
- // Collect all of the subvectors
- std::deque<SDValue> Subvectors;
- Subvectors.push_back(getValue(I.getOperand(0)));
- for (unsigned i = 0; i < ScaleFactor; i++) {
- auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
- Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
- {OpNode, SourceIndex}));
- }
-
- // Flatten the subvector tree
- while (Subvectors.size() > 1) {
- Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
- {Subvectors[0], Subvectors[1]}));
- Subvectors.pop_front();
- Subvectors.pop_front();
- }
-
- assert(Subvectors.size() == 1 &&
- "There should only be one subvector after tree flattening");
-
- setValue(&I, Subvectors[0]);
+ setValue(&I, DAG.expandPartialReductionIntrinsic(EVT::getEVT(I.getType()), getValue(I.getOperand(0)), getValue(I.getOperand(1)), sdl));
return;
}
case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d084fc3f969f34..13dfa0adf19b53 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1978,37 +1978,12 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
if (!RetTy || !RetTy->isScalableTy())
return true;
- Value *InputA;
- Value *InputB;
- if (match(I,
- m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
- m_Value(), m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(InputA)),
- m_ZExtOrSExt(m_Value(InputB))))))) {
- VectorType *InputAType = dyn_cast<VectorType>(InputA->getType());
- VectorType *InputBType = dyn_cast<VectorType>(InputB->getType());
- if (!InputAType || !InputBType)
- return true;
- ElementCount ExpectedCount8 = ElementCount::get(8, RetTy->isScalableTy());
- ElementCount ExpectedCount16 = ElementCount::get(16, RetTy->isScalableTy());
- // Check that the input type is 4 times smaller than the output type. If the
- // output type is 64 bit then we can accept 8 bit inputs if we do a 32 bit
- // dot product and add a zext/sext.
- if ((RetTy->getScalarType()->isIntegerTy(64) &&
- InputAType->getElementType()->isIntegerTy(16) &&
- InputAType->getElementCount() == ExpectedCount8 &&
- InputAType == InputBType) ||
- ((RetTy->getScalarType()->isIntegerTy(32) ||
- RetTy->getScalarType()->isIntegerTy(64)) &&
- InputAType->getElementType()->isIntegerTy(8) &&
- InputAType->getElementCount() == ExpectedCount16 &&
- InputAType == InputBType)) {
- auto *Mul = cast<Instruction>(I->getOperand(1));
- auto *Mul0 = cast<Instruction>(Mul->getOperand(0));
- auto *Mul1 = cast<Instruction>(Mul->getOperand(1));
- if (Mul0->getOpcode() == Mul1->getOpcode())
- return false;
- }
- }
+ if (RetTy->getScalarType()->isIntegerTy(32) && RetTy->getElementCount() == ElementCount::get(4, RetTy->isScalableTy()))
+ return false;
+ if (RetTy->getScalarType()->isIntegerTy(64) && RetTy->getElementCount() == ElementCount::get(2, RetTy->isScalableTy()))
+ return false;
+ if (RetTy->getScalarType()->isIntegerTy(64) && RetTy->getElementCount() == ElementCount::get(4, RetTy->isScalableTy()))
+ return false;
return true;
}
@@ -21271,6 +21246,92 @@ static SDValue tryCombineWhileLo(SDNode *N,
return SDValue(N, 0);
}
+SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ // The narrower of the two operands. Used as the accumulator
+ auto NarrowOp = N->getOperand(1);
+ auto MulOp = N->getOperand(2);
+ if (MulOp->getOpcode() != ISD::MUL)
+ return SDValue();
+
+ auto ExtA = MulOp->getOperand(0);
+ auto ExtB = MulOp->getOperand(1);
+ bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
+ bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
+ if (ExtA->getOpcode() != ExtB->getOpcode() || (!IsSExt && !IsZExt))
+ return SDValue();
+
+ auto A = ExtA->getOperand(0);
+ auto B = ExtB->getOperand(0);
+ if (A.getValueType() != B.getValueType())
+ return SDValue();
+
+ // The fully-reduced type. Should be a vector of i32 or i64
+ EVT FullType = N->getValueType(0);
+ // The type that is extended to the wide type. Should be an i8 or i16
+ EVT ExtendedType = A.getValueType();
+ // The wide type with four times as many elements as the reduced type. Should be a vector of i32 or i64, the same as the fully-reduced type
+ EVT WideType = MulOp.getValueType();
+ if (WideType.getScalarSizeInBits() != FullType.getScalarSizeInBits())
+ return SDValue();
+ // Dot products operate on chunks of four elements so there must be four times as many elements in the wide type
+ if (WideType.getVectorMinNumElements() / FullType.getVectorMinNumElements() != 4)
+ return SDValue();
+ switch (FullType.getScalarSizeInBits()) {
+ case 32:
+ if (ExtendedType.getScalarSizeInBits() != 8)
+ return SDValue();
+ break;
+ case 64:
+ // i8 to i64 can be done with an extended i32 dot product
+ if (ExtendedType.getScalarSizeInBits() != 8 && ExtendedType.getScalarSizeInBits() != 16)
+ return SDValue();
+ break;
+ default:
+ return SDValue();
+ }
+
+ unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
+
+ if (IsSExt)
+ DotIntrinsicId = Intrinsic::aarch64_sve_sdot;
+ else if (IsZExt)
+ DotIntrinsicId = Intrinsic::aarch64_sve_udot;
+
+ assert(DotIntrinsicId != Intrinsic::not_intrinsic &&
+ "Unexpected dot product case encountered.");
+
+ EVT Type = NarrowOp.getValueType();
+
+ // 8 bit input to 64 bit output can be done by doing a 32 bit dot product
+ // and extending the output
+ bool Extend = A->getValueType(0).getScalarSizeInBits() == 8 &&
+ Type.getScalarSizeInBits() == 64;
+ SDValue Accumulator = NarrowOp;
+ if (Extend) {
+ Type = Type.changeVectorElementType(
+ EVT::getIntegerVT(*DAG.getContext(), 32));
+ // The accumulator is of the wider type so we insert a 0 accumulator and
+ // add the proper one after extending
+ Accumulator = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i32,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+
+ auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
+ auto DotProduct = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Type,
+ {IntrinsicId, Accumulator, A, B});
+ if (Extend) {
+ auto Extended =
+ DAG.getNode(IsZExt ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL,
+ NarrowOp.getValueType(), {DotProduct});
+ auto AccAdd = DAG.getNode(ISD::ADD, DL, NarrowOp.getValueType(),
+ {NarrowOp, Extended});
+ DotProduct = AccAdd;
+ }
+ return DotProduct;
+}
+
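For reference, a standalone scalar model of what the emitted udot computes per output lane (a sketch of the semantics relied on here, not LLVM code; sdot is the same with sign extension):

  #include <cstddef>
  #include <cstdint>

  // Each 32-bit lane accumulates the dot product of four consecutive 8-bit
  // elements: Acc[L] += sum over K of A[4L+K] * B[4L+K].
  void udotModel(uint32_t *Acc, const uint8_t *A, const uint8_t *B,
                 size_t Lanes) {
    for (size_t L = 0; L < Lanes; ++L)
      for (size_t K = 0; K < 4; ++K)
        Acc[L] += uint32_t(A[4 * L + K]) * uint32_t(B[4 * L + K]);
  }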
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -21280,97 +21341,9 @@ static SDValue performIntrinsicCombine(SDNode *N,
default:
break;
case Intrinsic::experimental_vector_partial_reduce_add: {
- SDLoc DL(N);
-
- bool IsValidDotProduct = true;
-
- auto NarrowOp = N->getOperand(1);
- auto MulOp = N->getOperand(2);
- if (MulOp->getOpcode() != ISD::MUL)
- IsValidDotProduct = false;
-
- auto ExtA = MulOp->getOperand(0);
- auto ExtB = MulOp->getOperand(1);
- bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
- bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
- if (ExtA->getOpcode() != ExtB->getOpcode() || (!IsSExt && !IsZExt))
- IsValidDotProduct = false;
-
- unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
-
- if (IsSExt && IsValidDotProduct)
- DotIntrinsicId = Intrinsic::aarch64_sve_sdot;
- else if (IsZExt && IsValidDotProduct)
- DotIntrinsicId = Intrinsic::aarch64_sve_udot;
-
- assert((!IsValidDotProduct || DotIntrinsicId != Intrinsic::not_intrinsic) &&
- "Unexpected dot product case encountered.");
-
- if (IsValidDotProduct) {
- auto A = ExtA->getOperand(0);
- auto B = ExtB->getOperand(0);
- EVT Type = NarrowOp.getValueType();
-
- // 8 bit input to 64 bit output can be done by doing a 32 bit dot product
- // and extending the output
- bool Extend = A->getValueType(0).getScalarSizeInBits() == 8 &&
- Type.getScalarSizeInBits() == 64;
- SDValue Accumulator = NarrowOp;
- if (Extend) {
- Type = Type.changeVectorElementType(
- EVT::getIntegerVT(*DAG.getContext(), 32));
- // The accumulator is of the wider type so we insert a 0 accumulator and
- // add the proper one after extending
- Accumulator = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i32,
- DAG.getConstant(0, DL, MVT::i32));
- }
-
- auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
- auto DotProduct = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Type,
- {IntrinsicId, Accumulator, A, B});
- if (Extend) {
- auto Extended =
- DAG.getNode(IsZExt ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL,
- NarrowOp.getValueType(), {DotProduct});
- auto AccAdd = DAG.getNode(ISD::ADD, DL, NarrowOp.getValueType(),
- {NarrowOp, Extended});
- DotProduct = AccAdd;
- }
- return DotProduct;
- } else {
- // If the node doesn't match a dot product, lower to a series of ADDs
- // instead.
- SDValue Op0 = N->getOperand(1);
- SDValue Op1 = N->getOperand(2);
- EVT Type0 = Op0->getValueType(0);
- EVT Type1 = Op1->getValueType(0);
-
- // Canonicalise so that Op1 has the larger type
- if (Type1.getVectorNumElements() > Type0.getVectorNumElements()) {
- std::swap(Op0, Op1);
- std::swap(Type0, Type1);
- }
-
- auto Type0Elements = Type0.getVectorNumElements();
- auto Type1Elements = Type1.getVectorNumElements();
-
- // If the types are equal then a single ADD is fine
- if (Type0 == Type1)
- return DAG.getNode(ISD::ADD, DL, Type0, {Op0, Op1});
-
- // Otherwise, we need to add each subvector together so that the output is
- // the intrinsic's return type. For example, <4 x i32>
- // partial.reduction(<4 x i32> a, <16 x i32> b) becomes a + b[0..3] +
- // b[4..7] + b[8..11] + b[12..15]
- SDValue Add = Op0;
- for (unsigned i = 0; i < Type1Elements / Type0Elements; i++) {
- SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Type0, Op1,
- DAG.getConstant(i, DL, MVT::i64));
-
- Add = DAG.getNode(ISD::ADD, DL, Type0, {Add, Subvec});
- }
- return Add;
- }
+ if (auto Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
+ return Dot;
+ return DAG.expandPartialReductionIntrinsic(N->getValueType(0), N->getOperand(1), N->getOperand(2), SDLoc(N));
}
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
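The Extend path above is sound because the intermediate 32-bit dot product cannot overflow. A scalar sketch of the argument for one output lane (illustrative, not LLVM code):

  #include <cstdint>

  // i8 -> i64: dot at 32 bits into a zero accumulator, then widen and add
  // the real accumulator. Each lane sums four products of at most 255 * 255,
  // far below 2^32 (and 4 * 128 * 128 fits int32_t in the signed case), so
  // the narrow accumulation is exact.
  uint64_t laneDot8to64(uint64_t Acc, const uint8_t A[4], const uint8_t B[4]) {
    uint32_t Partial = 0; // the zero splat inserted by the combine
    for (int K = 0; K < 4; ++K)
      Partial += uint32_t(A[K]) * uint32_t(B[K]);
    return Acc + uint64_t(Partial); // zext of the dot result, then ADD
  }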
>From 651c200a854d1232ea9803e2793419496409a4f4 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 21 Aug 2024 16:04:52 +0100
Subject: [PATCH 19/19] Check for NEON or SVE
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 3 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 45 +++++++-------
.../SelectionDAG/SelectionDAGBuilder.cpp | 4 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 61 +++++++++++--------
4 files changed, 65 insertions(+), 48 deletions(-)
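Patch 19 is mostly a clang-format pass plus a subtarget guard, but one property both code paths rely on is worth spelling out: the dot-product lowering and the generic expansion place input elements in different output lanes, which is acceptable because the intrinsic, as specified, only defines the full reduction of its result, not the per-lane distribution. A standalone check with illustrative values:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint8_t A[16], B[16];
    for (int I = 0; I < 16; ++I) {
      A[I] = uint8_t(3 * I + 1);
      B[I] = uint8_t(7 * I + 2);
    }

    // Generic expansion: subvectors at offsets 0, 4, 8 and 12 are added
    // lane-wise, so lane J accumulates elements J, J+4, J+8, J+12.
    uint32_t Split[4] = {};
    for (int I = 0; I < 16; ++I)
      Split[I % 4] += uint32_t(A[I]) * uint32_t(B[I]);

    // Dot-product lowering: lane J accumulates the chunk 4J .. 4J+3.
    uint32_t Dot[4] = {};
    for (int J = 0; J < 4; ++J)
      for (int K = 0; K < 4; ++K)
        Dot[J] += uint32_t(A[4 * J + K]) * uint32_t(B[4 * J + K]);

    // Lane contents differ, but the full reduction agrees.
    uint32_t SumSplit = 0, SumDot = 0;
    for (int J = 0; J < 4; ++J) {
      SumSplit += Split[J];
      SumDot += Dot[J];
    }
    assert(SumSplit == SumDot);
    return 0;
  }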
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 66bf5550a8a8cc..ebfa28ae2dffbf 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1592,7 +1592,8 @@ class SelectionDAG {
/// Expand a partial reduction intrinsic call.
/// Op1 and Op2 are its operands and ReducedTY is the intrinsic's return type.
- SDValue expandPartialReductionIntrinsic(EVT ReducedTy, SDValue Op1, SDValue Op2, SDLoc DL);
+ SDValue expandPartialReductionIntrinsic(EVT ReducedTy, SDValue Op1,
+ SDValue Op2, SDLoc DL);
/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
SDValue expandVAArg(SDNode *Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c947e44e7d1a9a..69f913aedb6bc5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2413,33 +2413,34 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
}
-SDValue SelectionDAG::expandPartialReductionIntrinsic(EVT ReducedTy, SDValue Op1, SDValue Op2, SDLoc DL) {
- EVT FullTy = Op2.getValueType();
+SDValue SelectionDAG::expandPartialReductionIntrinsic(EVT ReducedTy,
+ SDValue Op1, SDValue Op2,
+ SDLoc DL) {
+ EVT FullTy = Op2.getValueType();
- unsigned Stride = ReducedTy.getVectorMinNumElements();
- unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
+ unsigned Stride = ReducedTy.getVectorMinNumElements();
+ unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
- // Collect all of the subvectors
- std::deque<SDValue> Subvectors = {Op1};
- for (unsigned I = 0; I < ScaleFactor; I++) {
- auto SourceIndex = getVectorIdxConstant(I * Stride, DL);
- Subvectors.push_back(getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy,
- {Op2, SourceIndex}));
- }
-
- // Flatten the subvector tree
- while (Subvectors.size() > 1) {
- Subvectors.push_back(getNode(ISD::ADD, DL, ReducedTy,
- {Subvectors[0], Subvectors[1]}));
- Subvectors.pop_front();
- Subvectors.pop_front();
- }
+ // Collect all of the subvectors
+ std::deque<SDValue> Subvectors = {Op1};
+ for (unsigned I = 0; I < ScaleFactor; I++) {
+ auto SourceIndex = getVectorIdxConstant(I * Stride, DL);
+ Subvectors.push_back(
+ getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, SourceIndex}));
+ }
- assert(Subvectors.size() == 1 &&
- "There should only be one subvector after tree flattening");
+ // Flatten the subvector tree
+ while (Subvectors.size() > 1) {
+ Subvectors.push_back(
+ getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]}));
+ Subvectors.pop_front();
+ Subvectors.pop_front();
+ }
- return Subvectors[0];
+ assert(Subvectors.size() == 1 &&
+ "There should only be one subvector after tree flattening");
+ return Subvectors[0];
}
SDValue SelectionDAG::expandVAArg(SDNode *Node) {
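The reformatted expandPartialReductionIntrinsic above flattens its list of subvectors with a work queue. A standalone scalar sketch of that loop (plain ints in place of SDValues):

  #include <deque>

  // Pop two entries, push their sum: the queue is consumed pairwise and
  // partial sums are appended, building a roughly balanced tree of ADDs.
  // Mirrors the loop above; the queue starts non-empty (it holds Op1).
  int flattenAddTree(std::deque<int> Subvectors) {
    while (Subvectors.size() > 1) {
      int Sum = Subvectors[0] + Subvectors[1];
      Subvectors.pop_front();
      Subvectors.pop_front();
      Subvectors.push_back(Sum);
    }
    return Subvectors.front();
  }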
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 209f444ca5e341..b47eaab14448fb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7991,7 +7991,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
- setValue(&I, DAG.expandPartialReductionIntrinsic(EVT::getEVT(I.getType()), getValue(I.getOperand(0)), getValue(I.getOperand(1)), sdl));
+ setValue(&I, DAG.expandPartialReductionIntrinsic(
+ EVT::getEVT(I.getType()), getValue(I.getOperand(0)),
+ getValue(I.getOperand(1)), sdl));
return;
}
case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 13dfa0adf19b53..c33acefcca5eb5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1978,11 +1978,14 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
if (!RetTy || !RetTy->isScalableTy())
return true;
- if (RetTy->getScalarType()->isIntegerTy(32) && RetTy->getElementCount() == ElementCount::get(4, RetTy->isScalableTy()))
+ if (RetTy->getScalarType()->isIntegerTy(32) &&
+ RetTy->getElementCount() == ElementCount::get(4, RetTy->isScalableTy()))
return false;
- if (RetTy->getScalarType()->isIntegerTy(64) && RetTy->getElementCount() == ElementCount::get(2, RetTy->isScalableTy()))
+ if (RetTy->getScalarType()->isIntegerTy(64) &&
+ RetTy->getElementCount() == ElementCount::get(2, RetTy->isScalableTy()))
return false;
- if (RetTy->getScalarType()->isIntegerTy(64) && RetTy->getElementCount() == ElementCount::get(4, RetTy->isScalableTy()))
+ if (RetTy->getScalarType()->isIntegerTy(64) &&
+ RetTy->getElementCount() == ElementCount::get(4, RetTy->isScalableTy()))
return false;
return true;
@@ -21246,7 +21249,13 @@ static SDValue tryCombineWhileLo(SDNode *N,
return SDValue(N, 0);
}
-SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG) {
+SDValue tryLowerPartialReductionToDot(SDNode *N,
+ const AArch64Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+
+ if (!Subtarget->isSVEAvailable() && !Subtarget->isNeonAvailable())
+ return SDValue();
+
SDLoc DL(N);
// The narrower of the two operands. Used as the accumulator
@@ -21271,25 +21280,29 @@ SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarg
EVT FullType = N->getValueType(0);
// The type that is extended to the wide type. Should be an i8 or i16
EVT ExtendedType = A.getValueType();
- // The wide type with four times as many elements as the reduced type. Should be a vector of i32 or i64, the same as the fully-reduced type
+ // The wide type with four times as many elements as the reduced type. Should
+ // be a vector of i32 or i64, the same as the fully-reduced type
EVT WideType = MulOp.getValueType();
if (WideType.getScalarSizeInBits() != FullType.getScalarSizeInBits())
return SDValue();
- // Dot products operate on chunks of four elements so there must be four times as many elements in the wide type
- if (WideType.getVectorMinNumElements() / FullType.getVectorMinNumElements() != 4)
+ // Dot products operate on chunks of four elements so there must be four times
+ // as many elements in the wide type
+ if (WideType.getVectorMinNumElements() / FullType.getVectorMinNumElements() !=
+ 4)
return SDValue();
switch (FullType.getScalarSizeInBits()) {
- case 32:
- if (ExtendedType.getScalarSizeInBits() != 8)
- return SDValue();
- break;
- case 64:
- // i8 to i64 can be done with an extended i32 dot product
- if (ExtendedType.getScalarSizeInBits() != 8 && ExtendedType.getScalarSizeInBits() != 16)
- return SDValue();
- break;
- default:
+ case 32:
+ if (ExtendedType.getScalarSizeInBits() != 8)
+ return SDValue();
+ break;
+ case 64:
+ // i8 to i64 can be done with an extended i32 dot product
+ if (ExtendedType.getScalarSizeInBits() != 8 &&
+ ExtendedType.getScalarSizeInBits() != 16)
return SDValue();
+ break;
+ default:
+ return SDValue();
}
unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
@@ -21310,8 +21323,8 @@ SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarg
Type.getScalarSizeInBits() == 64;
SDValue Accumulator = NarrowOp;
if (Extend) {
- Type = Type.changeVectorElementType(
- EVT::getIntegerVT(*DAG.getContext(), 32));
+ Type =
+ Type.changeVectorElementType(EVT::getIntegerVT(*DAG.getContext(), 32));
// The accumulator is of the wider type so we insert a 0 accumulator and
// add the proper one after extending
Accumulator = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i32,
@@ -21322,9 +21335,8 @@ SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarg
auto DotProduct = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Type,
{IntrinsicId, Accumulator, A, B});
if (Extend) {
- auto Extended =
- DAG.getNode(IsZExt ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL,
- NarrowOp.getValueType(), {DotProduct});
+ auto Extended = DAG.getNode(IsZExt ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, NarrowOp.getValueType(), {DotProduct});
auto AccAdd = DAG.getNode(ISD::ADD, DL, NarrowOp.getValueType(),
{NarrowOp, Extended});
DotProduct = AccAdd;
@@ -21342,8 +21354,9 @@ static SDValue performIntrinsicCombine(SDNode *N,
break;
case Intrinsic::experimental_vector_partial_reduce_add: {
if (auto Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
- return Dot;
- return DAG.expandPartialReductionIntrinsic(N->getValueType(0), N->getOperand(1), N->getOperand(2), SDLoc(N));
+ return Dot;
+ return DAG.expandPartialReductionIntrinsic(
+ N->getValueType(0), N->getOperand(1), N->getOperand(2), SDLoc(N));
}
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp: