[llvm] f318d1e - [VE] v256i32|64 reduction isel and tests
Simon Moll via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 14 03:12:20 PDT 2022
Author: Simon Moll
Date: 2022-03-14T11:10:38+01:00
New Revision: f318d1e26b7c2c27d8b22b3402208898675ea42e
URL: https://github.com/llvm/llvm-project/commit/f318d1e26b7c2c27d8b22b3402208898675ea42e
DIFF: https://github.com/llvm/llvm-project/commit/f318d1e26b7c2c27d8b22b3402208898675ea42e.diff
LOG: [VE] v256i32|64 reduction isel and tests
Isel and tests for and|add|or|xor|smax reductions over v256i32|64, covering
both the vp.reduce and vector.reduce intrinsics. The IR sketch below shows the
two intrinsic shapes involved.
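As a rough illustration (a sketch distilled from the tests added below, not
part of the patch itself): the unordered vector.reduce form takes only the
vector operand, while the vp form additionally carries a start value, a mask,
and an explicit vector length.

  declare i64 @llvm.vector.reduce.add.v256i64(<256 x i64>)
  declare i64 @llvm.vp.reduce.add.v256i64(i64, <256 x i64>, <256 x i1>, i32)

  %r0 = call i64 @llvm.vector.reduce.add.v256i64(<256 x i64> %v)
  %r1 = call i64 @llvm.vp.reduce.add.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)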
Reviewed By: kaz7
Differential Revision: https://reviews.llvm.org/D121469
Added:
llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll
llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll
llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll
llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll
llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll
llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll
llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll
llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll
llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll
llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll
Modified:
llvm/lib/Target/VE/VECustomDAG.cpp
llvm/lib/Target/VE/VECustomDAG.h
llvm/lib/Target/VE/VEISelLowering.cpp
llvm/lib/Target/VE/VETargetTransformInfo.h
llvm/lib/Target/VE/VVPISelLowering.cpp
llvm/lib/Target/VE/VVPInstrInfo.td
llvm/lib/Target/VE/VVPInstrPatternsVec.td
llvm/lib/Target/VE/VVPNodes.def
Removed:
################################################################################
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
index 41ffb772c8459..d1a2c06584203 100644
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -138,6 +138,15 @@ bool isVVPBinaryOp(unsigned VVPOpcode) {
return false;
}
+bool isVVPReductionOp(unsigned Opcode) {
+ switch (Opcode) {
+#define ADD_REDUCE_VVP_OP(VVP_NAME, SDNAME) case VEISD::VVP_NAME:
+#include "VVPNodes.def"
+ return true;
+ }
+ return false;
+}
+
// Return the AVL operand position for this VVP or VEC Op.
Optional<int> getAVLPos(unsigned Opc) {
// This is only available for VP SDNodes
@@ -235,9 +244,14 @@ Optional<EVT> getIdiomaticVectorType(SDNode *Op) {
}
// Translate to VVP where possible.
+ unsigned OriginalOC = OC;
if (auto VVPOpc = getVVPOpcode(OC))
OC = *VVPOpc;
+ if (isVVPReductionOp(OC))
+ return Op->getOperand(hasReductionStartParam(OriginalOC) ? 1 : 0)
+ .getValueType();
+
switch (OC) {
default:
case VEISD::VVP_SETCC:
@@ -320,6 +334,27 @@ SDValue getNodePassthru(SDValue Op) {
return SDValue();
}
+bool hasReductionStartParam(unsigned OPC) {
+ // TODO: Ordered reduction opcodes.
+ if (ISD::isVPReduction(OPC))
+ return true;
+ return false;
+}
+
+unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask) {
+ assert(!IsMask && "Mask reduction isel");
+
+ switch (VVPOC) {
+#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD) \
+ case VEISD::VVP_RED_ISD: \
+ return ISD::REDUCE_ISD;
+#include "VVPNodes.def"
+ default:
+ break;
+ }
+ llvm_unreachable("Cannot not scalarize this reduction Opcode!");
+}
+
/// } Node Properties
SDValue getNodeAVL(SDValue Op) {
@@ -499,4 +534,31 @@ SDValue VECustomDAG::getGatherScatterAddress(SDValue BasePtr, SDValue Scale,
return ResPtr;
}
+SDValue VECustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
+ SDValue StartV, SDValue VectorV,
+ SDValue Mask, SDValue AVL,
+ SDNodeFlags Flags) const {
+
+ // Optionally attach the start param with a scalar op (where it is
+ // unsupported).
+ bool scalarizeStartParam = StartV && !hasReductionStartParam(VVPOpcode);
+ bool IsMaskReduction = isMaskType(VectorV.getValueType());
+ assert(!IsMaskReduction && "TODO Implement");
+ auto AttachStartValue = [&](SDValue ReductionResV) {
+ if (!scalarizeStartParam)
+ return ReductionResV;
+ auto ScalarOC = getScalarReductionOpcode(VVPOpcode, IsMaskReduction);
+ return getNode(ScalarOC, ResVT, {StartV, ReductionResV});
+ };
+
+ // If the VVP opcode supports a start parameter, pass the start value through directly.
+ if (!scalarizeStartParam && StartV) {
+ assert(hasReductionStartParam(VVPOpcode));
+ return AttachStartValue(
+ getNode(VVPOpcode, ResVT, {StartV, VectorV, Mask, AVL}, Flags));
+ } else
+ return AttachStartValue(
+ getNode(VVPOpcode, ResVT, {VectorV, Mask, AVL}, Flags));
+}
+
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
index 84586afc9adc1..8cf14587f9d5f 100644
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -24,6 +24,7 @@ namespace llvm {
Optional<unsigned> getVVPOpcode(unsigned Opcode);
bool isVVPBinaryOp(unsigned Opcode);
+bool isVVPReductionOp(unsigned Opcode);
MVT splitVectorType(MVT VT);
@@ -106,6 +107,12 @@ SDValue getGatherScatterIndex(SDValue Op);
SDValue getGatherScatterScale(SDValue Op);
+unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask);
+
+// Whether this VP_REDUCE_*/ VECREDUCE_*/VVP_REDUCE_* SDNode has a start
+// parameter.
+bool hasReductionStartParam(unsigned VVPOC);
+
/// } Node Properties
enum class Packing {
@@ -172,6 +179,12 @@ class VECustomDAG {
SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); }
/// } getNode
+ /// Legalizing getNode {
+ SDValue getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT, SDValue StartV,
+ SDValue VectorV, SDValue Mask, SDValue AVL,
+ SDNodeFlags Flags) const;
+ /// } Legalizing getNode
+
/// Packing {
SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL) const;
SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const;
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index f1247598ddea6..dd945bf47c3f1 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -332,6 +332,14 @@ void VETargetLowering::initVPUActions() {
for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
setOperationAction(MemOpc, VT, Custom);
+
+ const ISD::NodeType IntReductionOCs[] = {
+ ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND,
+ ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMIN,
+ ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX};
+
+ for (unsigned IntRedOpc : IntReductionOCs)
+ setOperationAction(IntRedOpc, VT, Custom);
}
}
diff --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h
index 7cca3d496f6e1..c688447088782 100644
--- a/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -61,6 +61,25 @@ class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
bool enableVPU() const { return getST()->enableVPU(); }
+ static bool isSupportedReduction(Intrinsic::ID ReductionID) {
+#define VEC_VP_CASE(SUFFIX) \
+ case Intrinsic::vp_reduce_##SUFFIX: \
+ case Intrinsic::vector_reduce_##SUFFIX:
+
+ switch (ReductionID) {
+ VEC_VP_CASE(add)
+ VEC_VP_CASE(and)
+ VEC_VP_CASE(or)
+ VEC_VP_CASE(xor)
+ VEC_VP_CASE(smax)
+ return true;
+
+ default:
+ return false;
+ }
+#undef VEC_VP_CASE
+ }
+
public:
explicit VETTIImpl(const VETargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -127,6 +146,12 @@ class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
return isVectorLaneType(*getLaneType(DataType));
}
// } Load & Store
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ if (!enableVPU())
+ return true;
+ return !isSupportedReduction(II->getIntrinsicID());
+ }
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index 162d6521e4027..ab6d2aef6f0a1 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -56,7 +56,7 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
return lowerVVP_GATHER_SCATTER(Op, CDAG);
}
- EVT OpVecVT = Op.getValueType();
+ EVT OpVecVT = *getIdiomaticVectorType(Op.getNode());
EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
@@ -84,6 +84,14 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
return CDAG.getNode(VVPOpcode, LegalVecVT,
{Op->getOperand(0), Op->getOperand(1), Mask, AVL});
}
+ if (isVVPReductionOp(VVPOpcode)) {
+ auto SrcHasStart = hasReductionStartParam(Op->getOpcode());
+ SDValue StartV = SrcHasStart ? Op->getOperand(0) : SDValue();
+ SDValue VectorV = Op->getOperand(SrcHasStart ? 1 : 0);
+ return CDAG.getLegalReductionOpVVP(VVPOpcode, Op.getValueType(), StartV,
+ VectorV, Mask, AVL, Op->getFlags());
+ }
+
if (VVPOpcode == VEISD::VVP_SELECT) {
auto Mask = Op->getOperand(0);
auto OnTrue = Op->getOperand(1);
@@ -91,10 +99,11 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
}
if (VVPOpcode == VEISD::VVP_SETCC) {
+ EVT LegalResVT = getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
auto LHS = Op->getOperand(0);
auto RHS = Op->getOperand(1);
auto Pred = Op->getOperand(2);
- return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
+ return CDAG.getNode(VVPOpcode, LegalResVT, {LHS, RHS, Pred, Mask, AVL});
}
llvm_unreachable("lowerToVVP called for unexpected SDNode.");
}
diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
index 98e28e813bcce..8257033e42d16 100644
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -53,8 +53,6 @@ def SDTGatherVVP: SDTypeProfile<1, 3, [
IsVLVT<3>
]>;
-// Binary Operators {
-
// BinaryOp(x,y,mask,vl)
def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc.
SDTCisSameAs<0, 1>,
@@ -95,6 +93,15 @@ def SDTSetCCVVP : SDTypeProfile<1, 5, [ // vp_setcc
IsVLVT<5>
]>;
+// vvp_reduce(vector, mask, vl)
+def SDTReduceVVP : SDTypeProfile<1, 3, [
+ SDTCisVec<1>,
+ SDTCisInt<2>,
+ SDTCisVec<2>,
+ SDTCisSameNumEltsAs<1,2>,
+ IsVLVT<3>
+]>;
+
// Binary operator commutative pattern.
class vvp_commutative<SDNode RootOp> :
@@ -135,8 +142,6 @@ def vvp_fmul : SDNode<"VEISD::VVP_FMUL", SDTFPBinOpVVP>;
def c_vvp_fmul : vvp_commutative<vvp_fmul>;
def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;
-// } Binary Operators
-
def vvp_scatter : SDNode<"VEISD::VVP_SCATTER", SDTScatterVVP,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def vvp_gather : SDNode<"VEISD::VVP_GATHER", SDTGatherVVP,
@@ -147,6 +152,15 @@ def vvp_load : SDNode<"VEISD::VVP_LOAD", SDTLoadVVP,
def vvp_store : SDNode<"VEISD::VVP_STORE", SDTStoreVVP,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+// Reductions
+
+// int reductions
+def vvp_reduce_add : SDNode<"VEISD::VVP_REDUCE_ADD", SDTReduceVVP>;
+def vvp_reduce_and : SDNode<"VEISD::VVP_REDUCE_AND", SDTReduceVVP>;
+def vvp_reduce_or : SDNode<"VEISD::VVP_REDUCE_OR", SDTReduceVVP>;
+def vvp_reduce_xor : SDNode<"VEISD::VVP_REDUCE_XOR", SDTReduceVVP>;
+def vvp_reduce_smax : SDNode<"VEISD::VVP_REDUCE_SMAX", SDTReduceVVP>;
+
def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;
diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
index acd933402580e..f25fe6561d5cd 100644
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -434,3 +434,36 @@ defm : Set_CC<v256f64,"VFMKL","VFCMPD",cond,fcond2cc>;
defm : Set_CC<v256i32,"VFMKW","VCMPUW",CCUIOp,icond2cc>;
defm : Set_CC<v256i32,"VFMKW","VCMPSWZX",CCSIOp,icond2cc>;
defm : Set_CC<v256f32,"VFMKS","VFCMPS",cond,fcond2cc>;
+
+multiclass Reduce_GenericInt<ValueType VectorVT,
+ RegisterClass ResRC, ValueType ResVT,
+ string VVPRedOp, string RedInstName> {
+ // Unmasked.
+ def : Pat <(ResVT (!cast<SDPatternOperator>("vvp_reduce_"#VVPRedOp)
+ VectorVT:$vx, (v256i1 true_mask), i32:$vl)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>("LVSvi")
+ (!cast<Instruction>(RedInstName#"vl") $vx, $vl), 0),
+ ResRC)>;
+
+ // Masked.
+ def : Pat <(ResVT (!cast<SDPatternOperator>("vvp_reduce_"#VVPRedOp)
+ VectorVT:$vx, v256i1:$vm, i32:$vl)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>("LVSvi")
+ (!cast<Instruction>(RedInstName#"vml") $vx, $vm, $vl), 0),
+ ResRC)>;
+}
+
+multiclass IntReduce_ShortLong<ValueType VectorVT,
+ RegisterClass ResRC, ValueType ResVT,
+ string SumSuffix, string MinMaxSuffix> {
+ defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "or", "VROR">;
+ defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "and", "VRAND">;
+ defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "xor", "VRXOR">;
+ defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "add", "VSUM"#SumSuffix>;
+ defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "smax", "VRMAX"#MinMaxSuffix>;
+}
+
+defm: IntReduce_ShortLong<v256i64, I64, i64, "L","SLFST">;
+defm: IntReduce_ShortLong<v256i32, I32, i32, "WSX","SWFSTSX">;
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index 8b6e917ab3755..e042410a0e52e 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -44,12 +44,38 @@
#define REGISTER_PACKED(OPC)
#endif
-ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER)
-ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER)
+/// ADD_REDUCE_VVP_OP(OPC, SDNAME)
+/// \p OPC The VVP opcode of the operation.
+/// \p SDNAME The standard opcode of the operation.
+#ifndef ADD_REDUCE_VVP_OP
+#define ADD_REDUCE_VVP_OP(OPC, SDNAME) ADD_VVP_OP(OPC, SDNAME)
+#endif
+
+// Scalar standard ISD to perform this reduction.
+#ifndef HANDLE_VVP_REDUCE_TO_SCALAR
+#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD)
+#endif
+
+/// Reductions.
+#define HELPER_REDUCTION(OPC, SCALAR_OPC) \
+ ADD_REDUCE_VVP_OP(VVP_REDUCE_##OPC,VECREDUCE_##OPC) \
+ HANDLE_VP_TO_VVP(VP_REDUCE_##OPC, VVP_REDUCE_##OPC) \
+ HANDLE_VVP_REDUCE_TO_SCALAR(VVP_REDUCE_##OPC, SCALAR_OPC)
+
+HELPER_REDUCTION(ADD, ADD)
+HELPER_REDUCTION(AND, AND)
+HELPER_REDUCTION(OR, OR)
+HELPER_REDUCTION(XOR, XOR)
+HELPER_REDUCTION(SMAX, SMAX)
+
+#undef HELPER_REDUCTION
ADD_VVP_OP(VVP_LOAD,LOAD) HANDLE_VP_TO_VVP(VP_LOAD, VVP_LOAD) REGISTER_PACKED(VVP_LOAD)
ADD_VVP_OP(VVP_STORE,STORE) HANDLE_VP_TO_VVP(VP_STORE, VVP_STORE) REGISTER_PACKED(VVP_STORE)
+ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER)
+ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER)
+
// Integer arithmetic.
ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD)
ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB)
@@ -78,8 +104,11 @@ ADD_VVP_OP(VVP_SELECT,VSELECT) REGISTER_PACKED(VVP_SELECT)
HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
+
#undef ADD_BINARY_VVP_OP
#undef ADD_BINARY_VVP_OP_COMPACT
+#undef ADD_REDUCE_VVP_OP
#undef ADD_VVP_OP
#undef HANDLE_VP_TO_VVP
+#undef HANDLE_VVP_REDUCE_TO_SCALAR
#undef REGISTER_PACKED
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll
new file mode 100644
index 0000000000000..a0ad0cfbbb498
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.add.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_add_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_add_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vsum.l %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.add.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.add.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_add_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_add_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vsum.w.sx %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.add.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll
new file mode 100644
index 0000000000000..7186ce42c04cb
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.and.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_and_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_and_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrand %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.and.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.and.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_and_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_and_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrand %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.and.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll
new file mode 100644
index 0000000000000..8f8308d047f9f
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.or.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_or_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_or_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vror %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.or.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.or.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_or_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_or_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vror %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.or.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll
new file mode 100644
index 0000000000000..abb86645b74aa
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.smax.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_smax_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_smax_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrmaxs.l.fst %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.smax.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.smax.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_smax_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_smax_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrmaxs.w.fst.sx %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.smax.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll
new file mode 100644
index 0000000000000..ad8c7ad735020
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.xor.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_xor_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_xor_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrxor %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vector.reduce.xor.v256i64( <256 x i64> %v)
+ ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.xor.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_xor_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_xor_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vrxor %v0, %v0
+; CHECK-NEXT: lvs %s0, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s0
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vector.reduce.xor.v256i32( <256 x i32> %v)
+ ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll
new file mode 100644
index 0000000000000..97ea68df3e09d
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.add.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_add_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_add_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vsum.l %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: adds.l %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.add.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.add.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_add_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_add_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vsum.w.sx %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s1
+; CHECK-NEXT: adds.w.sx %s1, %s0, %s1
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.add.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll
new file mode 100644
index 0000000000000..c1dc3841e7f2f
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.and.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_and_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_and_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrand %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: and %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.and.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.and.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_and_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_and_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s2, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrand %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s2, 0, %s1
+; CHECK-NEXT: # implicit-def: $sx1
+; CHECK-NEXT: or %s1, 0, %s2
+; CHECK-NEXT: and %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.and.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll
new file mode 100644
index 0000000000000..41677f4c682a6
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.or.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_or_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_or_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vror %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.or.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.or.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_or_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_or_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s2, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vror %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s2, 0, %s1
+; CHECK-NEXT: # implicit-def: $sx1
+; CHECK-NEXT: or %s1, 0, %s2
+; CHECK-NEXT: or %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.or.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll
new file mode 100644
index 0000000000000..9791a9aa27b0e
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.smax.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_smax_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_smax_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrmaxs.l.fst %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: maxs.l %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.smax.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_smax_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_smax_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrmaxs.w.fst.sx %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s1, 0, %s1
+; CHECK-NEXT: maxs.w.sx %s1, %s0, %s1
+; CHECK-NEXT: # implicit-def: $sx0
+; CHECK-NEXT: or %s0, 0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.smax.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll
new file mode 100644
index 0000000000000..233abb92c7f53
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.xor.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_xor_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_xor_v256i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrxor %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: xor %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i64 @llvm.vp.reduce.xor.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+ ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_xor_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_xor_v256i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: and %s2, %s0, (32)0
+; CHECK-NEXT: # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vrxor %v0, %v0, %vm1
+; CHECK-NEXT: lvs %s1, %v0(0)
+; CHECK-NEXT: or %s2, 0, %s1
+; CHECK-NEXT: # implicit-def: $sx1
+; CHECK-NEXT: or %s1, 0, %s2
+; CHECK-NEXT: xor %s0, %s0, %s1
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call i32 @llvm.vp.reduce.xor.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+ ret i32 %r
+}
+
+