[llvm] r345869 - [LegalizeDAG] Add generic vector CTPOP expansion (PR32655)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 1 11:22:12 PDT 2018
Author: rksimon
Date: Thu Nov 1 11:22:11 2018
New Revision: 345869
URL: http://llvm.org/viewvc/llvm-project?rev=345869&view=rev
Log:
[LegalizeDAG] Add generic vector CTPOP expansion (PR32655)
This patch adds support for expanding vector CTPOP instructions and removes the x86 'bitmath' lowering, which replicates the same expansion.
Differential Revision: https://reviews.llvm.org/D53258
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp?rev=345869&r1=345868&r2=345869&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp Thu Nov 1 11:22:11 2018
@@ -129,6 +129,7 @@ class VectorLegalizer {
SDValue ExpandFNEG(SDValue Op);
SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
+ SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
@@ -726,6 +727,8 @@ SDValue VectorLegalizer::Expand(SDValue
return UnrollVSETCC(Op);
case ISD::BITREVERSE:
return ExpandBITREVERSE(Op);
+ case ISD::CTPOP:
+ return ExpandCTPOP(Op);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return ExpandCTLZ(Op);
@@ -1104,6 +1107,16 @@ SDValue VectorLegalizer::ExpandFSUB(SDVa
return DAG.UnrollVectorOp(Op.getNode());
}
+SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
+ // Attempt to expand using TargetLowering.
+ SDValue Result;
+ if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
+ return Result;
+
+ // Otherwise go ahead and unroll.
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
// Attempt to expand using TargetLowering.
SDValue Result;
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=345869&r1=345868&r2=345869&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Thu Nov 1 11:22:11 2018
@@ -4295,8 +4295,19 @@ bool TargetLowering::expandCTPOP(SDNode
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Op = Node->getOperand(0);
unsigned Len = VT.getScalarSizeInBits();
- assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
- "CTPOP not implemented for this type.");
+ assert(VT.isInteger() && "CTPOP not implemented for this type.");
+
+ // TODO: Add support for irregular type lengths.
+ if (!(Len <= 128 && Len % 8 == 0))
+ return false;
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
+ !isOperationLegalOrCustom(ISD::SUB, VT) ||
+ !isOperationLegalOrCustom(ISD::SRL, VT) ||
+ (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
+ !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+ return false;
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=345869&r1=345868&r2=345869&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Nov 1 11:22:11 2018
@@ -25103,57 +25103,6 @@ static SDValue LowerVectorCTPOPInRegLUT(
return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
}
-static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported.");
-
- // This is the vectorized version of the "best" algorithm from
- // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
- // with a minor tweak to use a series of adds + shifts instead of vector
- // multiplications. Implemented for all integer vector types. We only use
- // this when we don't have SSSE3 which allows a LUT-based lowering that is
- // much faster, even faster than using native popcnt instructions.
-
- auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
- MVT VT = V.getSimpleValueType();
- SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
- return DAG.getNode(OpCode, DL, VT, V, ShifterV);
- };
- auto GetMask = [&](SDValue V, APInt Mask) {
- MVT VT = V.getSimpleValueType();
- SDValue MaskV = DAG.getConstant(Mask, DL, VT);
- return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
- };
-
- // We don't want to incur the implicit masks required to SRL vNi8 vectors on
- // x86, so set the SRL type to have elements at least i16 wide. This is
- // correct because all of our SRLs are followed immediately by a mask anyways
- // that handles any bits that sneak into the high bits of the byte elements.
- MVT SrlVT = MVT::v8i16;
- SDValue V = Op;
-
- // v = v - ((v >> 1) & 0x55555555...)
- SDValue Srl =
- DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
- SDValue And = GetMask(Srl, APInt(8, 0x55));
- V = DAG.getNode(ISD::SUB, DL, VT, V, And);
-
- // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
- SDValue AndLHS = GetMask(V, APInt(8, 0x33));
- Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
- SDValue AndRHS = GetMask(Srl, APInt(8, 0x33));
- V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
-
- // v = (v + (v >> 4)) & 0x0F0F0F0F...
- Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
- V = GetMask(Add, APInt(8, 0x0F));
-
- return V;
-}
-
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
@@ -25193,9 +25142,9 @@ static SDValue LowerVectorCTPOP(SDValue
return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
}
- // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+ // We can't use the fast LUT approach, so fall back on LegalizeDAG.
if (!Subtarget.hasSSSE3())
- return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+ return SDValue();
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
More information about the llvm-commits
mailing list