[llvm] r339121 - [TargetLowering] Add support for non-uniform vectors to BuildUDIV
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 7 18:19:31 PDT 2018
Hi Simon,
This change is hitting an assertion failure in some of our internal tests. I have put the details in PR38477.
Douglas Yung
> -----Original Message-----
> From: llvm-commits [mailto:llvm-commits-bounces at lists.llvm.org] On Behalf Of Simon Pilgrim via llvm-commits
> Sent: Tuesday, August 07, 2018 2:52
> To: llvm-commits at lists.llvm.org
> Subject: [llvm] r339121 - [TargetLowering] Add support for non-uniform vectors to BuildUDIV
>
> Author: rksimon
> Date: Tue Aug 7 02:51:34 2018
> New Revision: 339121
>
> URL: http://llvm.org/viewvc/llvm-project?rev=339121&view=rev
> Log:
> [TargetLowering] Add support for non-uniform vectors to BuildUDIV
>
> This patch refactors the existing TargetLowering::BuildUDIV base
> implementation to support non-uniform constant vector denominators.
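For readers unfamiliar with the expansion, the per-divisor pattern that BuildUDIV emits follows the usual "Hacker's Delight" scheme: an optional pre-shift for even divisors, a multiply-high by a magic constant, an optional add/"NPQ" fixup when the magic constant does not fit the type, and a post-shift. Below is a minimal scalar sketch in plain C++, not code from the patch; the helper names are made up, and the 32-bit magic constants for 5 and 7 are the well-known textbook values, used purely as an illustration.

    #include <cassert>
    #include <cstdint>

    // Multiply-high of two unsigned 32-bit values (what ISD::MULHU computes).
    static uint32_t mulhu32(uint32_t a, uint32_t b) {
      return static_cast<uint32_t>((uint64_t(a) * uint64_t(b)) >> 32);
    }

    // udiv by 5: magic 0xCCCCCCCD, post-shift 2, no fixup needed.
    static uint32_t udiv5(uint32_t x) { return mulhu32(x, 0xCCCCCCCDu) >> 2; }

    // udiv by 7: the magic overflows 32 bits, so the "NPQ" fixup is needed:
    //   q = mulhu(x, magic); npq = ((x - q) >> 1) + q; result = npq >> (s - 1)
    static uint32_t udiv7(uint32_t x) {
      uint32_t q = mulhu32(x, 0x24924925u);
      uint32_t npq = ((x - q) >> 1) + q;
      return npq >> 2;
    }

    int main() {
      const uint32_t tests[] = {0u, 1u, 4u, 5u, 6u, 7u, 35u, 0xFFFFFFFFu};
      for (uint32_t x : tests) {
        assert(udiv5(x) == x / 5);
        assert(udiv7(x) == x / 7);
      }
    }

The patch's contribution is to compute this pattern per element of a BUILD_VECTOR divisor instead of requiring a single splat constant.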
>
> It also includes a fold for MULHU by pow2 constants to SRL which can
> now more readily occur from BuildUDIV.
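The MULHU fold mentioned above rests on the identity mulhu(x, 2^c) == x >> (w - c) for a w-bit type: multiplying by 2^c and keeping only the high half of the 2w-bit product is the same as shifting x right by w - c. A quick scalar check, illustration only:

    #include <cassert>
    #include <cstdint>

    // Multiply-high of two unsigned 32-bit values (what ISD::MULHU computes).
    static uint32_t mulhu32(uint32_t a, uint32_t b) {
      return static_cast<uint32_t>((uint64_t(a) * uint64_t(b)) >> 32);
    }

    int main() {
      const uint32_t x = 0xDEADBEEFu;
      for (unsigned c = 1; c < 32; ++c)
        assert(mulhu32(x, 1u << c) == x >> (32 - c));
    }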
>
> Differential Revision: https://reviews.llvm.org/D49248
>
> Modified:
> llvm/trunk/include/llvm/CodeGen/TargetLowering.h
> llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
> llvm/trunk/test/CodeGen/X86/combine-udiv.ll
>
> Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=339121&r1=339120&r2=339121&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
> +++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Tue Aug 7 02:51:34 2018
> @@ -3497,8 +3497,7 @@ public:
> //
> SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
> SmallVectorImpl<SDNode *> &Created) const;
> - SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
> - bool IsAfterLegalization,
> + SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
> SmallVectorImpl<SDNode *> &Created) const;
>
> /// Targets may override this function to provide custom SDIV lowering for
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=339121&r1=339120&r2=339121&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Tue Aug 7 02:51:34 2018
> @@ -3278,8 +3278,6 @@ SDValue DAGCombiner::visitUDIVLike(SDVal
> SDLoc DL(N);
> EVT VT = N->getValueType(0);
>
> - ConstantSDNode *N1C = isConstOrConstSplat(N1);
> -
> // fold (udiv x, (1 << c)) -> x >>u c
> if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
> DAG.isKnownToBeAPowerOfTwo(N1)) {
> @@ -3311,7 +3309,8 @@ SDValue DAGCombiner::visitUDIVLike(SDVal
>
> // fold (udiv x, c) -> alternate
> AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
> - if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
> + if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
> + !TLI.isIntDivCheap(N->getValueType(0), Attr))
> if (SDValue Op = BuildUDIV(N))
> return Op;
>
> @@ -3468,6 +3467,19 @@ SDValue DAGCombiner::visitMULHU(SDNode *
> if (N0.isUndef() || N1.isUndef())
> return DAG.getConstant(0, DL, VT);
>
> + // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
> + if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
> + DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
> + SDLoc DL(N);
> + unsigned NumEltBits = VT.getScalarSizeInBits();
> + SDValue LogBase2 = BuildLogBase2(N1, DL);
> + SDValue SRLAmt = DAG.getNode(
> + ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
> + EVT ShiftVT = getShiftAmountTy(N0.getValueType());
> + SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
> + return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
> + }
> +
> - // If the type twice as wide is legal, transform the mulhu to a wider multiply
> // plus a shift.
> if (VT.isSimple() && !VT.isVector()) {
> @@ -18099,21 +18111,14 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N
> if (DAG.getMachineFunction().getFunction().optForMinSize())
> return SDValue();
>
> - ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
> - if (!C)
> - return SDValue();
> -
> - // Avoid division by zero.
> - if (C->isNullValue())
> - return SDValue();
> -
> SmallVector<SDNode *, 8> Built;
> - SDValue S =
> - TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, Built);
> + if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
> + for (SDNode *N : Built)
> + AddToWorklist(N);
> + return S;
> + }
>
> - for (SDNode *N : Built)
> - AddToWorklist(N);
> - return S;
> + return SDValue();
> }
>
> /// Determines the LogBase2 value for a non-null input value using the
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=339121&r1=339120&r2=339121&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Tue Aug 7 02:51:34 2018
> @@ -3547,72 +3547,142 @@ SDValue TargetLowering::BuildSDIV(SDNode
> /// return a DAG expression to select that will generate the same value by
> /// multiplying by a magic number.
> /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
> -SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
> - SelectionDAG &DAG, bool IsAfterLegalization,
> +SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
> + bool IsAfterLegalization,
> SmallVectorImpl<SDNode *> &Created) const {
> - EVT VT = N->getValueType(0);
> SDLoc dl(N);
> auto &DL = DAG.getDataLayout();
>
> + EVT VT = N->getValueType(0);
> + EVT ShVT = getShiftAmountTy(VT, DL);
> +
> // Check to see if we can do this.
> // FIXME: We should be more aggressive here.
> if (!isTypeLegal(VT))
> return SDValue();
>
> - // FIXME: We should use a narrower constant when the upper
> - // bits are known to be zero.
> - APInt::mu magics = Divisor.magicu();
> -
> - SDValue Q = N->getOperand(0);
> -
> - // If the divisor is even, we can avoid using the expensive fixup by shifting
> - // the divided value upfront.
> - if (magics.a != 0 && !Divisor[0]) {
> - unsigned Shift = Divisor.countTrailingZeros();
> - Q = DAG.getNode(
> - ISD::SRL, dl, VT, Q,
> - DAG.getConstant(Shift, dl, getShiftAmountTy(Q.getValueType(), DL)));
> - Created.push_back(Q.getNode());
> -
> - // Get magic number for the shifted divisor.
> - magics = Divisor.lshr(Shift).magicu(Shift);
> - assert(magics.a == 0 && "Should use cheap fixup now");
> + auto BuildUDIVPattern = [](const APInt &Divisor, unsigned &PreShift,
> + APInt &Magic, unsigned &PostShift) {
> + // FIXME: We should use a narrower constant when the upper
> + // bits are known to be zero.
> + APInt::mu magics = Divisor.magicu();
> + PreShift = PostShift = 0;
> +
> + // If the divisor is even, we can avoid using the expensive fixup by
> + // shifting the divided value upfront.
> + if (magics.a != 0 && !Divisor[0]) {
> + PreShift = Divisor.countTrailingZeros();
> + // Get magic number for the shifted divisor.
> + magics = Divisor.lshr(PreShift).magicu(PreShift);
> + assert(magics.a == 0 && "Should use cheap fixup now");
> + }
> +
> + Magic = magics.m;
> +
> + if (magics.a == 0) {
> + assert(magics.s < Divisor.getBitWidth() &&
> + "We shouldn't generate an undefined shift!");
> + PostShift = magics.s;
> + return false;
> + } else {
> + PostShift = magics.s - 1;
> + return true;
> + }
> + };
> +
> + SDValue N0 = N->getOperand(0);
> + SDValue N1 = N->getOperand(1);
> +
> + // Collect the shifts/magic values from each element.
> + bool UseNPQ = false;
> + SDValue PreShift, PostShift, MagicFactor, NPQFactor;
> + if (VT.isVector()) {
> + EVT SVT = VT.getScalarType();
> + EVT ShSVT = ShVT.getScalarType();
> + unsigned EltBits = VT.getScalarSizeInBits();
> + unsigned NumElts = VT.getVectorNumElements();
> + SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
> + if (ISD::BUILD_VECTOR != N1.getOpcode())
> + return SDValue();
> + for (unsigned i = 0; i != NumElts; ++i) {
> + auto *C = dyn_cast<ConstantSDNode>(N1.getOperand(i));
> + if (!C || C->isNullValue() || C->getAPIntValue().getBitWidth() != EltBits)
> + return SDValue();
> + APInt MagicVal;
> + unsigned PreShiftVal, PostShiftVal;
> + bool SelNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
> + PostShiftVal);
> + PreShifts.push_back(DAG.getConstant(PreShiftVal, dl, ShSVT));
> + MagicFactors.push_back(DAG.getConstant(MagicVal, dl, SVT));
> + NPQFactors.push_back(
> + DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
> + : APInt::getNullValue(EltBits),
> + dl, SVT));
> + PostShifts.push_back(DAG.getConstant(PostShiftVal, dl, ShSVT));
> + UseNPQ |= SelNPQ;
> + }
> + PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
> + MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
> + NPQFactor = DAG.getBuildVector(VT, dl, NPQFactors);
> + PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
> + } else {
> + auto *C = dyn_cast<ConstantSDNode>(N1);
> + if (!C || C->isNullValue())
> + return SDValue();
> + APInt MagicVal;
> + unsigned PreShiftVal, PostShiftVal;
> + UseNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
> + PostShiftVal);
> + PreShift = DAG.getConstant(PreShiftVal, dl, ShVT);
> + MagicFactor = DAG.getConstant(MagicVal, dl, VT);
> + PostShift = DAG.getConstant(PostShiftVal, dl, ShVT);
> }
>
> - // Multiply the numerator (operand 0) by the magic value
> - // FIXME: We should support doing a MUL in a wider type
> - if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
> - isOperationLegalOrCustom(ISD::MULHU, VT))
> - Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
> - else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
> - isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
> - Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
> - DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
> - else
> - return SDValue(); // No mulhu or equivalent
> + SDValue Q = N0;
> + Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
> + Created.push_back(Q.getNode());
> +
> + // FIXME: We should support doing a MUL in a wider type.
> + auto GetMULHU = [&](SDValue X, SDValue Y) {
> + if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
> + : isOperationLegalOrCustom(ISD::MULHU, VT))
> + return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
> + if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
> + : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
> + SDValue LoHi =
> + DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
> + return SDValue(LoHi.getNode(), 1);
> + }
> + return SDValue(); // No mulhu or equivalent
> + };
> +
> + // Multiply the numerator (operand 0) by the magic value.
> + Q = GetMULHU(Q, MagicFactor);
> + if (!Q)
> + return SDValue();
>
> Created.push_back(Q.getNode());
>
> - if (magics.a == 0) {
> - assert(magics.s < Divisor.getBitWidth() &&
> - "We shouldn't generate an undefined shift!");
> - return DAG.getNode(
> - ISD::SRL, dl, VT, Q,
> - DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
> - } else {
> - SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
> + if (UseNPQ) {
> + SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
> Created.push_back(NPQ.getNode());
> - NPQ = DAG.getNode(
> - ISD::SRL, dl, VT, NPQ,
> - DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
> +
> + // For vectors we might have a mix of non-NPQ/NPQ paths, so use
> + // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
> + if (VT.isVector()) {
> + NPQ = GetMULHU(NPQ, NPQFactor);
> + } else {
> + NPQ = DAG.getNode(
> + ISD::SRL, dl, VT, NPQ,
> + DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
> + }
> Created.push_back(NPQ.getNode());
> - NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
> +
> + Q = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
> Created.push_back(NPQ.getNode());
> - return DAG.getNode(
> - ISD::SRL, dl, VT, NPQ,
> - DAG.getConstant(magics.s - 1, dl,
> - getShiftAmountTy(NPQ.getValueType(), DL)));
> }
> +
> + return DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
> }
>
> bool TargetLowering::
>
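A note on the NPQ handling in the new vector path above: lanes that need the fixup get an NPQFactor of 2^(w-1), so the MULHU acts as a logical shift right by one on exactly those lanes, while lanes that do not need it get a factor of 0 and contribute nothing to the following ADD, leaving their Q untouched. A one-lane scalar model for 16-bit elements, illustration only (the helper name is made up):

    #include <cassert>
    #include <cstdint>

    // Multiply-high of two unsigned 16-bit values (one vector lane).
    static uint16_t mulhu16(uint16_t a, uint16_t b) {
      return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
    }

    int main() {
      const uint16_t npq = 0xABCD;
      // NPQ lane: factor 0x8000 (= 2^15) behaves like NPQ >> 1.
      assert(mulhu16(npq, 0x8000) == npq >> 1);
      // Non-NPQ lane: factor 0 leaves the later ADD with Q unchanged.
      assert(mulhu16(npq, 0) == 0);
    }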
> Modified: llvm/trunk/test/CodeGen/X86/combine-udiv.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-udiv.ll?rev=339121&r1=339120&r2=339121&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/combine-udiv.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/combine-udiv.ll Tue Aug 7 02:51:34 2018
> @@ -365,87 +365,32 @@ define <8 x i16> @combine_vec_udiv_unifo
> define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
> ; SSE-LABEL: combine_vec_udiv_nonuniform:
> ; SSE: # %bb.0:
> -; SSE-NEXT: movd %xmm0, %eax
> -; SSE-NEXT: movzwl %ax, %ecx
> -; SSE-NEXT: imull $25645, %ecx, %ecx # imm = 0x642D
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $4, %eax
> -; SSE-NEXT: movd %eax, %xmm1
> -; SSE-NEXT: pextrw $1, %xmm0, %eax
> -; SSE-NEXT: imull $61681, %eax, %eax # imm = 0xF0F1
> -; SSE-NEXT: shrl $21, %eax
> -; SSE-NEXT: pinsrw $1, %eax, %xmm1
> -; SSE-NEXT: pextrw $2, %xmm0, %eax
> -; SSE-NEXT: imull $8195, %eax, %eax # imm = 0x2003
> -; SSE-NEXT: shrl $29, %eax
> -; SSE-NEXT: pinsrw $2, %eax, %xmm1
> -; SSE-NEXT: pextrw $3, %xmm0, %eax
> -; SSE-NEXT: shrl $3, %eax
> -; SSE-NEXT: imull $9363, %eax, %eax # imm = 0x2493
> -; SSE-NEXT: shrl $16, %eax
> -; SSE-NEXT: pinsrw $3, %eax, %xmm1
> -; SSE-NEXT: pextrw $4, %xmm0, %eax
> -; SSE-NEXT: shrl $7, %eax
> -; SSE-NEXT: pinsrw $4, %eax, %xmm1
> -; SSE-NEXT: pextrw $5, %xmm0, %eax
> -; SSE-NEXT: xorl %ecx, %ecx
> -; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
> -; SSE-NEXT: sete %cl
> -; SSE-NEXT: pinsrw $5, %ecx, %xmm1
> -; SSE-NEXT: pextrw $6, %xmm0, %eax
> -; SSE-NEXT: imull $32897, %eax, %eax # imm = 0x8081
> -; SSE-NEXT: shrl $31, %eax
> -; SSE-NEXT: pinsrw $6, %eax, %xmm1
> -; SSE-NEXT: pextrw $7, %xmm0, %eax
> -; SSE-NEXT: shrl $15, %eax
> -; SSE-NEXT: pinsrw $7, %eax, %xmm1
> -; SSE-NEXT: movdqa %xmm1, %xmm0
> +; SSE-NEXT: movdqa %xmm0, %xmm1
> +; SSE-NEXT: psrlw $3, %xmm1
> +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
> +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
> +; SSE-NEXT: psubw %xmm1, %xmm0
> +; SSE-NEXT: movl $32768, %eax # imm = 0x8000
> +; SSE-NEXT: movd %eax, %xmm2
> +; SSE-NEXT: pmulhuw %xmm0, %xmm2
> +; SSE-NEXT: paddw %xmm1, %xmm2
> +; SSE-NEXT: movdqa {{.*#+}} xmm0 = <4096,2048,8,u,u,2,2,u>
> +; SSE-NEXT: pmulhuw %xmm2, %xmm0
> +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6],xmm2[7]
> ; SSE-NEXT: retq
> ;
> ; AVX-LABEL: combine_vec_udiv_nonuniform:
> ; AVX: # %bb.0:
> -; AVX-NEXT: vmovd %xmm0, %eax
> -; AVX-NEXT: movzwl %ax, %ecx
> -; AVX-NEXT: imull $25645, %ecx, %ecx # imm = 0x642D
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $4, %eax
> -; AVX-NEXT: vmovd %eax, %xmm1
> -; AVX-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX-NEXT: imull $61681, %eax, %eax # imm = 0xF0F1
> -; AVX-NEXT: shrl $21, %eax
> -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $2, %xmm0, %eax
> -; AVX-NEXT: imull $8195, %eax, %eax # imm = 0x2003
> -; AVX-NEXT: shrl $29, %eax
> -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $3, %xmm0, %eax
> -; AVX-NEXT: shrl $3, %eax
> -; AVX-NEXT: imull $9363, %eax, %eax # imm = 0x2493
> -; AVX-NEXT: shrl $16, %eax
> -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $4, %xmm0, %eax
> -; AVX-NEXT: shrl $7, %eax
> -; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $5, %xmm0, %eax
> -; AVX-NEXT: xorl %ecx, %ecx
> -; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
> -; AVX-NEXT: sete %cl
> -; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $6, %xmm0, %eax
> -; AVX-NEXT: imull $32897, %eax, %eax # imm = 0x8081
> -; AVX-NEXT: shrl $31, %eax
> -; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $7, %xmm0, %eax
> -; AVX-NEXT: shrl $15, %eax
> -; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
> +; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
> +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
> +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
> +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
> +; AVX-NEXT: movl $32768, %eax # imm = 0x8000
> +; AVX-NEXT: vmovd %eax, %xmm2
> +; AVX-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0
> +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
> +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
> +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
> ; AVX-NEXT: retq
> %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
> ret <8 x i16> %1
> @@ -454,77 +399,20 @@ define <8 x i16> @combine_vec_udiv_nonun
> define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
> ; SSE-LABEL: combine_vec_udiv_nonuniform2:
> ; SSE: # %bb.0:
> -; SSE-NEXT: pextrw $1, %xmm0, %eax
> -; SSE-NEXT: imull $59919, %eax, %eax # imm = 0xEA0F
> -; SSE-NEXT: shrl $21, %eax
> -; SSE-NEXT: pextrw $0, %xmm0, %ecx
> -; SSE-NEXT: shrl %ecx
> -; SSE-NEXT: imull $16393, %ecx, %ecx # imm = 0x4009
> -; SSE-NEXT: shrl $29, %ecx
> -; SSE-NEXT: movd %ecx, %xmm1
> -; SSE-NEXT: pinsrw $1, %eax, %xmm1
> -; SSE-NEXT: pextrw $2, %xmm0, %eax
> -; SSE-NEXT: imull $58255, %eax, %eax # imm = 0xE38F
> -; SSE-NEXT: shrl $21, %eax
> -; SSE-NEXT: pinsrw $2, %eax, %xmm1
> -; SSE-NEXT: pextrw $3, %xmm0, %eax
> -; SSE-NEXT: imull $32787, %eax, %eax # imm = 0x8013
> -; SSE-NEXT: shrl $31, %eax
> -; SSE-NEXT: pinsrw $3, %eax, %xmm1
> -; SSE-NEXT: pextrw $4, %xmm0, %eax
> -; SSE-NEXT: imull $55189, %eax, %eax # imm = 0xD795
> -; SSE-NEXT: shrl $21, %eax
> -; SSE-NEXT: pinsrw $4, %eax, %xmm1
> -; SSE-NEXT: pextrw $5, %xmm0, %eax
> -; SSE-NEXT: imull $8197, %eax, %eax # imm = 0x2005
> -; SSE-NEXT: shrl $29, %eax
> -; SSE-NEXT: pinsrw $5, %eax, %xmm1
> -; SSE-NEXT: pextrw $6, %xmm0, %eax
> -; SSE-NEXT: imull $52429, %eax, %eax # imm = 0xCCCD
> -; SSE-NEXT: shrl $21, %eax
> -; SSE-NEXT: pinsrw $6, %eax, %xmm1
> -; SSE-NEXT: pextrw $7, %xmm0, %eax
> -; SSE-NEXT: imull $32789, %eax, %eax # imm = 0x8015
> -; SSE-NEXT: shrl $31, %eax
> -; SSE-NEXT: pinsrw $7, %eax, %xmm1
> +; SSE-NEXT: movdqa %xmm0, %xmm1
> +; SSE-NEXT: psrlw $1, %xmm1
> +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
> +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
> +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
> ; SSE-NEXT: movdqa %xmm1, %xmm0
> ; SSE-NEXT: retq
> ;
> ; AVX-LABEL: combine_vec_udiv_nonuniform2:
> ; AVX: # %bb.0:
> -; AVX-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX-NEXT: imull $59919, %eax, %eax # imm = 0xEA0F
> -; AVX-NEXT: shrl $21, %eax
> -; AVX-NEXT: vpextrw $0, %xmm0, %ecx
> -; AVX-NEXT: shrl %ecx
> -; AVX-NEXT: imull $16393, %ecx, %ecx # imm = 0x4009
> -; AVX-NEXT: shrl $29, %ecx
> -; AVX-NEXT: vmovd %ecx, %xmm1
> -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $2, %xmm0, %eax
> -; AVX-NEXT: imull $58255, %eax, %eax # imm = 0xE38F
> -; AVX-NEXT: shrl $21, %eax
> -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $3, %xmm0, %eax
> -; AVX-NEXT: imull $32787, %eax, %eax # imm = 0x8013
> -; AVX-NEXT: shrl $31, %eax
> -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $4, %xmm0, %eax
> -; AVX-NEXT: imull $55189, %eax, %eax # imm = 0xD795
> -; AVX-NEXT: shrl $21, %eax
> -; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $5, %xmm0, %eax
> -; AVX-NEXT: imull $8197, %eax, %eax # imm = 0x2005
> -; AVX-NEXT: shrl $29, %eax
> -; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $6, %xmm0, %eax
> -; AVX-NEXT: imull $52429, %eax, %eax # imm = 0xCCCD
> -; AVX-NEXT: shrl $21, %eax
> -; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $7, %xmm0, %eax
> -; AVX-NEXT: imull $32789, %eax, %eax # imm = 0x8015
> -; AVX-NEXT: shrl $31, %eax
> -; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
> +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1
> +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
> +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
> +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
> ; AVX-NEXT: retq
> %1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
> ret <8 x i16> %1
> @@ -533,157 +421,21 @@ define <8 x i16> @combine_vec_udiv_nonun
> define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
> ; SSE-LABEL: combine_vec_udiv_nonuniform3:
> ; SSE: # %bb.0:
> -; SSE-NEXT: pextrw $1, %xmm0, %eax
> -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $4, %eax
> -; SSE-NEXT: movd %xmm0, %ecx
> -; SSE-NEXT: movzwl %cx, %edx
> -; SSE-NEXT: imull $9363, %edx, %edx # imm = 0x2493
> -; SSE-NEXT: shrl $16, %edx
> -; SSE-NEXT: subl %edx, %ecx
> -; SSE-NEXT: movzwl %cx, %ecx
> -; SSE-NEXT: shrl %ecx
> -; SSE-NEXT: addl %edx, %ecx
> -; SSE-NEXT: shrl $2, %ecx
> -; SSE-NEXT: movd %ecx, %xmm1
> -; SSE-NEXT: pinsrw $1, %eax, %xmm1
> -; SSE-NEXT: pextrw $2, %xmm0, %eax
> -; SSE-NEXT: imull $18351, %eax, %ecx # imm = 0x47AF
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $4, %eax
> -; SSE-NEXT: pinsrw $2, %eax, %xmm1
> -; SSE-NEXT: pextrw $3, %xmm0, %eax
> -; SSE-NEXT: imull $12137, %eax, %ecx # imm = 0x2F69
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $4, %eax
> -; SSE-NEXT: pinsrw $3, %eax, %xmm1
> -; SSE-NEXT: pextrw $4, %xmm0, %eax
> -; SSE-NEXT: imull $2115, %eax, %ecx # imm = 0x843
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $4, %eax
> -; SSE-NEXT: pinsrw $4, %eax, %xmm1
> -; SSE-NEXT: pextrw $5, %xmm0, %eax
> -; SSE-NEXT: imull $23705, %eax, %ecx # imm = 0x5C99
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $5, %eax
> -; SSE-NEXT: pinsrw $5, %eax, %xmm1
> -; SSE-NEXT: pextrw $6, %xmm0, %eax
> -; SSE-NEXT: imull $1041, %eax, %ecx # imm = 0x411
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $5, %eax
> -; SSE-NEXT: pinsrw $6, %eax, %xmm1
> -; SSE-NEXT: pextrw $7, %xmm0, %eax
> -; SSE-NEXT: imull $517, %eax, %ecx # imm = 0x205
> -; SSE-NEXT: shrl $16, %ecx
> -; SSE-NEXT: subl %ecx, %eax
> -; SSE-NEXT: movzwl %ax, %eax
> -; SSE-NEXT: shrl %eax
> -; SSE-NEXT: addl %ecx, %eax
> -; SSE-NEXT: shrl $6, %eax
> -; SSE-NEXT: pinsrw $7, %eax, %xmm1
> -; SSE-NEXT: movdqa %xmm1, %xmm0
> +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
> +; SSE-NEXT: pmulhuw %xmm0, %xmm1
> +; SSE-NEXT: psubw %xmm1, %xmm0
> +; SSE-NEXT: psrlw $1, %xmm0
> +; SSE-NEXT: paddw %xmm1, %xmm0
> +; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm0
> ; SSE-NEXT: retq
> ;
> ; AVX-LABEL: combine_vec_udiv_nonuniform3:
> ; AVX: # %bb.0:
> -; AVX-NEXT: vpextrw $1, %xmm0, %eax
> -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $4, %eax
> -; AVX-NEXT: vmovd %xmm0, %ecx
> -; AVX-NEXT: movzwl %cx, %edx
> -; AVX-NEXT: imull $9363, %edx, %edx # imm = 0x2493
> -; AVX-NEXT: shrl $16, %edx
> -; AVX-NEXT: subl %edx, %ecx
> -; AVX-NEXT: movzwl %cx, %ecx
> -; AVX-NEXT: shrl %ecx
> -; AVX-NEXT: addl %edx, %ecx
> -; AVX-NEXT: shrl $2, %ecx
> -; AVX-NEXT: vmovd %ecx, %xmm1
> -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $2, %xmm0, %eax
> -; AVX-NEXT: imull $18351, %eax, %ecx # imm = 0x47AF
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $4, %eax
> -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $3, %xmm0, %eax
> -; AVX-NEXT: imull $12137, %eax, %ecx # imm = 0x2F69
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $4, %eax
> -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $4, %xmm0, %eax
> -; AVX-NEXT: imull $2115, %eax, %ecx # imm = 0x843
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $4, %eax
> -; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $5, %xmm0, %eax
> -; AVX-NEXT: imull $23705, %eax, %ecx # imm = 0x5C99
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $5, %eax
> -; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $6, %xmm0, %eax
> -; AVX-NEXT: imull $1041, %eax, %ecx # imm = 0x411
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $5, %eax
> -; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
> -; AVX-NEXT: vpextrw $7, %xmm0, %eax
> -; AVX-NEXT: imull $517, %eax, %ecx # imm = 0x205
> -; AVX-NEXT: shrl $16, %ecx
> -; AVX-NEXT: subl %ecx, %eax
> -; AVX-NEXT: movzwl %ax, %eax
> -; AVX-NEXT: shrl %eax
> -; AVX-NEXT: addl %ecx, %eax
> -; AVX-NEXT: shrl $6, %eax
> -; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
> +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
> +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
> +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
> +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
> +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
> ; AVX-NEXT: retq
> %1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
> ret <8 x i16> %1
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits