[llvm] r339121 - [TargetLowering] Add support for non-uniform vectors to BuildUDIV
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 7 02:51:34 PDT 2018
Author: rksimon
Date: Tue Aug 7 02:51:34 2018
New Revision: 339121
URL: http://llvm.org/viewvc/llvm-project?rev=339121&view=rev
Log:
[TargetLowering] Add support for non-uniform vectors to BuildUDIV
This patch refactors the existing TargetLowering::BuildUDIV base implementation to support non-uniform constant vector denominators.
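As a rough illustration of the per-element expansion involved, here is a standalone C++ sketch (not LLVM code; the magicU16/expandUDiv helpers are hypothetical, use a simplified "round-up multiplier" search rather than APInt::magicu, and omit the even-divisor pre-shift that BuildUDIV applies to avoid the fixup):

// Standalone sketch (not LLVM code): expand a 16-bit unsigned divide by a
// constant into multiply-high plus shifts, mirroring the per-lane pattern
// BuildUDIV emits for a non-uniform vector divisor.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct MagicU16 {
  uint32_t Multiplier; // may need 17 bits
  unsigned PostShift;
  bool UseNPQ;         // multiplier did not fit in 16 bits -> fixup path
};

// Simplified "round-up multiplier" search; LLVM uses APInt::magicu instead,
// and BuildUDIV additionally pre-shifts even divisors to avoid the fixup.
static MagicU16 magicU16(uint16_t D) {
  assert(D > 1 && "divide by 0/1 handled elsewhere");
  for (unsigned S = 0;; ++S) {
    uint64_t TwoPow = (uint64_t)1 << (16 + S);
    uint64_t M = TwoPow / D + 1; // round up
    // M is precise enough when its rounding error cannot push any 16-bit
    // numerator into the next quotient: M*D - 2^(16+S) <= 2^S.
    if (M * D - TwoPow <= ((uint64_t)1 << S))
      return {(uint32_t)M, S, M > 0xFFFF};
  }
}

static uint16_t expandUDiv(uint16_t N, uint16_t D) {
  MagicU16 MG = magicU16(D);
  uint16_t MagLo = (uint16_t)MG.Multiplier;             // low 16 bits
  uint16_t Q = (uint16_t)(((uint32_t)N * MagLo) >> 16); // MULHU
  if (!MG.UseNPQ)
    return Q >> MG.PostShift;
  // 17-bit multiplier: NPQ fixup, q = ((N - Q)/2 + Q) >> (PostShift - 1).
  uint16_t NPQ = (uint16_t)((N - Q) >> 1);
  return (uint16_t)((NPQ + Q) >> (MG.PostShift - 1));
}

int main() {
  // Non-uniform divisors, like the <8 x i16> vectors in combine-udiv.ll.
  const uint16_t Divisors[] = {7, 23, 25, 27, 31, 34, 47, 63, 127};
  for (uint16_t D : Divisors)
    for (uint32_t N = 0; N <= 0xFFFF; ++N)
      assert(expandUDiv((uint16_t)N, D) == N / D);
  puts("all quotients match");
}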
It also includes a fold of MULHU by pow2 constants to SRL, which can now occur more readily from BuildUDIV.
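That fold rests on the identity mulhu(x, 1 << c) == x >> (bitwidth - c). A standalone exhaustive check of the identity for 16-bit elements (illustrative only, not LLVM code):

// Standalone check (not LLVM code): for W = 16,
//   mulhu(x, 1 << c) == x >> (W - c)   for 0 <= c < W,
// because mulhu(x, 2^c) = floor(x * 2^c / 2^W) = floor(x / 2^(W-c)).
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    for (unsigned C = 0; C < 16; ++C) {
      uint16_t MulHU = (uint16_t)((X * (uint32_t)(1u << C)) >> 16);
      uint16_t Shift = (uint16_t)(X >> (16 - C));
      assert(MulHU == Shift);
    }
  return 0;
}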
Differential Revision: https://reviews.llvm.org/D49248
Modified:
llvm/trunk/include/llvm/CodeGen/TargetLowering.h
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/trunk/test/CodeGen/X86/combine-udiv.ll
Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=339121&r1=339120&r2=339121&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Tue Aug 7 02:51:34 2018
@@ -3497,8 +3497,7 @@ public:
//
SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const;
- SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
- bool IsAfterLegalization,
+ SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const;
/// Targets may override this function to provide custom SDIV lowering for
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=339121&r1=339120&r2=339121&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Tue Aug 7 02:51:34 2018
@@ -3278,8 +3278,6 @@ SDValue DAGCombiner::visitUDIVLike(SDVal
SDLoc DL(N);
EVT VT = N->getValueType(0);
- ConstantSDNode *N1C = isConstOrConstSplat(N1);
-
// fold (udiv x, (1 << c)) -> x >>u c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -3311,7 +3309,8 @@ SDValue DAGCombiner::visitUDIVLike(SDVal
// fold (udiv x, c) -> alternate
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
- if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildUDIV(N))
return Op;
@@ -3468,6 +3467,19 @@ SDValue DAGCombiner::visitMULHU(SDNode *
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
+ // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
+ SDLoc DL(N);
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ SDValue LogBase2 = BuildLogBase2(N1, DL);
+ SDValue SRLAmt = DAG.getNode(
+ ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+ SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
+ return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
+ }
+
// If the type twice as wide is legal, transform the mulhu to a wider multiply
// plus a shift.
if (VT.isSimple() && !VT.isVector()) {
@@ -18099,21 +18111,14 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N
if (DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();
- ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
- if (!C)
- return SDValue();
-
- // Avoid division by zero.
- if (C->isNullValue())
- return SDValue();
-
SmallVector<SDNode *, 8> Built;
- SDValue S =
- TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, Built);
+ if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
- for (SDNode *N : Built)
- AddToWorklist(N);
- return S;
+ return SDValue();
}
/// Determines the LogBase2 value for a non-null input value using the
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=339121&r1=339120&r2=339121&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Tue Aug 7 02:51:34 2018
@@ -3547,72 +3547,142 @@ SDValue TargetLowering::BuildSDIV(SDNode
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
- SelectionDAG &DAG, bool IsAfterLegalization,
+SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
+ bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const {
- EVT VT = N->getValueType(0);
SDLoc dl(N);
auto &DL = DAG.getDataLayout();
+ EVT VT = N->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DL);
+
// Check to see if we can do this.
// FIXME: We should be more aggressive here.
if (!isTypeLegal(VT))
return SDValue();
- // FIXME: We should use a narrower constant when the upper
- // bits are known to be zero.
- APInt::mu magics = Divisor.magicu();
-
- SDValue Q = N->getOperand(0);
-
- // If the divisor is even, we can avoid using the expensive fixup by shifting
- // the divided value upfront.
- if (magics.a != 0 && !Divisor[0]) {
- unsigned Shift = Divisor.countTrailingZeros();
- Q = DAG.getNode(
- ISD::SRL, dl, VT, Q,
- DAG.getConstant(Shift, dl, getShiftAmountTy(Q.getValueType(), DL)));
- Created.push_back(Q.getNode());
-
- // Get magic number for the shifted divisor.
- magics = Divisor.lshr(Shift).magicu(Shift);
- assert(magics.a == 0 && "Should use cheap fixup now");
+ auto BuildUDIVPattern = [](const APInt &Divisor, unsigned &PreShift,
+ APInt &Magic, unsigned &PostShift) {
+ // FIXME: We should use a narrower constant when the upper
+ // bits are known to be zero.
+ APInt::mu magics = Divisor.magicu();
+ PreShift = PostShift = 0;
+
+ // If the divisor is even, we can avoid using the expensive fixup by
+ // shifting the divided value upfront.
+ if (magics.a != 0 && !Divisor[0]) {
+ PreShift = Divisor.countTrailingZeros();
+ // Get magic number for the shifted divisor.
+ magics = Divisor.lshr(PreShift).magicu(PreShift);
+ assert(magics.a == 0 && "Should use cheap fixup now");
+ }
+
+ Magic = magics.m;
+
+ if (magics.a == 0) {
+ assert(magics.s < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ PostShift = magics.s;
+ return false;
+ } else {
+ PostShift = magics.s - 1;
+ return true;
+ }
+ };
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Collect the shifts/magic values from each element.
+ bool UseNPQ = false;
+ SDValue PreShift, PostShift, MagicFactor, NPQFactor;
+ if (VT.isVector()) {
+ EVT SVT = VT.getScalarType();
+ EVT ShSVT = ShVT.getScalarType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+ if (ISD::BUILD_VECTOR != N1.getOpcode())
+ return SDValue();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ auto *C = dyn_cast<ConstantSDNode>(N1.getOperand(i));
+ if (!C || C->isNullValue() || C->getAPIntValue().getBitWidth() != EltBits)
+ return SDValue();
+ APInt MagicVal;
+ unsigned PreShiftVal, PostShiftVal;
+ bool SelNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
+ PostShiftVal);
+ PreShifts.push_back(DAG.getConstant(PreShiftVal, dl, ShSVT));
+ MagicFactors.push_back(DAG.getConstant(MagicVal, dl, SVT));
+ NPQFactors.push_back(
+ DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
+ : APInt::getNullValue(EltBits),
+ dl, SVT));
+ PostShifts.push_back(DAG.getConstant(PostShiftVal, dl, ShSVT));
+ UseNPQ |= SelNPQ;
+ }
+ PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
+ MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
+ NPQFactor = DAG.getBuildVector(VT, dl, NPQFactors);
+ PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
+ } else {
+ auto *C = dyn_cast<ConstantSDNode>(N1);
+ if (!C || C->isNullValue())
+ return SDValue();
+ APInt MagicVal;
+ unsigned PreShiftVal, PostShiftVal;
+ UseNPQ = BuildUDIVPattern(C->getAPIntValue(), PreShiftVal, MagicVal,
+ PostShiftVal);
+ PreShift = DAG.getConstant(PreShiftVal, dl, ShVT);
+ MagicFactor = DAG.getConstant(MagicVal, dl, VT);
+ PostShift = DAG.getConstant(PostShiftVal, dl, ShVT);
}
- // Multiply the numerator (operand 0) by the magic value
- // FIXME: We should support doing a MUL in a wider type
- if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
- isOperationLegalOrCustom(ISD::MULHU, VT))
- Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
- else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
- isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
- Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
- DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
- else
- return SDValue(); // No mulhu or equivalent
+ SDValue Q = N0;
+ Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
+ Created.push_back(Q.getNode());
+
+ // FIXME: We should support doing a MUL in a wider type.
+ auto GetMULHU = [&](SDValue X, SDValue Y) {
+ if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
+ : isOperationLegalOrCustom(ISD::MULHU, VT))
+ return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
+ if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
+ : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
+ SDValue LoHi =
+ DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
+ return SDValue(LoHi.getNode(), 1);
+ }
+ return SDValue(); // No mulhu or equivalent
+ };
+
+ // Multiply the numerator (operand 0) by the magic value.
+ Q = GetMULHU(Q, MagicFactor);
+ if (!Q)
+ return SDValue();
Created.push_back(Q.getNode());
- if (magics.a == 0) {
- assert(magics.s < Divisor.getBitWidth() &&
- "We shouldn't generate an undefined shift!");
- return DAG.getNode(
- ISD::SRL, dl, VT, Q,
- DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
- } else {
- SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+ if (UseNPQ) {
+ SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
Created.push_back(NPQ.getNode());
- NPQ = DAG.getNode(
- ISD::SRL, dl, VT, NPQ,
- DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
+
+ // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+ // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
+ if (VT.isVector()) {
+ NPQ = GetMULHU(NPQ, NPQFactor);
+ } else {
+ NPQ = DAG.getNode(
+ ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
+ }
Created.push_back(NPQ.getNode());
- NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+
+ Q = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
Created.push_back(NPQ.getNode());
- return DAG.getNode(
- ISD::SRL, dl, VT, NPQ,
- DAG.getConstant(magics.s - 1, dl,
- getShiftAmountTy(NPQ.getValueType(), DL)));
}
+
+ return DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
}
bool TargetLowering::
Modified: llvm/trunk/test/CodeGen/X86/combine-udiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-udiv.ll?rev=339121&r1=339120&r2=339121&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-udiv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-udiv.ll Tue Aug 7 02:51:34 2018
@@ -365,87 +365,32 @@ define <8 x i16> @combine_vec_udiv_unifo
define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_nonuniform:
; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: movzwl %ax, %ecx
-; SSE-NEXT: imull $25645, %ecx, %ecx # imm = 0x642D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $4, %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $61681, %eax, %eax # imm = 0xF0F1
-; SSE-NEXT: shrl $21, %eax
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $8195, %eax, %eax # imm = 0x2003
-; SSE-NEXT: shrl $29, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: imull $9363, %eax, %eax # imm = 0x2493
-; SSE-NEXT: shrl $16, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: pextrw $4, %xmm0, %eax
-; SSE-NEXT: shrl $7, %eax
-; SSE-NEXT: pinsrw $4, %eax, %xmm1
-; SSE-NEXT: pextrw $5, %xmm0, %eax
-; SSE-NEXT: xorl %ecx, %ecx
-; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; SSE-NEXT: sete %cl
-; SSE-NEXT: pinsrw $5, %ecx, %xmm1
-; SSE-NEXT: pextrw $6, %xmm0, %eax
-; SSE-NEXT: imull $32897, %eax, %eax # imm = 0x8081
-; SSE-NEXT: shrl $31, %eax
-; SSE-NEXT: pinsrw $6, %eax, %xmm1
-; SSE-NEXT: pextrw $7, %xmm0, %eax
-; SSE-NEXT: shrl $15, %eax
-; SSE-NEXT: pinsrw $7, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $3, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: movl $32768, %eax # imm = 0x8000
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: pmulhuw %xmm0, %xmm2
+; SSE-NEXT: paddw %xmm1, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = <4096,2048,8,u,u,2,2,u>
+; SSE-NEXT: pmulhuw %xmm2, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6],xmm2[7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movzwl %ax, %ecx
-; AVX-NEXT: imull $25645, %ecx, %ecx # imm = 0x642D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $4, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $61681, %eax, %eax # imm = 0xF0F1
-; AVX-NEXT: shrl $21, %eax
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imull $8195, %eax, %eax # imm = 0x2003
-; AVX-NEXT: shrl $29, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: shrl $3, %eax
-; AVX-NEXT: imull $9363, %eax, %eax # imm = 0x2493
-; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: shrl $7, %eax
-; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %eax
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; AVX-NEXT: sete %cl
-; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %eax
-; AVX-NEXT: imull $32897, %eax, %eax # imm = 0x8081
-; AVX-NEXT: shrl $31, %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %eax
-; AVX-NEXT: shrl $15, %eax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: movl $32768, %eax # imm = 0x8000
+; AVX-NEXT: vmovd %eax, %xmm2
+; AVX-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
; AVX-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
ret <8 x i16> %1
@@ -454,77 +399,20 @@ define <8 x i16> @combine_vec_udiv_nonun
define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_nonuniform2:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $59919, %eax, %eax # imm = 0xEA0F
-; SSE-NEXT: shrl $21, %eax
-; SSE-NEXT: pextrw $0, %xmm0, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: imull $16393, %ecx, %ecx # imm = 0x4009
-; SSE-NEXT: shrl $29, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $58255, %eax, %eax # imm = 0xE38F
-; SSE-NEXT: shrl $21, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $32787, %eax, %eax # imm = 0x8013
-; SSE-NEXT: shrl $31, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: pextrw $4, %xmm0, %eax
-; SSE-NEXT: imull $55189, %eax, %eax # imm = 0xD795
-; SSE-NEXT: shrl $21, %eax
-; SSE-NEXT: pinsrw $4, %eax, %xmm1
-; SSE-NEXT: pextrw $5, %xmm0, %eax
-; SSE-NEXT: imull $8197, %eax, %eax # imm = 0x2005
-; SSE-NEXT: shrl $29, %eax
-; SSE-NEXT: pinsrw $5, %eax, %xmm1
-; SSE-NEXT: pextrw $6, %xmm0, %eax
-; SSE-NEXT: imull $52429, %eax, %eax # imm = 0xCCCD
-; SSE-NEXT: shrl $21, %eax
-; SSE-NEXT: pinsrw $6, %eax, %xmm1
-; SSE-NEXT: pextrw $7, %xmm0, %eax
-; SSE-NEXT: imull $32789, %eax, %eax # imm = 0x8015
-; SSE-NEXT: shrl $31, %eax
-; SSE-NEXT: pinsrw $7, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
+; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform2:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $59919, %eax, %eax # imm = 0xEA0F
-; AVX-NEXT: shrl $21, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: imull $16393, %ecx, %ecx # imm = 0x4009
-; AVX-NEXT: shrl $29, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imull $58255, %eax, %eax # imm = 0xE38F
-; AVX-NEXT: shrl $21, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $32787, %eax, %eax # imm = 0x8013
-; AVX-NEXT: shrl $31, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: imull $55189, %eax, %eax # imm = 0xD795
-; AVX-NEXT: shrl $21, %eax
-; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %eax
-; AVX-NEXT: imull $8197, %eax, %eax # imm = 0x2005
-; AVX-NEXT: shrl $29, %eax
-; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %eax
-; AVX-NEXT: imull $52429, %eax, %eax # imm = 0xCCCD
-; AVX-NEXT: shrl $21, %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %eax
-; AVX-NEXT: imull $32789, %eax, %eax # imm = 0x8015
-; AVX-NEXT: shrl $31, %eax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
ret <8 x i16> %1
@@ -533,157 +421,21 @@ define <8 x i16> @combine_vec_udiv_nonun
define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_nonuniform3:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $4, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movzwl %cx, %edx
-; SSE-NEXT: imull $9363, %edx, %edx # imm = 0x2493
-; SSE-NEXT: shrl $16, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movzwl %cx, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: addl %edx, %ecx
-; SSE-NEXT: shrl $2, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $18351, %eax, %ecx # imm = 0x47AF
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $4, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $12137, %eax, %ecx # imm = 0x2F69
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $4, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: pextrw $4, %xmm0, %eax
-; SSE-NEXT: imull $2115, %eax, %ecx # imm = 0x843
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $4, %eax
-; SSE-NEXT: pinsrw $4, %eax, %xmm1
-; SSE-NEXT: pextrw $5, %xmm0, %eax
-; SSE-NEXT: imull $23705, %eax, %ecx # imm = 0x5C99
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $5, %eax
-; SSE-NEXT: pinsrw $5, %eax, %xmm1
-; SSE-NEXT: pextrw $6, %xmm0, %eax
-; SSE-NEXT: imull $1041, %eax, %ecx # imm = 0x411
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $5, %eax
-; SSE-NEXT: pinsrw $6, %eax, %xmm1
-; SSE-NEXT: pextrw $7, %xmm0, %eax
-; SSE-NEXT: imull $517, %eax, %ecx # imm = 0x205
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movzwl %ax, %eax
-; SSE-NEXT: shrl %eax
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: shrl $6, %eax
-; SSE-NEXT: pinsrw $7, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform3:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $4, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movzwl %cx, %edx
-; AVX-NEXT: imull $9363, %edx, %edx # imm = 0x2493
-; AVX-NEXT: shrl $16, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imull $18351, %eax, %ecx # imm = 0x47AF
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $4, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $12137, %eax, %ecx # imm = 0x2F69
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $4, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: imull $2115, %eax, %ecx # imm = 0x843
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $4, %eax
-; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %eax
-; AVX-NEXT: imull $23705, %eax, %ecx # imm = 0x5C99
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $5, %eax
-; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %eax
-; AVX-NEXT: imull $1041, %eax, %ecx # imm = 0x411
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $5, %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %eax
-; AVX-NEXT: imull $517, %eax, %ecx # imm = 0x205
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $6, %eax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
ret <8 x i16> %1