[llvm] r175619 - I optimized the following patterns:
Muhammad Tauqir
mtahmed at uwaterloo.ca
Sun Feb 24 12:48:44 PST 2013
Hi Elena,
For v8i8 -> v8i64, the code pasted below is produced.
I noticed an apparently unnecessary vpmovzxbw followed by a vpor, even
though (as far as I can see) the first mask could be changed to (0x0,
0x80, 0x80, 0x80, 0x1, 0x80, 0x80, 0x80, 0x2, 0x80, 0x80, 0x80, 0x3,
0x80, 0x80, 0x80), which would avoid the vpmovzxbw + vpor entirely.
Is there a reason it is done this way, or is that unintentional?
.LCPI0_0:
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 1 # 0x1
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 2 # 0x2
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 3 # 0x3
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.LCPI0_1:
.byte 0 # 0x0
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.LCPI0_2:
.byte 4 # 0x4
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 5 # 0x5
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 6 # 0x6
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 7 # 0x7
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.text
.globl func17
.align 16, 0x90
.type func17,@function
func17: # @func17
.cfi_startproc
# BB#0:
vmovq (%rax), %xmm1
vpshufb .LCPI0_0(%rip), %xmm1, %xmm0
vpmovzxbw %xmm1, %xmm2
vpshufb .LCPI0_1(%rip), %xmm2, %xmm2
vpor %xmm0, %xmm2, %xmm0
vpslld $24, %xmm0, %xmm0
vpsrad $24, %xmm0, %xmm0
vpshufb .LCPI0_2(%rip), %xmm1, %xmm1
vpslld $24, %xmm1, %xmm1
vpsrad $24, %xmm1, %xmm2
vpmovsxdq %xmm2, %xmm1
vpmovsxdq %xmm0, %xmm3
vmovdqa %xmm3, (%rax)
vmovhlps %xmm2, %xmm2, %xmm2 # xmm2 = xmm2[1,1]
vpmovsxdq %xmm2, %xmm2
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vmovaps %ymm1, (%rax)
vmovhlps %xmm0, %xmm0, %xmm0 # xmm0 = xmm0[1,1]
vpmovsxdq %xmm0, %xmm0
vmovdqa %xmm0, (%rax)
vzeroupper
ret
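
For reference, this is the kind of IR I assume produces the listing above
(a minimal sketch; the actual test source isn't shown here, so the
function signature and pointer arguments are my own reconstruction):

define void @func17(<8 x i8>* %src, <8 x i64>* %dst) {
  ; load eight i8 elements and sign-extend each one to i64
  %x = load <8 x i8>* %src
  %ext = sext <8 x i8> %x to <8 x i64>
  store <8 x i64> %ext, <8 x i64>* %dst
  ret void
}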
- Muhammad Tauqir
On Wed, Feb 20, 2013 at 7:42 AM, Elena Demikhovsky
<elena.demikhovsky at intel.com> wrote:
> Author: delena
> Date: Wed Feb 20 06:42:54 2013
> New Revision: 175619
>
> URL: http://llvm.org/viewvc/llvm-project?rev=175619&view=rev
> Log:
> I optimized the following patterns:
> sext <4 x i1> to <4 x i64>
> sext <4 x i8> to <4 x i64>
> sext <4 x i16> to <4 x i64>
>
> I run a DAG combine on SIGN_EXTEND_INREG and rewrite the SEXT pattern:
> (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) -> (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
>
> The sext_in_reg (v4i32 x) may be lowered to shl+sar operations.
> There is no vector "sar" for 64-bit elements, so sext_in_reg (v4i64 x) has no direct vector lowering.
>
> I also added the costs of these operations to the AVX cost table.
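
For the 4i16 case (which the new avx-sext.ll tests below don't cover) I
would expect the same shape: the sext_in_reg is done on <4 x i32> with a
16-bit shl/sra pair and the result is widened with two vpmovsxdq. A
hypothetical test in the same style as the ones below:

define <4 x i64> @sext_4i16_to_4i64(<4 x i16> %x) {
  ; expected on AVX: vpslld $16 + vpsrad $16 on the 128-bit half, then
  ; two vpmovsxdq (+ vinsertf128) to build the 256-bit result
  %ext = sext <4 x i16> %x to <4 x i64>
  ret <4 x i64> %ext
}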
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
> llvm/trunk/test/Analysis/CostModel/X86/cast.ll
> llvm/trunk/test/CodeGen/X86/avx-sext.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Feb 20 06:42:54 2013
> @@ -1323,6 +1323,7 @@ X86TargetLowering::X86TargetLowering(X86
> setTargetDAGCombine(ISD::ZERO_EXTEND);
> setTargetDAGCombine(ISD::ANY_EXTEND);
> setTargetDAGCombine(ISD::SIGN_EXTEND);
> + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
> setTargetDAGCombine(ISD::TRUNCATE);
> setTargetDAGCombine(ISD::SINT_TO_FP);
> setTargetDAGCombine(ISD::SETCC);
> @@ -17076,6 +17077,41 @@ static SDValue PerformVZEXT_MOVLCombine(
> return SDValue();
> }
>
> +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
> +                                               const X86Subtarget *Subtarget) {
> +  EVT VT = N->getValueType(0);
> +  if (!VT.isVector())
> +    return SDValue();
> +
> +  SDValue N0 = N->getOperand(0);
> +  SDValue N1 = N->getOperand(1);
> +  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
> +  DebugLoc dl = N->getDebugLoc();
> +
> +  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE
> +  // and AVX2 since there is no arithmetic shift right operation on a
> +  // vector with 64-bit elements.
> +  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
> +  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
> +  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
> +                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
> +    SDValue N00 = N0.getOperand(0);
> +
> +    // EXTLOAD has a better solution on AVX2;
> +    // it may be replaced with an X86ISD::VSEXT node.
> +    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
> +      if (!ISD::isNormalLoad(N00.getNode()))
> +        return SDValue();
> +
> +    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
> +      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
> +                                N00, N1);
> +      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
> +    }
> +  }
> +  return SDValue();
> +}
> +
> static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> @@ -17468,6 +17504,7 @@ SDValue X86TargetLowering::PerformDAGCom
> case ISD::ANY_EXTEND:
> case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
> case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
> + case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
> case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
> case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
> case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
>
> Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed Feb 20 06:42:54 2013
> @@ -232,6 +232,9 @@ unsigned X86TTI::getCastInstrCost(unsign
> { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
> { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
> { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
> + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
> + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
> + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 },
> { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
> };
>
>
> Modified: llvm/trunk/test/Analysis/CostModel/X86/cast.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/cast.ll?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/test/Analysis/CostModel/X86/cast.ll (original)
> +++ llvm/trunk/test/Analysis/CostModel/X86/cast.ll Wed Feb 20 06:42:54 2013
> @@ -44,6 +44,10 @@ define i32 @zext_sext(<8 x i1> %in) {
> %B = zext <8 x i16> undef to <8 x i32>
> ;CHECK: cost of 1 {{.*}} sext
> %C = sext <4 x i32> undef to <4 x i64>
> + ;CHECK: cost of 8 {{.*}} sext
> + %C1 = sext <4 x i8> undef to <4 x i64>
> + ;CHECK: cost of 8 {{.*}} sext
> + %C2 = sext <4 x i16> undef to <4 x i64>
>
> ;CHECK: cost of 1 {{.*}} zext
> %D = zext <4 x i32> undef to <4 x i64>
> @@ -59,7 +63,7 @@ define i32 @zext_sext(<8 x i1> %in) {
> ret i32 undef
> }
>
> -define i32 @masks(<8 x i1> %in) {
> +define i32 @masks8(<8 x i1> %in) {
> ;CHECK: cost of 6 {{.*}} zext
> %Z = zext <8 x i1> %in to <8 x i32>
> ;CHECK: cost of 9 {{.*}} sext
> @@ -67,3 +71,9 @@ define i32 @masks(<8 x i1> %in) {
> ret i32 undef
> }
>
> +define i32 @masks4(<4 x i1> %in) {
> + ;CHECK: cost of 8 {{.*}} sext
> + %S = sext <4 x i1> %in to <4 x i64>
> + ret i32 undef
> +}
> +
>
> Modified: llvm/trunk/test/CodeGen/X86/avx-sext.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-sext.ll?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx-sext.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx-sext.ll Wed Feb 20 06:42:54 2013
> @@ -142,3 +142,26 @@ define <8 x i16> @load_sext_test6(<8 x i
> %Y = sext <8 x i8> %X to <8 x i16>
> ret <8 x i16>%Y
> }
> +
> +; AVX: sext_4i1_to_4i64
> +; AVX: vpslld $31
> +; AVX: vpsrad $31
> +; AVX: vpmovsxdq
> +; AVX: vpmovsxdq
> +; AVX: ret
> +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
> + %extmask = sext <4 x i1> %mask to <4 x i64>
> + ret <4 x i64> %extmask
> +}
> +
> +; AVX: sext_4i8_to_4i64
> +; AVX: vpslld $24
> +; AVX: vpsrad $24
> +; AVX: vpmovsxdq
> +; AVX: vpmovsxdq
> +; AVX: ret
> +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
> + %extmask = sext <4 x i8> %mask to <4 x i64>
> + ret <4 x i64> %extmask
> +}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits