[llvm] r175619 - I optimized the following patterns:
Muhammad Tauqir
mtahmed at uwaterloo.ca
Sun Feb 24 12:48:44 PST 2013
Hi Elena,
For v8i8 -> v8i64, the code pasted below is produced.
I noticed an apparently unnecessary vpmovzxbw followed by a vpor, even
though (as far as I can see) the first mask could be changed to (0x0,
0x80, 0x80, 0x80, 0x1, 0x80, 0x80, 0x80, 0x2, 0x80, 0x80, 0x80, 0x3,
0x80, 0x80, 0x80), which would avoid the vpmovzxbw + vpor entirely.
Is there a reason it is done this way, or is that unintentional?
.LCPI0_0:
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 1 # 0x1
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 2 # 0x2
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 3 # 0x3
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.LCPI0_1:
.byte 0 # 0x0
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.LCPI0_2:
.byte 4 # 0x4
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 5 # 0x5
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 6 # 0x6
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.byte 7 # 0x7
.byte 128 # 0x80
.byte 128 # 0x80
.byte 128 # 0x80
.text
.globl func17
.align 16, 0x90
.type func17,@function
func17: # @func17
.cfi_startproc
# BB#0:
vmovq (%rax), %xmm1
vpshufb .LCPI0_0(%rip), %xmm1, %xmm0
vpmovzxbw %xmm1, %xmm2
vpshufb .LCPI0_1(%rip), %xmm2, %xmm2
vpor %xmm0, %xmm2, %xmm0
vpslld $24, %xmm0, %xmm0
vpsrad $24, %xmm0, %xmm0
vpshufb .LCPI0_2(%rip), %xmm1, %xmm1
vpslld $24, %xmm1, %xmm1
vpsrad $24, %xmm1, %xmm2
vpmovsxdq %xmm2, %xmm1
vpmovsxdq %xmm0, %xmm3
vmovdqa %xmm3, (%rax)
vmovhlps %xmm2, %xmm2, %xmm2 # xmm2 = xmm2[1,1]
vpmovsxdq %xmm2, %xmm2
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vmovaps %ymm1, (%rax)
vmovhlps %xmm0, %xmm0, %xmm0 # xmm0 = xmm0[1,1]
vpmovsxdq %xmm0, %xmm0
vmovdqa %xmm0, (%rax)
vzeroupper
ret
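
For reference, this is the kind of IR I assume produces the listing above
(a minimal sketch; the actual test source isn't shown here, so the
function signature and pointer arguments are my own reconstruction):

define void @func17(<8 x i8>* %src, <8 x i64>* %dst) {
  ; load eight i8 elements and sign-extend each one to i64
  %x = load <8 x i8>* %src
  %ext = sext <8 x i8> %x to <8 x i64>
  store <8 x i64> %ext, <8 x i64>* %dst
  ret void
}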
- Muhammad Tauqir
On Wed, Feb 20, 2013 at 7:42 AM, Elena Demikhovsky
<elena.demikhovsky at intel.com> wrote:
> Author: delena
> Date: Wed Feb 20 06:42:54 2013
> New Revision: 175619
>
> URL: http://llvm.org/viewvc/llvm-project?rev=175619&view=rev
> Log:
> I optimized the following patterns:
> sext <4 x i1> to <4 x i64>
> sext <4 x i8> to <4 x i64>
> sext <4 x i16> to <4 x i64>
>
> I run a DAG combine on SIGN_EXTEND_INREG and rewrite the SEXT pattern:
> (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) -> (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
>
> The sext_in_reg (v4i32 x) may be lowered to shl+sar operations.
> There is no vector "sar" for 64-bit elements, so sext_in_reg (v4i64 x) has no direct vector lowering.
>
> I also added the costs of these operations to the AVX cost table.
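
For the 4i16 case (which the new avx-sext.ll tests below don't cover) I
would expect the same shape: the sext_in_reg is done on <4 x i32> with a
16-bit shl/sra pair and the result is widened with two vpmovsxdq. A
hypothetical test in the same style as the ones below:

define <4 x i64> @sext_4i16_to_4i64(<4 x i16> %x) {
  ; expected on AVX: vpslld $16 + vpsrad $16 on the 128-bit half, then
  ; two vpmovsxdq (+ vinsertf128) to build the 256-bit result
  %ext = sext <4 x i16> %x to <4 x i64>
  ret <4 x i64> %ext
}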
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
> llvm/trunk/test/Analysis/CostModel/X86/cast.ll
> llvm/trunk/test/CodeGen/X86/avx-sext.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Feb 20 06:42:54 2013
> @@ -1323,6 +1323,7 @@ X86TargetLowering::X86TargetLowering(X86
> setTargetDAGCombine(ISD::ZERO_EXTEND);
> setTargetDAGCombine(ISD::ANY_EXTEND);
> setTargetDAGCombine(ISD::SIGN_EXTEND);
> + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
> setTargetDAGCombine(ISD::TRUNCATE);
> setTargetDAGCombine(ISD::SINT_TO_FP);
> setTargetDAGCombine(ISD::SETCC);
> @@ -17076,6 +17077,41 @@ static SDValue PerformVZEXT_MOVLCombine(
> return SDValue();
> }
>
> +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
> +                                               const X86Subtarget *Subtarget) {
> +  EVT VT = N->getValueType(0);
> +  if (!VT.isVector())
> +    return SDValue();
> +
> +  SDValue N0 = N->getOperand(0);
> +  SDValue N1 = N->getOperand(1);
> +  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
> +  DebugLoc dl = N->getDebugLoc();
> +
> +  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE
> +  // and AVX2 since there is no arithmetic shift right operation on a
> +  // vector with 64-bit elements.
> +  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
> +  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
> +  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
> +                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
> +    SDValue N00 = N0.getOperand(0);
> +
> +    // EXTLOAD has a better solution on AVX2;
> +    // it may be replaced with an X86ISD::VSEXT node.
> +    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
> +      if (!ISD::isNormalLoad(N00.getNode()))
> +        return SDValue();
> +
> +    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
> +      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
> +                                N00, N1);
> +      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
> +    }
> +  }
> +  return SDValue();
> +}
> +
> static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
> TargetLowering::DAGCombinerInfo &DCI,
> const X86Subtarget *Subtarget) {
> @@ -17468,6 +17504,7 @@ SDValue X86TargetLowering::PerformDAGCom
> case ISD::ANY_EXTEND:
> case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
> case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
> + case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
> case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
> case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
> case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
>
> Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Wed Feb 20 06:42:54 2013
> @@ -232,6 +232,9 @@ unsigned X86TTI::getCastInstrCost(unsign
> { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
> { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
> { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
> + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
> + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
> + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 },
> { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
> };
>
>
> Modified: llvm/trunk/test/Analysis/CostModel/X86/cast.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/cast.ll?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/test/Analysis/CostModel/X86/cast.ll (original)
> +++ llvm/trunk/test/Analysis/CostModel/X86/cast.ll Wed Feb 20 06:42:54 2013
> @@ -44,6 +44,10 @@ define i32 @zext_sext(<8 x i1> %in) {
> %B = zext <8 x i16> undef to <8 x i32>
> ;CHECK: cost of 1 {{.*}} sext
> %C = sext <4 x i32> undef to <4 x i64>
> + ;CHECK: cost of 8 {{.*}} sext
> + %C1 = sext <4 x i8> undef to <4 x i64>
> + ;CHECK: cost of 8 {{.*}} sext
> + %C2 = sext <4 x i16> undef to <4 x i64>
>
> ;CHECK: cost of 1 {{.*}} zext
> %D = zext <4 x i32> undef to <4 x i64>
> @@ -59,7 +63,7 @@ define i32 @zext_sext(<8 x i1> %in) {
> ret i32 undef
> }
>
> -define i32 @masks(<8 x i1> %in) {
> +define i32 @masks8(<8 x i1> %in) {
> ;CHECK: cost of 6 {{.*}} zext
> %Z = zext <8 x i1> %in to <8 x i32>
> ;CHECK: cost of 9 {{.*}} sext
> @@ -67,3 +71,9 @@ define i32 @masks(<8 x i1> %in) {
> ret i32 undef
> }
>
> +define i32 @masks4(<4 x i1> %in) {
> + ;CHECK: cost of 8 {{.*}} sext
> + %S = sext <4 x i1> %in to <4 x i64>
> + ret i32 undef
> +}
> +
>
> Modified: llvm/trunk/test/CodeGen/X86/avx-sext.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-sext.ll?rev=175619&r1=175618&r2=175619&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx-sext.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx-sext.ll Wed Feb 20 06:42:54 2013
> @@ -142,3 +142,26 @@ define <8 x i16> @load_sext_test6(<8 x i
> %Y = sext <8 x i8> %X to <8 x i16>
> ret <8 x i16>%Y
> }
> +
> +; AVX: sext_4i1_to_4i64
> +; AVX: vpslld $31
> +; AVX: vpsrad $31
> +; AVX: vpmovsxdq
> +; AVX: vpmovsxdq
> +; AVX: ret
> +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
> + %extmask = sext <4 x i1> %mask to <4 x i64>
> + ret <4 x i64> %extmask
> +}
> +
> +; AVX: sext_4i8_to_4i64
> +; AVX: vpslld $24
> +; AVX: vpsrad $24
> +; AVX: vpmovsxdq
> +; AVX: vpmovsxdq
> +; AVX: ret
> +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
> + %extmask = sext <4 x i8> %mask to <4 x i64>
> + ret <4 x i64> %extmask
> +}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits