[llvm] r232682 - [X86][SSE] Avoid scalarization of v2i64 vector shifts (REAPPLIED)

Andrea Di Biagio andrea.dibiagio at gmail.com
Wed Mar 18 15:49:54 PDT 2015


Hi Simon,
If you really want to check the entire sequence, you should force an
x86_64-unknown-unknown triple on the test: without an explicit triple,
llc falls back to the host's default target, which can change the exact
instruction sequence and register allocation, and that is likely why
the movdqa operands differ in Rafael's log (%xmm0 instead of %xmm1).
Alternatively, I suggest matching only the two expected psrlq
instructions with a pair of CHECK-DAG directives.
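As a rough sketch of what I mean for shr2_nosplat (this assumes the
RUN line switches from -march to an explicit -mtriple, and it
deliberately leaves the register numbering unchecked):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s

define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
entry:
; CHECK-LABEL: shr2_nosplat
; CHECK-DAG:   psrlq $8
; CHECK-DAG:   psrlq $1
; CHECK:       ret
  %B = lshr <2 x i64> %A, < i64 8, i64 1>
  %C = lshr <2 x i64> %A, < i64 1, i64 0>
  %K = xor <2 x i64> %B, %C
  ret <2 x i64> %K
}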

I hope this helps.

On Wed, Mar 18, 2015 at 10:26 PM, Rafael Espíndola
<rafael.espindola at gmail.com> wrote:
> I am still getting an error with this:
>
> ******************** TEST 'LLVM :: CodeGen/X86/x86-shifts.ll' FAILED
> ********************
> Script:
> --
> /home/espindola/llvm/build/./bin/llc <
> /home/espindola/llvm/llvm/test/CodeGen/X86/x86-shifts.ll -march=x86-64
> -mattr=sse2 | /home/espindola/llvm/build/./bin/FileCheck
> /home/espindola/llvm/llvm/test/CodeGen/X86/x86-shifts.ll
> --
> Exit Code: 1
>
> Command Output (stderr):
> --
> /home/espindola/llvm/llvm/test/CodeGen/X86/x86-shifts.ll:122:10:
> error: expected string not found in input
> ; CHECK: movdqa %xmm1, %xmm2
>          ^
> <stdin>:142:2: note: scanning from here
>  .align 16, 0x90
>  ^
> <stdin>:148:2: note: possible intended match here
>  movdqa %xmm0, %xmm2
>  ^
>
> --
>
> On 18 March 2015 at 18:18, Simon Pilgrim <llvm-dev at redking.me.uk> wrote:
>> Author: rksimon
>> Date: Wed Mar 18 17:18:51 2015
>> New Revision: 232682
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=232682&view=rev
>> Log:
>> [X86][SSE] Avoid scalarization of v2i64 vector shifts (REAPPLIED)
>>
>> Fixed broken tests.
>>
>> Differential Revision: http://reviews.llvm.org/D8416
>>
>> Modified:
>>     llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>>     llvm/trunk/test/Analysis/CostModel/X86/testshiftlshr.ll
>>     llvm/trunk/test/Analysis/CostModel/X86/testshiftshl.ll
>>     llvm/trunk/test/CodeGen/X86/vshift-4.ll
>>     llvm/trunk/test/CodeGen/X86/x86-shifts.ll
>>
>> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=232682&r1=232681&r2=232682&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Mar 18 17:18:51 2015
>> @@ -5906,7 +5906,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(S
>>    return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
>>  }
>>
>> -static SDValue LowerCONCAT_VECTORS(SDValue Op,
>> +static SDValue LowerCONCAT_VECTORS(SDValue Op,
>>                                     const X86Subtarget *Subtarget,
>>                                     SelectionDAG &DAG) {
>>    MVT VT = Op.getSimpleValueType();
>> @@ -13255,11 +13255,11 @@ SDValue X86TargetLowering::LowerSELECT(S
>>        // If we have AVX, we can use a variable vector select (VBLENDV) instead
>>        // of 3 logic instructions for size savings and potentially speed.
>>        // Unfortunately, there is no scalar form of VBLENDV.
>> -
>> +
>>        // If either operand is a constant, don't try this. We can expect to
>>        // optimize away at least one of the logic instructions later in that
>>        // case, so that sequence would be faster than a variable blend.
>> -
>> +
>>        // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
>>        // uses XMM0 as the selection register. That may need just as many
>>        // instructions as the AND/ANDN/OR sequence due to register moves, so
>> @@ -13267,10 +13267,10 @@ SDValue X86TargetLowering::LowerSELECT(S
>>
>>        if (Subtarget->hasAVX() &&
>>            !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
>> -
>> +
>>          // Convert to vectors, do a VSELECT, and convert back to scalar.
>>          // All of the conversions should be optimized away.
>> -
>> +
>>          EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
>>          SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
>>          SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
>> @@ -13278,9 +13278,9 @@ SDValue X86TargetLowering::LowerSELECT(S
>>
>>          EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
>>          VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
>> -
>> +
>>          SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
>> -
>> +
>>          return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
>>                             VSel, DAG.getIntPtrConstant(0));
>>        }
>> @@ -16189,6 +16189,17 @@ static SDValue LowerShift(SDValue Op, co
>>        return Op;
>>    }
>>
>> +  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
>> +  // shifts per-lane and then shuffle the partial results back together.
>> +  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
>> +    // Splat the shift amounts so the scalar shifts above will catch it.
>> +    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
>> +    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
>> +    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
>> +    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
>> +    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
>> +  }
>> +
>>    // If possible, lower this packed shift into a vector multiply instead of
>>    // expanding it into a sequence of scalar shifts.
>>    // Do this only if the vector shift count is a constant build_vector.
>> @@ -21960,7 +21971,7 @@ static SDValue VectorZextCombine(SDNode
>>    // an and with a mask.
>>    // We'd like to try to combine that into a shuffle with zero
>>    // plus a bitcast, removing the and.
>> -  if (N0.getOpcode() != ISD::BITCAST ||
>> +  if (N0.getOpcode() != ISD::BITCAST ||
>>        N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
>>      return SDValue();
>>
>> @@ -21990,7 +22001,7 @@ static SDValue VectorZextCombine(SDNode
>>
>>    unsigned ResSize = N1.getValueType().getScalarSizeInBits();
>>    // Make sure the splat matches the mask we expect
>> -  if (SplatBitSize > ResSize ||
>> +  if (SplatBitSize > ResSize ||
>>        (SplatValue + 1).exactLogBase2() != (int)SrcSize)
>>      return SDValue();
>>
>> @@ -22948,7 +22959,7 @@ static SDValue PerformFANDCombine(SDNode
>>    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
>>      if (C->getValueAPF().isPosZero())
>>        return N->getOperand(1);
>> -
>> +
>>    return SDValue();
>>  }
>>
>> @@ -23222,7 +23233,7 @@ static SDValue PerformISDSETCCCombine(SD
>>          return DAG.getConstant(1, VT);
>>        if (CC == ISD::SETEQ || CC == ISD::SETGE)
>>          return DAG.getNOT(DL, LHS.getOperand(0), VT);
>> -
>> +
>>        assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
>>               "Unexpected condition code!");
>>        return LHS.getOperand(0);
>> @@ -23264,7 +23275,7 @@ static SDValue PerformINSERTPSCombine(SD
>>      // countS and just gets an f32 from that address.
>>      unsigned DestIndex =
>>          cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
>> -
>> +
>>      Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
>>
>>      // Create this as a scalar to vector to match the instruction pattern.
>> @@ -23288,7 +23299,7 @@ static SDValue PerformBLENDICombine(SDNo
>>    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
>>    // x86InstrInfo knows how to commute this back after instruction selection
>>    // if it would help register allocation.
>> -
>> +
>>    // TODO: If optimizing for size or a processor that doesn't suffer from
>>    // partial register update stalls, this should be transformed into a MOVSD
>>    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
>>
>> Modified: llvm/trunk/test/Analysis/CostModel/X86/testshiftlshr.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/testshiftlshr.ll?rev=232682&r1=232681&r2=232682&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Analysis/CostModel/X86/testshiftlshr.ll (original)
>> +++ llvm/trunk/test/Analysis/CostModel/X86/testshiftlshr.ll Wed Mar 18 17:18:51 2015
>> @@ -7,7 +7,7 @@ entry:
>>    ; SSE2: shift2i16
>>    ; SSE2: cost of 20 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift2i16
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype %a , %b
>>    ret %shifttype %0
>> @@ -67,7 +67,7 @@ entry:
>>    ; SSE2: shift2i32
>>    ; SSE2: cost of 20 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift2i32
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype2i32 %a , %b
>>    ret %shifttype2i32 %0
>> @@ -127,7 +127,7 @@ entry:
>>    ; SSE2: shift2i64
>>    ; SSE2: cost of 20 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift2i64
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype2i64 %a , %b
>>    ret %shifttype2i64 %0
>> @@ -139,7 +139,7 @@ entry:
>>    ; SSE2: shift4i64
>>    ; SSE2: cost of 40 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift4i64
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype4i64 %a , %b
>>    ret %shifttype4i64 %0
>> @@ -151,7 +151,7 @@ entry:
>>    ; SSE2: shift8i64
>>    ; SSE2: cost of 80 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift8i64
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype8i64 %a , %b
>>    ret %shifttype8i64 %0
>> @@ -163,7 +163,7 @@ entry:
>>    ; SSE2: shift16i64
>>    ; SSE2: cost of 160 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift16i64
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype16i64 %a , %b
>>    ret %shifttype16i64 %0
>> @@ -175,7 +175,7 @@ entry:
>>    ; SSE2: shift32i64
>>    ; SSE2: cost of 320 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift32i64
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype32i64 %a , %b
>>    ret %shifttype32i64 %0
>> @@ -187,7 +187,7 @@ entry:
>>    ; SSE2: shift2i8
>>    ; SSE2: cost of 20 {{.*}} lshr
>>    ; SSE2-CODEGEN: shift2i8
>> -  ; SSE2-CODEGEN: shrq %cl
>> +  ; SSE2-CODEGEN: psrlq
>>
>>    %0 = lshr %shifttype2i8 %a , %b
>>    ret %shifttype2i8 %0
>>
>> Modified: llvm/trunk/test/Analysis/CostModel/X86/testshiftshl.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/testshiftshl.ll?rev=232682&r1=232681&r2=232682&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Analysis/CostModel/X86/testshiftshl.ll (original)
>> +++ llvm/trunk/test/Analysis/CostModel/X86/testshiftshl.ll Wed Mar 18 17:18:51 2015
>> @@ -7,7 +7,7 @@ entry:
>>    ; SSE2: shift2i16
>>    ; SSE2: cost of 20 {{.*}} shl
>>    ; SSE2-CODEGEN: shift2i16
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype %a , %b
>>    ret %shifttype %0
>> @@ -67,7 +67,7 @@ entry:
>>    ; SSE2: shift2i32
>>    ; SSE2: cost of 20 {{.*}} shl
>>    ; SSE2-CODEGEN: shift2i32
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype2i32 %a , %b
>>    ret %shifttype2i32 %0
>> @@ -127,7 +127,7 @@ entry:
>>    ; SSE2: shift2i64
>>    ; SSE2: cost of 20 {{.*}} shl
>>    ; SSE2-CODEGEN: shift2i64
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype2i64 %a , %b
>>    ret %shifttype2i64 %0
>> @@ -139,7 +139,7 @@ entry:
>>    ; SSE2: shift4i64
>>    ; SSE2: cost of 40 {{.*}} shl
>>    ; SSE2-CODEGEN: shift4i64
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype4i64 %a , %b
>>    ret %shifttype4i64 %0
>> @@ -151,7 +151,7 @@ entry:
>>    ; SSE2: shift8i64
>>    ; SSE2: cost of 80 {{.*}} shl
>>    ; SSE2-CODEGEN: shift8i64
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype8i64 %a , %b
>>    ret %shifttype8i64 %0
>> @@ -163,7 +163,7 @@ entry:
>>    ; SSE2: shift16i64
>>    ; SSE2: cost of 160 {{.*}} shl
>>    ; SSE2-CODEGEN: shift16i64
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype16i64 %a , %b
>>    ret %shifttype16i64 %0
>> @@ -175,7 +175,7 @@ entry:
>>    ; SSE2: shift32i64
>>    ; SSE2: cost of 320 {{.*}} shl
>>    ; SSE2-CODEGEN: shift32i64
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype32i64 %a , %b
>>    ret %shifttype32i64 %0
>> @@ -187,7 +187,7 @@ entry:
>>    ; SSE2: shift2i8
>>    ; SSE2: cost of 20 {{.*}} shl
>>    ; SSE2-CODEGEN: shift2i8
>> -  ; SSE2-CODEGEN: shlq %cl
>> +  ; SSE2-CODEGEN: psllq
>>
>>    %0 = shl %shifttype2i8 %a , %b
>>    ret %shifttype2i8 %0
>>
>> Modified: llvm/trunk/test/CodeGen/X86/vshift-4.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vshift-4.ll?rev=232682&r1=232681&r2=232682&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/vshift-4.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/vshift-4.ll Wed Mar 18 17:18:51 2015
>> @@ -13,11 +13,16 @@ entry:
>>    ret void
>>  }
>>
>> -; shift1b can't use a packed shift
>> +; shift1b can't use a packed shift but can shift lanes separately and shuffle back together
>>  define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
>>  entry:
>>  ; CHECK-LABEL: shift1b:
>> -; CHECK: shll
>> +; CHECK:       pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
>> +; CHECK-NEXT:  movdqa %xmm0, %xmm3
>> +; CHECK-NEXT:  psllq  %xmm2, %xmm3
>> +; CHECK-NEXT:  movq   {{.*#+}} xmm1 = xmm1[0],zero
>> +; CHECK-NEXT:  psllq  %xmm1, %xmm0
>> +; CHECK-NEXT:  movsd  {{.*#+}} xmm3 = xmm0[0],xmm3[1]
>>    %shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
>>    %shl = shl <2 x i64> %val, %shamt
>>    store <2 x i64> %shl, <2 x i64>* %dst
>>
>> Modified: llvm/trunk/test/CodeGen/X86/x86-shifts.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-shifts.ll?rev=232682&r1=232681&r2=232682&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/x86-shifts.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/x86-shifts.ll Wed Mar 18 17:18:51 2015
>> @@ -118,10 +118,15 @@ entry:
>>
>>  define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
>>  entry:
>> -; CHECK: shr2_nosplat
>> -; CHECK-NOT:  psrlq
>> -; CHECK-NOT:  psrlq
>> -; CHECK:      ret
>> +; CHECK-LABEL: shr2_nosplat
>> +; CHECK:       movdqa %xmm1, %xmm2
>> +; CHECK-NEXT:  psrlq  $8, %xmm2
>> +; CHECK-NEXT:  movdqa %xmm1, %xmm0
>> +; CHECK-NEXT:  psrlq  $1, %xmm0
>> +; CHECK-NEXT:  movsd  {{.*#+}} xmm1 = xmm0[0],xmm1[1]
>> +; CHECK-NEXT:  movsd  {{.*#+}} xmm0 = xmm2[0],xmm0[1]
>> +; CHECK-NEXT:  xorpd  %xmm1, %xmm0
>> +; CHECK-NEXT:  ret
>>    %B = lshr <2 x i64> %A,  < i64 8, i64 1>
>>    %C = lshr <2 x i64> %A,  < i64 1, i64 0>
>>    %K = xor <2 x i64> %B, %C
>>
>>