[llvm] r226953 - [x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector

Bruno Cardoso Lopes bruno.cardoso at gmail.com
Tue Jan 27 16:16:49 PST 2015


No problem, Alexey; my bad, I lost track of the builders. Thanks for the
testcase :-)

On Tue, Jan 27, 2015 at 7:37 PM, Alexey Samsonov <vonosmas at gmail.com> wrote:
> Sorry, I had to revert this commit in r227248. It caused the following
> failure:
>
> $ cat a.cc
> #include <emmintrin.h>
> #include <stdio.h>
>
> typedef unsigned short U2;
> typedef unsigned int U4;
>
> typedef U4 V2x32 __attribute__((__vector_size__(8)));
> typedef U2 V4x16 __attribute__((__vector_size__(8)));
>
> int main() {
>   V4x16 a = {0, 1, 2, 3};
>   V4x16 b = {100, 101, 102, 103};
>   V2x32 c = _mm_madd_pi16(a, b);
>
>   printf("%d %d\n", c[0], c[1]);
>   return 0;
> }
>
> $ ./bin/clang++ a.cc && ./a.out
> 101 0
>
> (expected result is 101 513)
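>
> For reference, pmaddwd multiplies the four signed 16-bit pairs and adds
> adjacent products into two 32-bit lanes, so the expected vector is
> {0*100 + 1*101, 2*102 + 3*103} = {101, 513}. A minimal scalar model of that
> semantics (a sketch for reference, not the intrinsic itself):
>
> #include <cstdint>
> #include <cstdio>
>
> int main() {
>   int16_t a[4] = {0, 1, 2, 3};
>   int16_t b[4] = {100, 101, 102, 103};
>   int32_t c[2];
>   // pmaddwd: multiply signed 16-bit pairs, add adjacent products.
>   for (int i = 0; i < 2; ++i)
>     c[i] = int32_t(a[2 * i]) * b[2 * i] + int32_t(a[2 * i + 1]) * b[2 * i + 1];
>   printf("%d %d\n", c[0], c[1]); // prints: 101 513
>   return 0;
> }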
>
> This caused test failures in the MSan test suite:
> http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux/builds/15572/steps/run%20msan%20unit%20tests/logs/stdio
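>
> Judging from the wrong output, what presumably goes wrong is that the
> combine below forwards the whole 64-bit stored value into lane 0 via
> scalar_to_vector, while the extending load it replaces (v2i32 -> v2i64
> here) must widen each 32-bit element separately. A small sketch of the two
> interpretations (an illustration of the suspected mechanism, not the
> compiler code):
>
> #include <cstdint>
> #include <cstdio>
>
> int main() {
>   // The 64-bit pmaddwd result: {101, 513} as two u32 lanes (little-endian).
>   uint64_t v = ((uint64_t)513 << 32) | 101;
>   // Per-element widening, the meaning of the extending load:
>   uint64_t widened[2] = {v & 0xffffffffu, v >> 32};
>   // Whole-value forwarding, what scalar_to_vector(i64) yields:
>   uint64_t forwarded[2] = {v, 0};
>   printf("%u %u\n", (unsigned)widened[0], (unsigned)widened[1]);     // 101 513
>   printf("%u %u\n", (unsigned)forwarded[0], (unsigned)forwarded[1]); // 101 0
>   return 0;
> }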
>
>
>
> On Fri, Jan 23, 2015 at 2:44 PM, Bruno Cardoso Lopes
> <bruno.cardoso at gmail.com> wrote:
>>
>> Author: bruno
>> Date: Fri Jan 23 16:44:16 2015
>> New Revision: 226953
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=226953&view=rev
>> Log:
>> [x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector
>>
>> Handle the poor codegen for i64/x86mmx->v2i64 (%mm -> %xmm) moves. Instead
>> of using a stack store/load pair to do the job, use scalar_to_vector
>> directly, which in the MMX case can use movq2dq. This was the behavior
>> prior to the improvements for vector legalization of extloads in r213897.
>>
>> This commit fixes the regression and, as a side effect, also removes some
>> unnecessary shuffles.
>>
>> In the newly attached testcase, we go from:
>>
>> pshufw  $-18, (%rdi), %mm0
>> movq    %mm0, -8(%rsp)
>> movq    -8(%rsp), %xmm0
>> pshufd  $-44, %xmm0, %xmm0
>> movd    %xmm0, %eax
>> ...
>>
>> To:
>>
>> pshufw  $-18, (%rdi), %mm0
>> movq2dq %mm0, %xmm0
>> movd    %xmm0, %eax
>> ...
>>
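>> For context, the mmx-to-xmm move enabled here is also exposed directly as
>> an SSE2 intrinsic. A minimal example that should lower to movq2dq on an
>> SSE2 target (illustrative only, not part of the patch):
>>
>> #include <emmintrin.h> // SSE2; pulls in __m64 via mmintrin.h
>>
>> // Move the 64-bit MMX value into the low lane of an XMM register,
>> // zeroing the upper lane: exactly the movq2dq pattern.
>> __m128i mmx_to_xmm(__m64 v) {
>>   return _mm_movpi64_epi64(v);
>> }
>>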
>> Differential Revision: http://reviews.llvm.org/D7126
>> rdar://problem/19413324
>>
>> Added:
>>     llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll
>> Modified:
>>     llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>>     llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll
>>     llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
>>     llvm/trunk/test/CodeGen/X86/widen_load-2.ll
>>
>> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=226953&r1=226952&r2=226953&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jan 23 16:44:16 2015
>> @@ -24757,6 +24757,8 @@ static SDValue PerformLOADCombine(SDNode
>>    LoadSDNode *Ld = cast<LoadSDNode>(N);
>>    EVT RegVT = Ld->getValueType(0);
>>    EVT MemVT = Ld->getMemoryVT();
>> +  SDValue Ptr   = Ld->getBasePtr();
>> +  SDValue Chain = Ld->getChain();
>>    SDLoc dl(Ld);
>>    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
>>
>> @@ -24795,6 +24797,33 @@ static SDValue PerformLOADCombine(SDNode
>>      return DCI.CombineTo(N, NewVec, TF, true);
>>    }
>>
>> +  // Conversion from x86mmx/i64 to v2i64 types is often done via stack
>> +  // store/load. Under certain conditions we can bypass the memory access and
>> +  // combine this load to use a scalar_to_vector instead. This leads to
>> +  // a reduction in the stack use, redundant emission of shuffles and create
>> +  // isel matching candidates for movq2dq instructions.
>> +  if (RegVT == MVT::v2i64 && Subtarget->hasSSE2() && Ext == ISD::EXTLOAD &&
>> +      !Ld->isVolatile() && ISD::isNON_TRUNCStore(Chain.getNode())) {
>> +
>> +    // If this load is directly stored, get the original source value.
>> +    StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
>> +    EVT SrcTy = PrevST->getValue().getValueType();
>> +    if (PrevST->getBasePtr() != Ptr ||
>> +        !(SrcTy == MVT::i64 || SrcTy == MVT::x86mmx))
>> +      return SDValue();
>> +    SDValue SrcVal = Chain.getOperand(1);
>> +
>> +    // On 32bit systems, we can't save 64bit integers, use f64 instead.
>> +    bool Usef64 = TLI.isTypeLegal(MVT::f64) && !Subtarget->is64Bit();
>> +    if (Usef64)
>> +      SrcVal = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SrcVal);
>> +    SrcVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Usef64 ? MVT::v2f64 : RegVT,
>> +                              SrcVal);
>> +
>> +    return DCI.CombineTo(N, Usef64 ?
>> +        DAG.getNode(ISD::BITCAST, dl, RegVT, SrcVal) : SrcVal, Chain);
>> +  }
>> +
>>    return SDValue();
>>  }
>>
>>
>> Modified: llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll?rev=226953&r1=226952&r2=226953&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll Fri Jan 23 16:44:16 2015
>> @@ -1,14 +1,15 @@
>>  ; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s
>>
>> -;CHECK-LABEL: vcast:
>> +; CHECK-LABEL: vcast:
>>  define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
>> -;CHECK: pmovzxdq
>> -;CHECK: pmovzxdq
>> +; CHECK-NOT: pmovzxdq
>> +; CHECK-NOT: pmovzxdq
>> +; CHECK: movdqa (%{{.*}}),  %[[R0:xmm[0-9]+]]
>>    %af = bitcast <2 x float> %a to <2 x i32>
>>    %bf = bitcast <2 x float> %b to <2 x i32>
>> +; CHECK-NEXT: psubq (%{{.*}}), %[[R0]]
>>    %x = sub <2 x i32> %af, %bf
>> -;CHECK: psubq
>> +; CHECK: ret
>>    ret <2 x i32> %x
>> -;CHECK: ret
>>  }
>>
>>
>> Modified: llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/lower-bitcast.ll?rev=226953&r1=226952&r2=226953&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/lower-bitcast.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/lower-bitcast.ll Fri Jan 23 16:44:16 2015
>> @@ -68,12 +68,13 @@ define i64 @test4(i64 %A) {
>>    %2 = bitcast <2 x i32> %add to i64
>>    ret i64 %2
>>  }
>> -; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
>> +; FIXME: At the moment we still produce the sequence paddd+pshufd.
>>  ; Ideally, we should fold that sequence into a single paddd. This is fixed with
>>  ; the widening legalization.
>>  ;
>>  ; CHECK-LABEL: test4
>> -; CHECK: pshufd
>> +; CHECK: movd
>> +; CHECK-NOT: pshufd
>>  ; CHECK-NEXT: paddd
>>  ; CHECK-NEXT: pshufd
>>  ; CHECK: ret
>>
>> Added: llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll?rev=226953&view=auto
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll (added)
>> +++ llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll Fri Jan 23 16:44:16 2015
>> @@ -0,0 +1,29 @@
>> +; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
>> +; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
>> +
>> +; X86-32-LABEL: test0
>> +; X86-64-LABEL: test0
>> +define i32 @test0(<1 x i64>* %v4) {
>> +  %v5 = load <1 x i64>* %v4, align 8
>> +  %v12 = bitcast <1 x i64> %v5 to <4 x i16>
>> +  %v13 = bitcast <4 x i16> %v12 to x86_mmx
>> +  ; X86-32: pshufw  $238
>> +  ; X86-32-NOT: movq
>> +  ; X86-32-NOT: movsd
>> +  ; X86-32: movq2dq
>> +  ; X86-64: pshufw  $238
>> +  ; X86-64-NOT: movq
>> +  ; X86-64-NOT: pshufd
>> +  ; X86-64: movq2dq
>> +  ; X86-64-NEXT: movd
>> +  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
>> +  %v15 = bitcast x86_mmx %v14 to <4 x i16>
>> +  %v16 = bitcast <4 x i16> %v15 to <1 x i64>
>> +  %v17 = extractelement <1 x i64> %v16, i32 0
>> +  %v18 = bitcast i64 %v17 to <2 x i32>
>> +  %v19 = extractelement <2 x i32> %v18, i32 0
>> +  %v20 = add i32 %v19, 32
>> +  ret i32 %v20
>> +}
>> +
>> +declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
>>
>> Modified: llvm/trunk/test/CodeGen/X86/widen_load-2.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_load-2.ll?rev=226953&r1=226952&r2=226953&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/widen_load-2.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll Fri Jan 23 16:44:16 2015
>> @@ -78,8 +78,7 @@ define void @add3i16(%i16vec3* nocapture
>>  ; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
>>  ; CHECK-NEXT:    pextrw   $4, %[[R1]], 4(%{{.*}})
>>  ; CHECK-NEXT:    pshufb   {{.*}}, %[[R1]]
>> -; CHECK-NEXT:    pmovzxdq %[[R1]], %[[R0]]
>> -; CHECK-NEXT:    movd     %[[R0]], (%{{.*}})
>> +; CHECK-NEXT:    movd     %[[R1]], (%{{.*}})
>>         %a = load %i16vec3* %ap, align 16
>>         %b = load %i16vec3* %bp, align 16
>>         %x = add %i16vec3 %a, %b
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
>
>
>
> --
> Alexey Samsonov
> vonosmas at gmail.com



-- 
Bruno Cardoso Lopes
http://www.brunocardoso.cc


