[llvm] r358887 - [TargetLowering][AMDGPU][X86] Improve SimplifyDemandedBits bitcast handling
Richard Trieu via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 26 14:47:41 PDT 2019
Hi Simon,
This revision is causing a fatal error in the backend. I've reported the
details and a reproducer in https://bugs.llvm.org/show_bug.cgi?id=41619.
On Mon, Apr 22, 2019 at 7:02 AM Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org> wrote:
> Author: rksimon
> Date: Mon Apr 22 07:04:35 2019
> New Revision: 358887
>
> URL: http://llvm.org/viewvc/llvm-project?rev=358887&view=rev
> Log:
> [TargetLowering][AMDGPU][X86] Improve SimplifyDemandedBits bitcast handling
>
> This patch adds support for BigBitWidth -> SmallBitWidth bitcasts,
> splitting the DemandedBits/Elts accordingly.
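
For intuition on the new little-endian splitting, here is a minimal
standalone sketch (plain C++ with fixed-width integers standing in for
APInt; illustrative only, not code from the patch) of how each narrow
destination element maps back to a wide source element:

  #include <cstdio>

  int main() {
    // Example: bitcast <2 x i64> -> <4 x i32> on a little-endian target.
    // Destination element i lives in source element i / Scale, starting
    // at bit offset (i % Scale) * BitWidth -- the Offset computed in the
    // patch's loop before insertBits()/setBit().
    const unsigned BitWidth = 32, NumSrcEltBits = 64;
    const unsigned Scale = NumSrcEltBits / BitWidth; // 2
    for (unsigned i = 0; i != 4; ++i)
      std::printf("dst elt %u -> src elt %u, src bits [%u, %u)\n", i,
                  i / Scale, (i % Scale) * BitWidth,
                  (i % Scale) * BitWidth + BitWidth);
    return 0;
  }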
>
> The AMDGPU backend needed an extra (srl (and x, c1 << c2), c2) -> (and
> (srl x, c2), c1) combine to encourage BFE creation. I investigated putting
> this in DAGCombine, but it caused a lot of noise on other targets - some
> improvements, some regressions.
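
The fold is easy to sanity-check with concrete values; a throwaway sketch
(the patch's guard -- isShiftedMask() with countTrailingZeros() ==
ShiftAmt -- is what makes the two forms agree):

  #include <cassert>
  #include <cstdint>

  int main() {
    // (srl (and x, c1 << c2), c2) == (and (srl x, c2), c1)
    const uint32_t x = 0xDEADBEEF, c1 = 0x7F, c2 = 16;
    uint32_t before = (x & (c1 << c2)) >> c2; // mask, then shift
    uint32_t after = (x >> c2) & c1;          // shift, then mask (BFE-friendly)
    assert(before == after);                  // both extract bits [16, 23) of x
    return 0;
  }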
>
> The X86 changes are all definite wins.
>
> Differential Revision: https://reviews.llvm.org/D60462
>
> Modified:
> llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
> llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
> llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll
> llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll
> llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll
> llvm/trunk/test/CodeGen/X86/dagcombine-cse.ll
> llvm/trunk/test/CodeGen/X86/masked_store.ll
> llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp Mon Apr 22 07:04:35 2019
> @@ -1471,12 +1471,36 @@ bool TargetLowering::SimplifyDemandedBit
>        if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
>                                 KnownSrcBits, TLO, Depth + 1))
>          return true;
> +    } else if ((NumSrcEltBits % BitWidth) == 0 &&
> +               TLO.DAG.getDataLayout().isLittleEndian()) {
> +      unsigned Scale = NumSrcEltBits / BitWidth;
> +      unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
> +      APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
> +      APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
> +      for (unsigned i = 0; i != NumElts; ++i)
> +        if (DemandedElts[i]) {
> +          unsigned Offset = (i % Scale) * BitWidth;
> +          DemandedSrcBits.insertBits(DemandedBits, Offset);
> +          DemandedSrcElts.setBit(i / Scale);
> +        }
> +
> +      if (SrcVT.isVector()) {
> +        APInt KnownSrcUndef, KnownSrcZero;
> +        if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
> +                                       KnownSrcZero, TLO, Depth + 1))
> +          return true;
> +      }
> +
> +      KnownBits KnownSrcBits;
> +      if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
> +                               KnownSrcBits, TLO, Depth + 1))
> +        return true;
>      }
>
>      // If this is a bitcast, let computeKnownBits handle it.  Only do this on a
>      // recursive call where Known may be useful to the caller.
>      if (Depth > 0) {
> -      Known = TLO.DAG.computeKnownBits(Op, Depth);
> +      Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
>        return false;
>      }
>      break;
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Mon Apr 22 07:04:35 2019
> @@ -3147,30 +3147,44 @@ SDValue AMDGPUTargetLowering::performSra
>
>  SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
>                                                  DAGCombinerInfo &DCI) const {
> -  if (N->getValueType(0) != MVT::i64)
> -    return SDValue();
> -
> -  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
> +  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
>    if (!RHS)
>      return SDValue();
>
> +  EVT VT = N->getValueType(0);
> +  SDValue LHS = N->getOperand(0);
>    unsigned ShiftAmt = RHS->getZExtValue();
> +  SelectionDAG &DAG = DCI.DAG;
> +  SDLoc SL(N);
> +
> +  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
> +  // this improves the ability to match BFE patterns in isel.
> +  if (LHS.getOpcode() == ISD::AND) {
> +    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
> +      if (Mask->getAPIntValue().isShiftedMask() &&
> +          Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
> +        return DAG.getNode(
> +            ISD::AND, SL, VT,
> +            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
> +            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
> +      }
> +    }
> +  }
> +
> +  if (VT != MVT::i64)
> +    return SDValue();
> +
>    if (ShiftAmt < 32)
>      return SDValue();
>
>    // srl i64:x, C for C >= 32
>    // =>
>    //   build_pair (srl hi_32(x), C - 32), 0
> -
> -  SelectionDAG &DAG = DCI.DAG;
> -  SDLoc SL(N);
> -
>    SDValue One = DAG.getConstant(1, SL, MVT::i32);
>    SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
>
> -  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
> -  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
> -                           VecOp, One);
> +  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
> +  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
>
>    SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
>    SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
>
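The i64 tail of the function is behaviorally unchanged (just re-plumbed
through the new LHS/VT locals); the rewrite it implements is easy to
check numerically with a standalone sketch (not AMDGPU code):

  #include <cassert>
  #include <cstdint>

  int main() {
    // srl i64:x, C for C >= 32  =>  build_pair (srl hi_32(x), C - 32), 0
    // i.e. the shift result only ever depends on the high 32 bits of x.
    const uint64_t x = 0x123456789ABCDEF0ULL;
    for (unsigned C = 32; C != 64; ++C) {
      uint32_t Hi = (uint32_t)(x >> 32);        // hi_32(x)
      uint64_t Lo = (uint64_t)(Hi >> (C - 32)); // high half is the Zero above
      assert((x >> C) == Lo);
    }
    return 0;
  }
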
> Modified: llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/store-weird-sizes.ll Mon Apr 22 07:04:35 2019
> @@ -86,8 +86,8 @@ define amdgpu_kernel void @local_store_i
> ; GFX9-NEXT: v_mov_b32_e32 v2, s2
> ; GFX9-NEXT: ds_write_b16 v1, v2 offset:4
> ; GFX9-NEXT: s_waitcnt vmcnt(0)
> -; GFX9-NEXT: v_and_b32_e32 v0, 0x7f0000, v0
> -; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6
> +; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 7
> +; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
> ; GFX9-NEXT: ds_write_b32 v1, v3
> ; GFX9-NEXT: s_endpgm
> store i55 %arg, i55 addrspace(3)* %ptr, align 8
>
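The GFX9 win here: v_bfe_u32 dst, src, offset, width is an unsigned
bitfield extract, so one instruction replaces the 0x7f0000 mask plus the
d16-hi byte store. A scalar model (based on the instruction's documented
semantics; illustrative, not AMDGPU code):

  #include <cassert>
  #include <cstdint>

  // Scalar model of v_bfe_u32 dst, src, offset, width.
  static uint32_t bfe_u32(uint32_t src, uint32_t offset, uint32_t width) {
    return (src >> offset) & ((1u << width) - 1);
  }

  int main() {
    // v_bfe_u32 v0, v0, 16, 7 yields the same byte that the old
    // v_and_b32 0x7f0000 + high-half byte store wrote to memory.
    const uint32_t v0 = 0xCAFEBABE;
    assert(bfe_u32(v0, 16, 7) == ((v0 & 0x7F0000u) >> 16));
    return 0;
  }
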
> Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll Mon Apr 22 07:04:35 2019
> @@ -448,22 +448,6 @@ define void @bitcast_8i32_store(i8* %p,
> define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
> ; SSE2-SSSE3-LABEL: bitcast_4i64_store:
> ; SSE2-SSSE3: # %bb.0:
> -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
> -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
> -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
> -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
> -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
> -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
> -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
> -; SSE2-SSSE3-NEXT: por %xmm4, %xmm1
> -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
> -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
> -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
> -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
> -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
> -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
> ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
> ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
> ; SSE2-SSSE3-NEXT: movb %al, (%rdi)
>
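The removed pcmpgtq emulation became dead because the packssdw/movmskps
consumers only read per-lane sign bits, which the improved demanded-bits
propagation can now see through the bitcast. A scalar model of what
movmskps extracts (from the instruction's documented behavior; not code
from the patch):

  #include <cassert>
  #include <cstdint>

  // movmskps collects the sign (bit 31) of each 32-bit lane into a 4-bit mask.
  static unsigned movmskps(const uint32_t lane[4]) {
    unsigned mask = 0;
    for (int i = 0; i != 4; ++i)
      mask |= (lane[i] >> 31) << i;
    return mask;
  }

  int main() {
    uint32_t lanes[4] = {0x80000000u, 0x7FFFFFFFu, 0xFFFFFFFFu, 0u};
    assert(movmskps(lanes) == 0x5); // lanes 0 and 2 are negative
    return 0;
  }
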
> Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll Mon Apr 22 07:04:35 2019
> @@ -609,15 +609,13 @@ define void @bitcast_8i64_store(i8* %p,
> ;
> ; AVX1-LABEL: bitcast_8i64_store:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: movb %al, (%rdi)
>
> Modified: llvm/trunk/test/CodeGen/X86/dagcombine-cse.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/dagcombine-cse.ll?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/dagcombine-cse.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/dagcombine-cse.ll Mon Apr 22 07:04:35 2019
> @@ -14,18 +14,11 @@ define i32 @t(i8* %ref_frame_ptr, i32 %r
> ;
> ; X64-LABEL: t:
> ; X64: ## %bb.0: ## %entry
> -; X64-NEXT: ## kill: def $edx killed $edx def $rdx
> -; X64-NEXT: ## kill: def $esi killed $esi def $rsi
> ; X64-NEXT: imull %ecx, %esi
> -; X64-NEXT: leal (%rsi,%rdx), %eax
> -; X64-NEXT: cltq
> +; X64-NEXT: addl %edx, %esi
> +; X64-NEXT: movslq %esi, %rax
> ; X64-NEXT: movl (%rdi,%rax), %eax
> -; X64-NEXT: leal 4(%rsi,%rdx), %ecx
> -; X64-NEXT: movslq %ecx, %rcx
> -; X64-NEXT: movzwl (%rdi,%rcx), %ecx
> -; X64-NEXT: shlq $32, %rcx
> -; X64-NEXT: orq %rax, %rcx
> -; X64-NEXT: movq %rcx, %xmm0
> +; X64-NEXT: movq %rax, %xmm0
> ; X64-NEXT: movd %xmm0, %eax
> ; X64-NEXT: retq
> entry:
>
> Modified: llvm/trunk/test/CodeGen/X86/masked_store.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_store.ll?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/masked_store.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/masked_store.ll Mon Apr 22 07:04:35 2019
> @@ -36,25 +36,21 @@ define void @store_v1f64_v1i64(<1 x i64>
> define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) {
> ; SSE2-LABEL: store_v2f64_v2i64:
> ; SSE2: ## %bb.0:
> -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
> -; SSE2-NEXT: pxor %xmm3, %xmm0
> -; SSE2-NEXT: movdqa %xmm3, %xmm2
> -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
> -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
> -; SSE2-NEXT: movdqa %xmm0, %xmm4
> -; SSE2-NEXT: pand %xmm2, %xmm4
> -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
> -; SSE2-NEXT: por %xmm3, %xmm4
> -; SSE2-NEXT: movd %xmm4, %eax
> +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
> +; SSE2-NEXT: pxor %xmm2, %xmm0
> +; SSE2-NEXT: movdqa %xmm2, %xmm3
> +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
> +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
> +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
> +; SSE2-NEXT: pand %xmm3, %xmm2
> +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
> +; SSE2-NEXT: por %xmm2, %xmm0
> +; SSE2-NEXT: movd %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB1_2
> ; SSE2-NEXT: ## %bb.1: ## %cond.store
> ; SSE2-NEXT: movlpd %xmm1, (%rdi)
> ; SSE2-NEXT: LBB1_2: ## %else
> -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
> -; SSE2-NEXT: pand %xmm2, %xmm0
> -; SSE2-NEXT: por %xmm3, %xmm0
> ; SSE2-NEXT: pextrw $4, %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB1_4
> @@ -117,20 +113,16 @@ define void @store_v4f64_v4i64(<4 x i64>
> ; SSE2-NEXT: movdqa %xmm4, %xmm5
> ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
> ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
> -; SSE2-NEXT: movdqa %xmm0, %xmm7
> -; SSE2-NEXT: pand %xmm5, %xmm7
> -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
> -; SSE2-NEXT: por %xmm6, %xmm7
> -; SSE2-NEXT: movd %xmm7, %eax
> +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
> +; SSE2-NEXT: pand %xmm5, %xmm6
> +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
> +; SSE2-NEXT: por %xmm6, %xmm0
> +; SSE2-NEXT: movd %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB2_2
> ; SSE2-NEXT: ## %bb.1: ## %cond.store
> ; SSE2-NEXT: movlpd %xmm2, (%rdi)
> ; SSE2-NEXT: LBB2_2: ## %else
> -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
> -; SSE2-NEXT: pand %xmm5, %xmm0
> -; SSE2-NEXT: por %xmm6, %xmm0
> ; SSE2-NEXT: pextrw $4, %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB2_4
> @@ -140,10 +132,9 @@ define void @store_v4f64_v4i64(<4 x i64>
> ; SSE2-NEXT: pxor %xmm4, %xmm1
> ; SSE2-NEXT: movdqa %xmm4, %xmm0
> ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
> ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
> ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
> -; SSE2-NEXT: pand %xmm2, %xmm1
> +; SSE2-NEXT: pand %xmm0, %xmm1
> ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
> ; SSE2-NEXT: por %xmm1, %xmm0
> ; SSE2-NEXT: pextrw $0, %xmm0, %eax
> @@ -863,25 +854,21 @@ define void @store_v16f32_v16i32(<16 x f
> define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) {
> ; SSE2-LABEL: store_v2i64_v2i64:
> ; SSE2: ## %bb.0:
> -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
> -; SSE2-NEXT: pxor %xmm3, %xmm0
> -; SSE2-NEXT: movdqa %xmm3, %xmm2
> -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
> -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
> -; SSE2-NEXT: movdqa %xmm0, %xmm4
> -; SSE2-NEXT: pand %xmm2, %xmm4
> -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
> -; SSE2-NEXT: por %xmm3, %xmm4
> -; SSE2-NEXT: movd %xmm4, %eax
> +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
> +; SSE2-NEXT: pxor %xmm2, %xmm0
> +; SSE2-NEXT: movdqa %xmm2, %xmm3
> +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
> +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
> +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
> +; SSE2-NEXT: pand %xmm3, %xmm2
> +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
> +; SSE2-NEXT: por %xmm2, %xmm0
> +; SSE2-NEXT: movd %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB7_2
> ; SSE2-NEXT: ## %bb.1: ## %cond.store
> ; SSE2-NEXT: movq %xmm1, (%rdi)
> ; SSE2-NEXT: LBB7_2: ## %else
> -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
> -; SSE2-NEXT: pand %xmm2, %xmm0
> -; SSE2-NEXT: por %xmm3, %xmm0
> ; SSE2-NEXT: pextrw $4, %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB7_4
> @@ -950,20 +937,16 @@ define void @store_v4i64_v4i64(<4 x i64>
> ; SSE2-NEXT: movdqa %xmm4, %xmm5
> ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
> ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
> -; SSE2-NEXT: movdqa %xmm0, %xmm7
> -; SSE2-NEXT: pand %xmm5, %xmm7
> -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
> -; SSE2-NEXT: por %xmm6, %xmm7
> -; SSE2-NEXT: movd %xmm7, %eax
> +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
> +; SSE2-NEXT: pand %xmm5, %xmm6
> +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
> +; SSE2-NEXT: por %xmm6, %xmm0
> +; SSE2-NEXT: movd %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB8_2
> ; SSE2-NEXT: ## %bb.1: ## %cond.store
> ; SSE2-NEXT: movq %xmm2, (%rdi)
> ; SSE2-NEXT: LBB8_2: ## %else
> -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
> -; SSE2-NEXT: pand %xmm5, %xmm0
> -; SSE2-NEXT: por %xmm6, %xmm0
> ; SSE2-NEXT: pextrw $4, %xmm0, %eax
> ; SSE2-NEXT: testb $1, %al
> ; SSE2-NEXT: je LBB8_4
> @@ -974,10 +957,9 @@ define void @store_v4i64_v4i64(<4 x i64>
> ; SSE2-NEXT: pxor %xmm4, %xmm1
> ; SSE2-NEXT: movdqa %xmm4, %xmm0
> ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
> -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
> ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
> ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
> -; SSE2-NEXT: pand %xmm2, %xmm1
> +; SSE2-NEXT: pand %xmm0, %xmm1
> ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
> ; SSE2-NEXT: por %xmm1, %xmm0
> ; SSE2-NEXT: pextrw $0, %xmm0, %eax
>
> Modified: llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll?rev=358887&r1=358886&r2=358887&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll Mon Apr 22 07:04:35 2019
> @@ -929,22 +929,6 @@ define i1 @allzeros_v16i32_sign(<16 x i3
> define i1 @allones_v4i64_sign(<4 x i64> %arg) {
> ; SSE2-LABEL: allones_v4i64_sign:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
> -; SSE2-NEXT: pxor %xmm2, %xmm1
> -; SSE2-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
> -; SSE2-NEXT: movdqa %xmm2, %xmm4
> -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
> -; SSE2-NEXT: pand %xmm3, %xmm1
> -; SSE2-NEXT: por %xmm4, %xmm1
> -; SSE2-NEXT: pxor %xmm2, %xmm0
> -; SSE2-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
> -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
> -; SSE2-NEXT: pand %xmm3, %xmm0
> -; SSE2-NEXT: por %xmm2, %xmm0
> ; SSE2-NEXT: packssdw %xmm1, %xmm0
> ; SSE2-NEXT: movmskps %xmm0, %eax
> ; SSE2-NEXT: cmpb $15, %al
> @@ -989,22 +973,6 @@ define i1 @allones_v4i64_sign(<4 x i64>
> define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
> ; SSE2-LABEL: allzeros_v4i64_sign:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
> -; SSE2-NEXT: pxor %xmm2, %xmm1
> -; SSE2-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
> -; SSE2-NEXT: movdqa %xmm2, %xmm4
> -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
> -; SSE2-NEXT: pand %xmm3, %xmm1
> -; SSE2-NEXT: por %xmm4, %xmm1
> -; SSE2-NEXT: pxor %xmm2, %xmm0
> -; SSE2-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
> -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
> -; SSE2-NEXT: pand %xmm3, %xmm0
> -; SSE2-NEXT: por %xmm2, %xmm0
> ; SSE2-NEXT: packssdw %xmm1, %xmm0
> ; SSE2-NEXT: movmskps %xmm0, %eax
> ; SSE2-NEXT: testb %al, %al
> @@ -1095,15 +1063,13 @@ define i1 @allones_v8i64_sign(<8 x i64>
> ;
> ; AVX1-LABEL: allones_v8i64_sign:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: cmpb $-1, %al
> @@ -1198,15 +1164,13 @@ define i1 @allzeros_v8i64_sign(<8 x i64>
> ;
> ; AVX1-LABEL: allzeros_v8i64_sign:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: testb %al, %al
> @@ -2539,19 +2503,17 @@ define i1 @allones_v8i64_and1(<8 x i64>
> ;
> ; AVX1-LABEL: allones_v8i64_and1:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> ; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
> +; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: cmpb $-1, %al
> @@ -2615,19 +2577,17 @@ define i1 @allzeros_v8i64_and1(<8 x i64>
> ;
> ; AVX1-LABEL: allzeros_v8i64_and1:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> ; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
> +; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: testb %al, %al
> @@ -3962,19 +3922,17 @@ define i1 @allones_v8i64_and4(<8 x i64>
> ;
> ; AVX1-LABEL: allones_v8i64_and4:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> ; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
> +; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: cmpb $-1, %al
> @@ -4038,19 +3996,17 @@ define i1 @allzeros_v8i64_and4(<8 x i64>
> ;
> ; AVX1-LABEL: allzeros_v8i64_and4:
> ; AVX1: # %bb.0:
> -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
> -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> -; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
> -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
> -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
> ; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
> +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
> ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
> ; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
> ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
> ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
> +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
> +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
> +; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
> +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; AVX1-NEXT: vmovmskps %ymm0, %eax
> ; AVX1-NEXT: testb %al, %al
> @@ -4170,22 +4126,6 @@ define i32 @movmskps(<4 x float> %x) {
> define i32 @movmskpd256(<4 x double> %x) {
> ; SSE2-LABEL: movmskpd256:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
> -; SSE2-NEXT: pxor %xmm2, %xmm1
> -; SSE2-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
> -; SSE2-NEXT: movdqa %xmm2, %xmm4
> -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
> -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
> -; SSE2-NEXT: pand %xmm3, %xmm1
> -; SSE2-NEXT: por %xmm4, %xmm1
> -; SSE2-NEXT: pxor %xmm2, %xmm0
> -; SSE2-NEXT: movdqa %xmm2, %xmm3
> -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
> -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
> -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
> -; SSE2-NEXT: pand %xmm3, %xmm0
> -; SSE2-NEXT: por %xmm2, %xmm0
> ; SSE2-NEXT: packssdw %xmm1, %xmm0
> ; SSE2-NEXT: movmskps %xmm0, %eax
> ; SSE2-NEXT: retq
>
>