[PATCH] D36597: DAG: Fix creating select with wrong condition type
Justin Bogner via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 24 18:36:11 PDT 2017
Matt Arsenault via Phabricator <reviews at reviews.llvm.org> writes:
> arsenm created this revision.
> Herald added subscribers: tpr, nhaehnle, wdng.
>
> This code added in r297930 assumed that it could create
> a select with a condition type that is just an integer
> bitcast of the selected type. For AMDGPU any vselect is
> going to be scalarized (although the vector types are legal),
> and all select conditions must be i1 (the same as getSetCCResultType).
>
> This logic doesn't really make sense to me, but there's
> never really been a consistent policy in what the select
> condition mask type is supposed to be. Try to extend
> the logic for skipping the transform for condition types
> that aren't setccs. It doesn't seem quite right to me, though;
> checking conditions that seem more sensible (like whether the
> vselect is going to be expanded) doesn't work since this
> seems to depend on that also.
The logic in this function is getting a bit hard to follow, but this
looks reasonable AFAICT. LGTM.
>
> https://reviews.llvm.org/D36597
>
> Files:
> lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
> test/CodeGen/X86/avx512-vselect.ll
>
> Index: test/CodeGen/X86/avx512-vselect.ll
> ===================================================================
> --- test/CodeGen/X86/avx512-vselect.ll
> +++ test/CodeGen/X86/avx512-vselect.ll
> @@ -23,35 +23,16 @@
> ; both formulations of vselect. All of this trickery is because we can't
> ; directly form an SDAG input to the lowering.
> define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
> -; CHECK-SKX-LABEL: test2:
> -; CHECK-SKX: # BB#0: # %entry
> -; CHECK-SKX-NEXT: vxorps %xmm6, %xmm6, %xmm6
> -; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
> -; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
> -; CHECK-SKX-NEXT: korw %k1, %k0, %k0
> -; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
> -; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
> -; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
> -; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
> -; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
> -; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
> -; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
> -; CHECK-SKX-NEXT: retq
> -;
> -; CHECK-KNL-LABEL: test2:
> -; CHECK-KNL: # BB#0: # %entry
> -; CHECK-KNL-NEXT: vxorps %xmm6, %xmm6, %xmm6
> -; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
> -; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
> -; CHECK-KNL-NEXT: korw %k1, %k0, %k1
> -; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
> -; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
> -; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
> -; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
> -; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
> -; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
> -; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
> -; CHECK-KNL-NEXT: retq
> +; CHECK-LABEL: test2:
> +; CHECK: # BB#0: # %entry
> +; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
> +; CHECK-NEXT: vcmpltps %zmm0, %zmm6, %k0
> +; CHECK-NEXT: vcmpltps %zmm6, %zmm1, %k1
> +; CHECK-NEXT: korw %k1, %k0, %k1
> +; CHECK-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
> +; CHECK-NEXT: kshiftrw $8, %k1, %k1
> +; CHECK-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
> +; CHECK-NEXT: retq
> entry:
> %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
> %lt.m = fcmp olt <16 x float> %y, zeroinitializer
> Index: test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
> @@ -0,0 +1,52 @@
> +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
> +
> +; Check that DAGTypeLegalizer::WidenVSELECTAndMask doesn't try to
> +; create vselects with i64 condition masks.
> +
> +; FIXME: Should be able to avoid intermediate vselect
> +; GCN-LABEL: {{^}}widen_vselect_and_mask_v4f64:
> +; GCN: v_cmp_u_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
> +; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
> +; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
> +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
> +define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
> +bb:
> + %tmp = extractelement <4 x double> %arg, i64 0
> + %tmp1 = fcmp uno double %tmp, 0.000000e+00
> + %tmp2 = sext i1 %tmp1 to i64
> + %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
> + %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
> + %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
> + %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
> + %tmp7 = fcmp une <4 x double> %arg, zeroinitializer
> + %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
> + %tmp9 = and <4 x i1> %tmp8, %tmp7
> + %tmp10 = select <4 x i1> %tmp9, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, <4 x double> zeroinitializer
> + store <4 x double> %tmp10, <4 x double> addrspace(1)* null, align 32
> + ret void
> +}
> +
> +; GCN-LABEL: {{^}}widen_vselect_and_mask_v4i64:
> +; GCN: v_cmp_eq_u64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
> +; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
> +; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
> +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
> +define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
> +bb:
> + %tmp = extractelement <4 x i64> %arg, i64 0
> + %tmp1 = icmp eq i64 %tmp, 0
> + %tmp2 = sext i1 %tmp1 to i64
> + %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
> + %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
> + %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
> + %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
> + %tmp7 = icmp ne <4 x i64> %arg, zeroinitializer
> + %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
> + %tmp9 = and <4 x i1> %tmp8, %tmp7
> + %tmp10 = select <4 x i1> %tmp9, <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i64> zeroinitializer
> + store <4 x i64> %tmp10, <4 x i64> addrspace(1)* null, align 32
> + ret void
> +}
> +
> +attributes #0 = { nounwind }
> +attributes #1 = { nounwind readnone speculatable }
> Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> ===================================================================
> --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> @@ -3069,7 +3069,8 @@
>
> // If this is a splitted VSELECT that was previously already handled, do
> // nothing.
> - if (Cond->getValueType(0).getScalarSizeInBits() != 1)
> + EVT CondVT = Cond->getValueType(0);
> + if (CondVT.getScalarSizeInBits() != 1)
> return SDValue();
>
> EVT VSelVT = N->getValueType(0);
> @@ -3093,6 +3094,14 @@
> EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
> if (SetCCResVT.getScalarSizeInBits() == 1)
> return SDValue();
> + } else if (CondVT.getScalarType() == MVT::i1) {
> + // If there is support for an i1 vector mask (or only scalar i1 conditions),
> + // don't touch.
> + while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
> + CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
> +
> + if (CondVT.getScalarType() == MVT::i1)
> + return SDValue();
> }
>
> // Get the VT and operands for VSELECT, and widen if needed.
More information about the llvm-commits
mailing list