[PATCH] D36597: DAG: Fix creating select with wrong condition type
Justin Bogner via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 24 18:36:11 PDT 2017
Matt Arsenault via Phabricator <reviews at reviews.llvm.org> writes:
> arsenm created this revision.
> Herald added subscribers: tpr, nhaehnle, wdng.
>
> This code added in r297930 assumed that it could create
> a select with a condition type that is just an integer
> bitcast of the selected type. For AMDGPU any vselect is
> going to be scalarized (although the vector types are legal),
> and all select conditions must be i1 (the same as getSetCCResultType).
>
> This logic doesn't really make sense to me, but there's
> never really been a consistent policy in what the select
> condition mask type is supposed to be. Try to extend
> the logic for skipping the transform for condition types
> that aren't setccs. It doesn't seem quite right to me, though;
> checking conditions that seem more sensible (like whether the
> vselect is going to be expanded) doesn't work since this
> seems to depend on that also.
The logic in this function is getting a bit hard to follow, but this
looks reasonable AFAICT. LGTM.
>
> https://reviews.llvm.org/D36597
>
> Files:
> lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
> test/CodeGen/X86/avx512-vselect.ll
>
> Index: test/CodeGen/X86/avx512-vselect.ll
> ===================================================================
> --- test/CodeGen/X86/avx512-vselect.ll
> +++ test/CodeGen/X86/avx512-vselect.ll
> @@ -23,35 +23,16 @@
> ; both formulations of vselect. All of this trickery is because we can't
> ; directly form an SDAG input to the lowering.
> define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
> -; CHECK-SKX-LABEL: test2:
> -; CHECK-SKX: # BB#0: # %entry
> -; CHECK-SKX-NEXT: vxorps %xmm6, %xmm6, %xmm6
> -; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
> -; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
> -; CHECK-SKX-NEXT: korw %k1, %k0, %k0
> -; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
> -; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
> -; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
> -; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
> -; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
> -; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
> -; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
> -; CHECK-SKX-NEXT: retq
> -;
> -; CHECK-KNL-LABEL: test2:
> -; CHECK-KNL: # BB#0: # %entry
> -; CHECK-KNL-NEXT: vxorps %xmm6, %xmm6, %xmm6
> -; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
> -; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
> -; CHECK-KNL-NEXT: korw %k1, %k0, %k1
> -; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
> -; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
> -; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
> -; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
> -; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
> -; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
> -; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
> -; CHECK-KNL-NEXT: retq
> +; CHECK-LABEL: test2:
> +; CHECK: # BB#0: # %entry
> +; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
> +; CHECK-NEXT: vcmpltps %zmm0, %zmm6, %k0
> +; CHECK-NEXT: vcmpltps %zmm6, %zmm1, %k1
> +; CHECK-NEXT: korw %k1, %k0, %k1
> +; CHECK-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
> +; CHECK-NEXT: kshiftrw $8, %k1, %k1
> +; CHECK-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
> +; CHECK-NEXT: retq
> entry:
> %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
> %lt.m = fcmp olt <16 x float> %y, zeroinitializer
> Index: test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
> @@ -0,0 +1,52 @@
> +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
> +
> +; Check that DAGTypeLegalizer::WidenVSELECTAndMask doesn't try to
> +; create vselects with i64 condition masks.
> +
> +; FIXME: Should be able to avoid intermediate vselect
> +; GCN-LABEL: {{^}}widen_vselect_and_mask_v4f64:
> +; GCN: v_cmp_u_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
> +; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
> +; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
> +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
> +define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
> +bb:
> + %tmp = extractelement <4 x double> %arg, i64 0
> + %tmp1 = fcmp uno double %tmp, 0.000000e+00
> + %tmp2 = sext i1 %tmp1 to i64
> + %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
> + %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
> + %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
> + %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
> + %tmp7 = fcmp une <4 x double> %arg, zeroinitializer
> + %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
> + %tmp9 = and <4 x i1> %tmp8, %tmp7
> + %tmp10 = select <4 x i1> %tmp9, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, <4 x double> zeroinitializer
> + store <4 x double> %tmp10, <4 x double> addrspace(1)* null, align 32
> + ret void
> +}
> +
> +; GCN-LABEL: {{^}}widen_vselect_and_mask_v4i64:
> +; GCN: v_cmp_eq_u64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
> +; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]]
> +; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]]
> +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}}
> +define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
> +bb:
> + %tmp = extractelement <4 x i64> %arg, i64 0
> + %tmp1 = icmp eq i64 %tmp, 0
> + %tmp2 = sext i1 %tmp1 to i64
> + %tmp3 = insertelement <4 x i64> undef, i64 %tmp2, i32 0
> + %tmp4 = insertelement <4 x i64> %tmp3, i64 undef, i32 1
> + %tmp5 = insertelement <4 x i64> %tmp4, i64 undef, i32 2
> + %tmp6 = insertelement <4 x i64> %tmp5, i64 undef, i32 3
> + %tmp7 = icmp ne <4 x i64> %arg, zeroinitializer
> + %tmp8 = icmp sgt <4 x i64> %tmp6, <i64 -1, i64 -1, i64 -1, i64 -1>
> + %tmp9 = and <4 x i1> %tmp8, %tmp7
> + %tmp10 = select <4 x i1> %tmp9, <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i64> zeroinitializer
> + store <4 x i64> %tmp10, <4 x i64> addrspace(1)* null, align 32
> + ret void
> +}
> +
> +attributes #0 = { nounwind }
> +attributes #1 = { nounwind readnone speculatable }
> Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> ===================================================================
> --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> @@ -3069,7 +3069,8 @@
>
> // If this is a splitted VSELECT that was previously already handled, do
> // nothing.
> - if (Cond->getValueType(0).getScalarSizeInBits() != 1)
> + EVT CondVT = Cond->getValueType(0);
> + if (CondVT.getScalarSizeInBits() != 1)
> return SDValue();
>
> EVT VSelVT = N->getValueType(0);
> @@ -3093,6 +3094,14 @@
> EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
> if (SetCCResVT.getScalarSizeInBits() == 1)
> return SDValue();
> + } else if (CondVT.getScalarType() == MVT::i1) {
> + // If there is support for an i1 vector mask (or only scalar i1 conditions),
> + // don't touch.
> + while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
> + CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
> +
> + if (CondVT.getScalarType() == MVT::i1)
> + return SDValue();
> }
>
> // Get the VT and operands for VSELECT, and widen if needed.
More information about the llvm-commits
mailing list