[llvm] r179989 - Legalize vector truncates by parts rather than just splitting.

Mon Apr 22 03:09:17 PDT 2013

Hi Jim, I'm not sure why DAGTypeLegalizer::SplitVectorOperand doesn't have a
general custom hook.  For example all the integer legalization methods start
like this:

void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
...
   // See if the target wants to custom expand this node.
   if (CustomLowerNode(N, N->getValueType(ResNo), true))
     return;

Rather than adding a specific hook for truncations, maybe it is better to add
general hooks along these lines to vector type legalization.

Ciao, Duncan.

On 22/04/13 01:47, Jim Grosbach wrote:
> Author: grosbach
> Date: Sun Apr 21 18:47:41 2013
> New Revision: 179989
>
> URL: http://llvm.org/viewvc/llvm-project?rev=179989&view=rev
> Log:
> Legalize vector truncates by parts rather than just splitting.
>
> Rather than just splitting the input type and hoping for the best, apply
> a bit more cleverness. Just splitting the types until the source is
> legal often leads to an illegal result type, which is then widened and a
> scalarization step is introduced which leads to truly horrible code
> generation. With the loop vectorizer, these sorts of operations are much
> more common, and so it's worth extra effort to do them well.
>
> Add a legalization hook for the operands of a TRUNCATE node, which will
> be encountered after the result type has been legalized, but while the
> operand type is still illegal. If simple splitting of both types
> ends up with the result type of each half still being legal, just
> do that (v16i16 -> v16i8 on ARM, for example). If, however, that would
> result in an illegal result type (v8i32 -> v8i8 on ARM, for example),
> we can get more clever with power-two vectors. Specifically,
> split the input type, but also widen the result element size, then
> concatenate the halves and truncate again.  For example on ARM,
> To perform a "%res = v8i8 trunc v8i32 %in" we transform to:
>    %inlo = v4i32 extract_subvector %in, 0
>    %inhi = v4i32 extract_subvector %in, 4
>    %lo16 = v4i16 trunc v4i32 %inlo
>    %hi16 = v4i16 trunc v4i32 %inhi
>    %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
>    %res = v8i8 trunc v8i16 %in16
>
> This allows instruction selection to generate three VMOVN instructions
> instead of a sequence of moves, stores and loads.
>
> Update the ARMTargetTransformInfo to take this improved legalization
> into account.
>
> Consider the simplified IR:
>
> define <16 x i8> @test1(<16 x i32>* %ap) {
>    %a = load <16 x i32>* %ap
>    %tmp = trunc <16 x i32> %a to <16 x i8>
>    ret <16 x i8> %tmp
> }
>
> define <8 x i8> @test2(<8 x i32>* %ap) {
>    %a = load <8 x i32>* %ap
>    %tmp = trunc <8 x i32> %a to <8 x i8>
>    ret <8 x i8> %tmp
> }
>
> Previously, we would generate the truly hideous:
> 	.syntax unified
> 	.section	__TEXT,__text,regular,pure_instructions
> 	.globl	_test1
> 	.align	2
> _test1:                                 @ @test1
> @ BB#0:
> 	push	{r7}
> 	mov	r7, sp
> 	sub	sp, sp, #20
> 	bic	sp, sp, #7
> 	add	r1, r0, #48
> 	add	r2, r0, #32
> 	vld1.64	{d24, d25}, [r0:128]
> 	vld1.64	{d16, d17}, [r1:128]
> 	vld1.64	{d18, d19}, [r2:128]
> 	add	r1, r0, #16
> 	vmovn.i32	d22, q8
> 	vld1.64	{d16, d17}, [r1:128]
> 	vmovn.i32	d20, q9
> 	vmovn.i32	d18, q12
> 	vmov.u16	r0, d22[3]
> 	strb	r0, [sp, #15]
> 	vmov.u16	r0, d22[2]
> 	strb	r0, [sp, #14]
> 	vmov.u16	r0, d22[1]
> 	strb	r0, [sp, #13]
> 	vmov.u16	r0, d22[0]
> 	vmovn.i32	d16, q8
> 	strb	r0, [sp, #12]
> 	vmov.u16	r0, d20[3]
> 	strb	r0, [sp, #11]
> 	vmov.u16	r0, d20[2]
> 	strb	r0, [sp, #10]
> 	vmov.u16	r0, d20[1]
> 	strb	r0, [sp, #9]
> 	vmov.u16	r0, d20[0]
> 	strb	r0, [sp, #8]
> 	vmov.u16	r0, d18[3]
> 	strb	r0, [sp, #3]
> 	vmov.u16	r0, d18[2]
> 	strb	r0, [sp, #2]
> 	vmov.u16	r0, d18[1]
> 	strb	r0, [sp, #1]
> 	vmov.u16	r0, d18[0]
> 	strb	r0, [sp]
> 	vmov.u16	r0, d16[3]
> 	strb	r0, [sp, #7]
> 	vmov.u16	r0, d16[2]
> 	strb	r0, [sp, #6]
> 	vmov.u16	r0, d16[1]
> 	strb	r0, [sp, #5]
> 	vmov.u16	r0, d16[0]
> 	strb	r0, [sp, #4]
> 	vldmia	sp, {d16, d17}
> 	vmov	r0, r1, d16
> 	vmov	r2, r3, d17
> 	mov	sp, r7
> 	pop	{r7}
> 	bx	lr
>
> 	.globl	_test2
> 	.align	2
> _test2:                                 @ @test2
> @ BB#0:
> 	push	{r7}
> 	mov	r7, sp
> 	sub	sp, sp, #12
> 	bic	sp, sp, #7
> 	vld1.64	{d16, d17}, [r0:128]
> 	add	r0, r0, #16
> 	vld1.64	{d20, d21}, [r0:128]
> 	vmovn.i32	d18, q8
> 	vmov.u16	r0, d18[3]
> 	vmovn.i32	d16, q10
> 	strb	r0, [sp, #3]
> 	vmov.u16	r0, d18[2]
> 	strb	r0, [sp, #2]
> 	vmov.u16	r0, d18[1]
> 	strb	r0, [sp, #1]
> 	vmov.u16	r0, d18[0]
> 	strb	r0, [sp]
> 	vmov.u16	r0, d16[3]
> 	strb	r0, [sp, #7]
> 	vmov.u16	r0, d16[2]
> 	strb	r0, [sp, #6]
> 	vmov.u16	r0, d16[1]
> 	strb	r0, [sp, #5]
> 	vmov.u16	r0, d16[0]
> 	strb	r0, [sp, #4]
> 	ldm	sp, {r0, r1}
> 	mov	sp, r7
> 	pop	{r7}
> 	bx	lr
>
> Now, however, we generate the much more straightforward:
> 	.syntax unified
> 	.section	__TEXT,__text,regular,pure_instructions
> 	.globl	_test1
> 	.align	2
> _test1:                                 @ @test1
> @ BB#0:
> 	add	r1, r0, #48
> 	add	r2, r0, #32
> 	vld1.64	{d20, d21}, [r0:128]
> 	vld1.64	{d16, d17}, [r1:128]
> 	add	r1, r0, #16
> 	vld1.64	{d18, d19}, [r2:128]
> 	vld1.64	{d22, d23}, [r1:128]
> 	vmovn.i32	d17, q8
> 	vmovn.i32	d16, q9
> 	vmovn.i32	d18, q10
> 	vmovn.i32	d19, q11
> 	vmovn.i16	d17, q8
> 	vmovn.i16	d16, q9
> 	vmov	r0, r1, d16
> 	vmov	r2, r3, d17
> 	bx	lr
>
> 	.globl	_test2
> 	.align	2
> _test2:                                 @ @test2
> @ BB#0:
> 	vld1.64	{d16, d17}, [r0:128]
> 	add	r0, r0, #16
> 	vld1.64	{d18, d19}, [r0:128]
> 	vmovn.i32	d16, q8
> 	vmovn.i32	d17, q9
> 	vmovn.i16	d16, q8
> 	vmov	r0, r1, d16
> 	bx	lr
>
> Modified:
>      llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
>      llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
>      llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
>      llvm/trunk/test/Analysis/CostModel/ARM/cast.ll
>      llvm/trunk/test/CodeGen/ARM/vcvt-cost.ll
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h?rev=179989&r1=179988&r2=179989&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h Sun Apr 21 18:47:41 2013
> @@ -581,6 +581,7 @@ private:
>     SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
>     SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
>     SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
> +  SDValue SplitVecOp_TRUNCATE(SDNode *N);
>     SDValue SplitVecOp_VSETCC(SDNode *N);
>     SDValue SplitVecOp_FP_ROUND(SDNode *N);
>
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp?rev=179989&r1=179988&r2=179989&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp Sun Apr 21 18:47:41 2013
> @@ -1046,6 +1046,7 @@ bool DAGTypeLegalizer::SplitVectorOperan
>       case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
>       case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
>       case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
> +    case ISD::TRUNCATE:          Res = SplitVecOp_TRUNCATE(N); break;
>       case ISD::FP_ROUND:          Res = SplitVecOp_FP_ROUND(N); break;
>       case ISD::STORE:
>         Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
> @@ -1062,7 +1063,6 @@ bool DAGTypeLegalizer::SplitVectorOperan
>       case ISD::SINT_TO_FP:
>       case ISD::UINT_TO_FP:
>       case ISD::FTRUNC:
> -    case ISD::TRUNCATE:
>       case ISD::SIGN_EXTEND:
>       case ISD::ZERO_EXTEND:
>       case ISD::ANY_EXTEND:
> @@ -1293,6 +1293,66 @@ SDValue DAGTypeLegalizer::SplitVecOp_CON
>                        &Elts[0], Elts.size());
>   }
>
> +SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
> +  // The result type is legal, but the input type is illegal.  If splitting
> +  // ends up with the result type of each half still being legal, just
> +  // do that.  If, however, that would result in an illegal result type,
> +  // we can try to get more clever with power-two vectors. Specifically,
> +  // split the input type, but also widen the result element size, then
> +  // concatenate the halves and truncate again.  For example, consider a target
> +  // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
> +  // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
> +  //   %inlo = v4i32 extract_subvector %in, 0
> +  //   %inhi = v4i32 extract_subvector %in, 4
> +  //   %lo16 = v4i16 trunc v4i32 %inlo
> +  //   %hi16 = v4i16 trunc v4i32 %inhi
> +  //   %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
> +  //   %res = v8i8 trunc v8i16 %in16
> +  //
> +  // Without this transform, the original truncate would end up being
> +  // scalarized, which is pretty much always a last resort.
> +  SDValue InVec = N->getOperand(0);
> +  EVT InVT = InVec->getValueType(0);
> +  EVT OutVT = N->getValueType(0);
> +  unsigned NumElements = OutVT.getVectorNumElements();
> +  // Widening should have already made sure this is a power-two vector
> +  // if we're trying to split it at all. assert() that's true, just in case.
> +  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
> +
> +  unsigned InElementSize = InVT.getVectorElementType().getSizeInBits();
> +  unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits();
> +
> +  // If the input elements are at most twice the width of the result elements,
> +  // just use the normal splitting. Our trick only works if there's room
> +  // to split more than once.
> +  if (InElementSize <= OutElementSize * 2)
> +    return SplitVecOp_UnaryOp(N);
> +  DebugLoc DL = N->getDebugLoc();
> +
> +  // Extract the halves of the input via extract_subvector.
> +  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
> +                                 InVT.getVectorElementType(), NumElements/2);
> +  SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
> +                                DAG.getIntPtrConstant(0));
> +  SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
> +                                DAG.getIntPtrConstant(NumElements/2));
> +  // Truncate them to 1/2 the element size.
> +  EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
> +  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
> +                                NumElements/2);
> +  SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec);
> +  SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec);
> +  // Concatenate them to get the full intermediate truncation result.
> +  EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
> +  SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
> +                                 HalfHi);
> +  // Now finish up by truncating all the way down to the original result
> +  // type. This should normally be something that ends up being legal directly,
> +  // but in theory if a target has very wide vectors and an annoyingly
> +  // restricted set of legal types, this split can chain to build things up.
> +  return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
> +}
> +
>   SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
>     assert(N->getValueType(0).isVector() &&
>            N->getOperand(0).getValueType().isVector() &&
>
> Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp?rev=179989&r1=179988&r2=179989&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp Sun Apr 21 18:47:41 2013
> @@ -223,9 +223,9 @@ unsigned ARMTTI::getCastInstrCost(unsign
>       { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
>       { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
>
> -    // Operations that we legalize using load/stores to the stack.
> -    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
> -    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
> +    // Operations that we legalize using splitting.
> +    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
> +    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
>
>       // Vector float <-> i32 conversions.
>       { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
>
> Modified: llvm/trunk/test/Analysis/CostModel/ARM/cast.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/ARM/cast.ll?rev=179989&r1=179988&r2=179989&view=diff
> ==============================================================================
> --- llvm/trunk/test/Analysis/CostModel/ARM/cast.ll (original)
> +++ llvm/trunk/test/Analysis/CostModel/ARM/cast.ll Sun Apr 21 18:47:41 2013
> @@ -175,9 +175,9 @@ define i32 @casts() {
>     %rext_5 = zext <4 x i16> undef to <4 x i64>
>
>     ; Vector cast cost of instructions lowering the cast to the stack.
> -  ; CHECK: cost of 19 {{.*}} trunc
> +  ; CHECK: cost of 3 {{.*}} trunc
>     %r74 = trunc <8 x i32> undef to <8 x i8>
> -  ; CHECK: cost of 38 {{.*}} trunc
> +  ; CHECK: cost of 6 {{.*}} trunc
>     %r75 = trunc <16 x i32> undef to <16 x i8>
>
>     ; Floating point truncation costs.
>
> Modified: llvm/trunk/test/CodeGen/ARM/vcvt-cost.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vcvt-cost.ll?rev=179989&r1=179988&r2=179989&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vcvt-cost.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/vcvt-cost.ll Sun Apr 21 18:47:41 2013
> @@ -32,29 +32,22 @@ define void @func_cvt1(%TA0_5* %loadaddr
>     store %TA1_5 %r, %TA1_5* %storeaddr
>     ret void
>   }
> -;; We currently estimate the cost of this instruction as expensive. If lowering
> -;; is improved the cost needs to change.
> +
>   %T0_51 = type <8 x i32>
>   %T1_51 = type <8 x i8>
>   ; CHECK: func_cvt51:
>   define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> +; CHECK: vmovn.i32
> +; CHECK: vmovn.i32
> +; CHECK: vmovn.i16
>     %v0 = load %T0_51* %loadaddr
>   ; COST: func_cvt51
> -; COST: cost of 19 {{.*}} trunc
> +; COST: cost of 3 {{.*}} trunc
>     %r = trunc %T0_51 %v0 to %T1_51
>     store %T1_51 %r, %T1_51* %storeaddr
>     ret void
>   }
> -;; We currently estimate the cost of this instruction as expensive. If lowering
> -;; is improved the cost needs to change.
> +
>   %TT0_5 = type <16 x i8>
>   %TT1_5 = type <16 x i32>
>   ; CHECK: func_cvt52:
> @@ -87,31 +80,20 @@ define void @func_cvt12(%TTA0_5* %loadad
>     store %TTA1_5 %r, %TTA1_5* %storeaddr
>     ret void
>   }
> -;; We currently estimate the cost of this instruction as expensive. If lowering
> -;; is improved the cost needs to change.
> +
>   %TT0_51 = type <16 x i32>
>   %TT1_51 = type <16 x i8>
>   ; CHECK: func_cvt512:
>   define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> -; CHECK: strb
> +; CHECK: vmovn.i32
> +; CHECK: vmovn.i32
> +; CHECK: vmovn.i32
> +; CHECK: vmovn.i32
> +; CHECK: vmovn.i16
> +; CHECK: vmovn.i16
>     %v0 = load %TT0_51* %loadaddr
>   ; COST: func_cvt512
> -; COST: cost of 38 {{.*}} trunc
> +; COST: cost of 6 {{.*}} trunc
>     %r = trunc %TT0_51 %v0 to %TT1_51
>     store %TT1_51 %r, %TT1_51* %storeaddr
>     ret void
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>