[PATCH] R600: Compute masked bits for min and max
Tom Stellard
tom at stellard.net
Mon Mar 31 12:08:08 PDT 2014
On Wed, Mar 26, 2014 at 03:10:28PM -0700, Matt Arsenault wrote:
> http://llvm-reviews.chandlerc.com/D3195
>
LGTM.
-Tom
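
For the record, the reason it is safe to simply intersect the operands' known bits: min and max always return one of their two operands, so any bit that is known zero (or known one) in both inputs is also known in the result. Below is a minimal standalone sketch of that argument (plain C++ with fixed-width integers rather than the SelectionDAG/APInt API; the names and the program itself are illustrative, not part of the patch), brute-forcing the unsigned case over all 8-bit pairs:

  #include <algorithm>
  #include <cassert>

  int main() {
    for (unsigned X = 0; X < 256; ++X) {
      for (unsigned Y = 0; Y < 256; ++Y) {
        unsigned KnownZero = ~X & ~Y & 0xFFu; // bits clear in both operands
        unsigned KnownOne = X & Y;            // bits set in both operands
        // The result of min/max is always one of the operands, so the
        // intersected known bits must hold for the result as well.
        for (unsigned R : {std::min(X, Y), std::max(X, Y)}) {
          assert((R & KnownZero) == 0);       // known-zero bits stay clear
          assert((R & KnownOne) == KnownOne); // known-one bits stay set
        }
      }
    }
    return 0;
  }

The same selection argument covers the signed variants, which is why one helper can be shared by all four opcodes.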
> Files:
> lib/Target/R600/AMDGPUISelLowering.cpp
> test/CodeGen/R600/llvm.AMDGPU.umax.ll
> test/CodeGen/R600/llvm.AMDGPU.umin.ll
>
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -1177,11 +1177,55 @@
> }
> }
>
> +static void computeMaskedBitsForMinMax(const SDValue Op0,
> +                                       const SDValue Op1,
> +                                       APInt &KnownZero,
> +                                       APInt &KnownOne,
> +                                       const SelectionDAG &DAG,
> +                                       unsigned Depth) {
> +  APInt Op0Zero, Op0One;
> +  APInt Op1Zero, Op1One;
> +  DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
> +  DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
> +
> +  KnownZero = Op0Zero & Op1Zero;
> +  KnownOne = Op0One & Op1One;
> +}
> +
> void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
> const SDValue Op,
> APInt &KnownZero,
> APInt &KnownOne,
> const SelectionDAG &DAG,
> unsigned Depth) const {
> +
>    KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
> +  unsigned Opc = Op.getOpcode();
> +  switch (Opc) {
> +  case ISD::INTRINSIC_WO_CHAIN: {
> +    // FIXME: The intrinsic should just use the node.
> +    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
> +    case AMDGPUIntrinsic::AMDGPU_imax:
> +    case AMDGPUIntrinsic::AMDGPU_umax:
> +    case AMDGPUIntrinsic::AMDGPU_imin:
> +    case AMDGPUIntrinsic::AMDGPU_umin:
> +      computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
> +                                 KnownZero, KnownOne, DAG, Depth);
> +      break;
> +    default:
> +      break;
> +    }
> +
> +    break;
> +  }
> +  case AMDGPUISD::SMAX:
> +  case AMDGPUISD::UMAX:
> +  case AMDGPUISD::SMIN:
> +  case AMDGPUISD::UMIN:
> +    computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
> +                               KnownZero, KnownOne, DAG, Depth);
> +    break;
> +  default:
> +    break;
> +  }
> }
> Index: test/CodeGen/R600/llvm.AMDGPU.umax.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.umax.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.umax.ll
> @@ -21,6 +21,21 @@
> ret void
> }
>
> +; SI-LABEL: @trunc_zext_umax
> +; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
> +; SI: V_MAX_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
> +; SI-NOT: AND
> +; SI: BUFFER_STORE_SHORT [[RESULT]],
> +define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
> +  %tmp5 = load i8 addrspace(1)* %src, align 1
> +  %tmp2 = zext i8 %tmp5 to i32
> +  %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
> +  %tmp4 = trunc i32 %tmp3 to i8
> +  %tmp6 = zext i8 %tmp4 to i16
> +  store i16 %tmp6, i16 addrspace(1)* %out, align 2
> +  ret void
> +}
> +
> ; Function Attrs: readnone
> declare i32 @llvm.AMDGPU.umax(i32, i32) #1
>
> Index: test/CodeGen/R600/llvm.AMDGPU.umin.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.umin.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.umin.ll
> @@ -21,6 +21,21 @@
> ret void
> }
>
> +; SI-LABEL: @trunc_zext_umin
> +; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
> +; SI: V_MIN_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
> +; SI-NOT: AND
> +; SI: BUFFER_STORE_SHORT [[RESULT]],
> +define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
> +  %tmp5 = load i8 addrspace(1)* %src, align 1
> +  %tmp2 = zext i8 %tmp5 to i32
> +  %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
> +  %tmp4 = trunc i32 %tmp3 to i8
> +  %tmp6 = zext i8 %tmp4 to i16
> +  store i16 %tmp6, i16 addrspace(1)* %out, align 2
> +  ret void
> +}
> +
> ; Function Attrs: readnone
> declare i32 @llvm.AMDGPU.umin(i32, i32) #1