[PATCH] R600: Compute masked bits for min and max
Tom Stellard
tom at stellard.net
Mon Mar 31 12:08:08 PDT 2014
On Wed, Mar 26, 2014 at 03:10:28PM -0700, Matt Arsenault wrote:
> http://llvm-reviews.chandlerc.com/D3195
>
LGTM.
-Tom
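
For the record, the reason it is safe to simply intersect the operands' known bits: min and max always return one of their two operands, so any bit that is known zero (or known one) in both inputs is also known in the result. Below is a minimal standalone sketch of that argument (plain C++ with fixed-width integers rather than the SelectionDAG/APInt API; the names and the program itself are illustrative, not part of the patch), brute-forcing the unsigned case over all 8-bit pairs:

  #include <algorithm>
  #include <cassert>

  int main() {
    for (unsigned X = 0; X < 256; ++X) {
      for (unsigned Y = 0; Y < 256; ++Y) {
        unsigned KnownZero = ~X & ~Y & 0xFFu; // bits clear in both operands
        unsigned KnownOne = X & Y;            // bits set in both operands
        // The result of min/max is always one of the operands, so the
        // intersected known bits must hold for the result as well.
        for (unsigned R : {std::min(X, Y), std::max(X, Y)}) {
          assert((R & KnownZero) == 0);       // known-zero bits stay clear
          assert((R & KnownOne) == KnownOne); // known-one bits stay set
        }
      }
    }
    return 0;
  }

The same selection argument covers the signed variants, which is why one helper can be shared by all four opcodes.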
> Files:
> lib/Target/R600/AMDGPUISelLowering.cpp
> test/CodeGen/R600/llvm.AMDGPU.umax.ll
> test/CodeGen/R600/llvm.AMDGPU.umin.ll
>
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -1177,11 +1177,55 @@
> }
> }
>
> +static void computeMaskedBitsForMinMax(const SDValue Op0,
> +                                       const SDValue Op1,
> +                                       APInt &KnownZero,
> +                                       APInt &KnownOne,
> +                                       const SelectionDAG &DAG,
> +                                       unsigned Depth) {
> +  APInt Op0Zero, Op0One;
> +  APInt Op1Zero, Op1One;
> +  DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
> +  DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
> +
> +  KnownZero = Op0Zero & Op1Zero;
> +  KnownOne = Op0One & Op1One;
> +}
> +
> void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
> const SDValue Op,
> APInt &KnownZero,
> APInt &KnownOne,
> const SelectionDAG &DAG,
> unsigned Depth) const {
> +
>    KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
> +  unsigned Opc = Op.getOpcode();
> +  switch (Opc) {
> +  case ISD::INTRINSIC_WO_CHAIN: {
> +    // FIXME: The intrinsic should just use the node.
> +    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
> +    case AMDGPUIntrinsic::AMDGPU_imax:
> +    case AMDGPUIntrinsic::AMDGPU_umax:
> +    case AMDGPUIntrinsic::AMDGPU_imin:
> +    case AMDGPUIntrinsic::AMDGPU_umin:
> +      computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
> +                                 KnownZero, KnownOne, DAG, Depth);
> +      break;
> +    default:
> +      break;
> +    }
> +
> +    break;
> +  }
> +  case AMDGPUISD::SMAX:
> +  case AMDGPUISD::UMAX:
> +  case AMDGPUISD::SMIN:
> +  case AMDGPUISD::UMIN:
> +    computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
> +                               KnownZero, KnownOne, DAG, Depth);
> +    break;
> +  default:
> +    break;
> +  }
> }
> Index: test/CodeGen/R600/llvm.AMDGPU.umax.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.umax.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.umax.ll
> @@ -21,6 +21,21 @@
> ret void
> }
>
> +; SI-LABEL: @trunc_zext_umax
> +; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
> +; SI: V_MAX_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
> +; SI-NOT: AND
> +; SI: BUFFER_STORE_SHORT [[RESULT]],
> +define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
> +  %tmp5 = load i8 addrspace(1)* %src, align 1
> +  %tmp2 = zext i8 %tmp5 to i32
> +  %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
> +  %tmp4 = trunc i32 %tmp3 to i8
> +  %tmp6 = zext i8 %tmp4 to i16
> +  store i16 %tmp6, i16 addrspace(1)* %out, align 2
> +  ret void
> +}
> +
> ; Function Attrs: readnone
> declare i32 @llvm.AMDGPU.umax(i32, i32) #1
>
> Index: test/CodeGen/R600/llvm.AMDGPU.umin.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.umin.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.umin.ll
> @@ -21,6 +21,21 @@
> ret void
> }
>
> +; SI-LABEL: @trunc_zext_umin
> +; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
> +; SI: V_MIN_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
> +; SI-NOT: AND
> +; SI: BUFFER_STORE_SHORT [[RESULT]],
> +define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
> +  %tmp5 = load i8 addrspace(1)* %src, align 1
> +  %tmp2 = zext i8 %tmp5 to i32
> +  %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
> +  %tmp4 = trunc i32 %tmp3 to i8
> +  %tmp6 = zext i8 %tmp4 to i16
> +  store i16 %tmp6, i16 addrspace(1)* %out, align 2
> +  ret void
> +}
> +
> ; Function Attrs: readnone
> declare i32 @llvm.AMDGPU.umin(i32, i32) #1