[PATCH] R600/SI: Combine min3/max3 instructions

Thu Nov 13 08:18:04 PST 2014

On Wed, Oct 22, 2014 at 12:38:53AM +0000, Matt Arsenault wrote:
> http://reviews.llvm.org/D5901
> 
> Files:
>   lib/Target/R600/AMDGPUISelLowering.cpp
>   lib/Target/R600/AMDGPUISelLowering.h
>   lib/Target/R600/AMDGPUInstrInfo.td
>   lib/Target/R600/SIISelLowering.cpp
>   lib/Target/R600/SIISelLowering.h
>   lib/Target/R600/SIInstructions.td
>   test/CodeGen/R600/fmax3.ll
>   test/CodeGen/R600/fmin3.ll
>   test/CodeGen/R600/max3.ll
>   test/CodeGen/R600/min3.ll

> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -2355,6 +2355,12 @@
>    NODE_NAME_CASE(FMIN_LEGACY)
>    NODE_NAME_CASE(SMIN)
>    NODE_NAME_CASE(UMIN)
> +  NODE_NAME_CASE(FMAX3)
> +  NODE_NAME_CASE(SMAX3)
> +  NODE_NAME_CASE(UMAX3)
> +  NODE_NAME_CASE(FMIN3)
> +  NODE_NAME_CASE(SMIN3)
> +  NODE_NAME_CASE(UMIN3)
>    NODE_NAME_CASE(URECIP)
>    NODE_NAME_CASE(DIV_SCALE)
>    NODE_NAME_CASE(DIV_FMAS)
> Index: lib/Target/R600/AMDGPUISelLowering.h
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.h
> +++ lib/Target/R600/AMDGPUISelLowering.h
> @@ -210,6 +210,12 @@
>    FMIN_LEGACY,
>    SMIN,
>    UMIN,
> +  FMAX3,
> +  SMAX3,
> +  UMAX3,
> +  FMIN3,
> +  SMIN3,
> +  UMIN3,
>    URECIP,
>    DIV_SCALE,
>    DIV_FMAS,
> Index: lib/Target/R600/AMDGPUInstrInfo.td
> ===================================================================
> --- lib/Target/R600/AMDGPUInstrInfo.td
> +++ lib/Target/R600/AMDGPUInstrInfo.td
> @@ -84,7 +84,7 @@
>    [SDNPAssociative]
>  >;
>  
> -// out = min(a, b) a snd b are signed ints
> +// out = min(a, b) a and b are signed ints
>  def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
>    [SDNPCommutative, SDNPAssociative]
>  >;
> @@ -94,6 +94,37 @@
>    [SDNPCommutative, SDNPAssociative]
>  >;
>  
> +// FIXME: TableGen doesn't like commutative instructions with more
> +// than 2 operands.
> +// out = max(a, b, c) a, b and c are floats
> +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
> +  [/*SDNPCommutative, SDNPAssociative*/]
> +>;
> +
> +// out = max(a, b, c) a, b, and c are signed ints
> +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
> +  [/*SDNPCommutative, SDNPAssociative*/]
> +>;
> +
> +// out = max(a, b, c) a, b and c are unsigned ints
> +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
> +  [/*SDNPCommutative, SDNPAssociative*/]
> +>;
> +
> +// out = min(a, b, c) a, b and c are floats
> +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
> +  [/*SDNPCommutative, SDNPAssociative*/]
> +>;
> +
> +// out = min(a, b, c) a, b and c are signed ints
> +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
> +  [/*SDNPCommutative, SDNPAssociative*/]
> +>;
> +
> +// out = min(a, b, c) a, b and c are unsigned ints
> +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
> +  [/*SDNPCommutative, SDNPAssociative*/]
> +>;
>  
>  def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
>    SDTIntToFPOp, []>;
> @@ -104,7 +135,6 @@
>  def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
>    SDTIntToFPOp, []>;
>  
> -

Extra whitespace.
>  // urecip - This operation is a helper for integer division, it returns the
>  // result of 1 / a as a fractional unsigned integer.
>  // out = (2^32 / a) + e
> Index: lib/Target/R600/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/SIISelLowering.cpp
> +++ lib/Target/R600/SIISelLowering.cpp
> @@ -1305,6 +1305,7 @@
>    return SDValue();
>  }
>  
> +

Extra whitespace.

LGTM.

>  // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
>  
>  // This is a variant of
> @@ -1355,6 +1356,61 @@
>    return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
>  }
>  
> +static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
> +  switch (Opc) {
> +  case ISD::FMAXNUM:
> +    return AMDGPUISD::FMAX3;
> +  case AMDGPUISD::SMAX:
> +    return AMDGPUISD::SMAX3;
> +  case AMDGPUISD::UMAX:
> +    return AMDGPUISD::UMAX3;
> +  case ISD::FMINNUM:
> +    return AMDGPUISD::FMIN3;
> +  case AMDGPUISD::SMIN:
> +    return AMDGPUISD::SMIN3;
> +  case AMDGPUISD::UMIN:
> +    return AMDGPUISD::UMIN3;
> +  default:
> +    llvm_unreachable("Not a min/max opcode");
> +  }
> +}
> +
> +SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
> +                                                 DAGCombinerInfo &DCI) const {
> +  SelectionDAG &DAG = DCI.DAG;
> +
> +  unsigned Opc = N->getOpcode();
> +  SDValue Op0 = N->getOperand(0);
> +  SDValue Op1 = N->getOperand(1);
> +
> +  // Only do this if the inner op has one use, since otherwise this will just
> +  // increase register pressure for no benefit.
> +
> +  // max(max(a, b), c)
> +  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
> +    SDLoc DL(N);
> +    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
> +                       DL,
> +                       N->getValueType(0),
> +                       Op0.getOperand(0),
> +                       Op0.getOperand(1),
> +                       Op1);
> +  }
> +
> +  // max(a, max(b, c))
> +  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
> +    SDLoc DL(N);
> +    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
> +                       DL,
> +                       N->getValueType(0),
> +                       Op0,
> +                       Op1.getOperand(0),
> +                       Op1.getOperand(1));
> +  }
> +
> +  return SDValue();
> +}
> +
>  SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
>                                              DAGCombinerInfo &DCI) const {
>    SelectionDAG &DAG = DCI.DAG;
> @@ -1382,6 +1438,17 @@
>        }
>        break;
>      }
> +  case ISD::FMAXNUM: // TODO: What about fmax_legacy?
> +  case ISD::FMINNUM:
> +  case AMDGPUISD::SMAX:
> +  case AMDGPUISD::SMIN:
> +  case AMDGPUISD::UMAX:
> +  case AMDGPUISD::UMIN: {
> +    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
> +        getTargetMachine().getOptLevel() > CodeGenOpt::None)
> +      return performMin3Max3Combine(N, DCI);
> +    break;
> +  }
>  
>    case AMDGPUISD::CVT_F32_UBYTE0:
>    case AMDGPUISD::CVT_F32_UBYTE1:
> Index: lib/Target/R600/SIISelLowering.h
> ===================================================================
> --- lib/Target/R600/SIISelLowering.h
> +++ lib/Target/R600/SIISelLowering.h
> @@ -59,6 +59,8 @@
>                                 unsigned AS,
>                                 DAGCombinerInfo &DCI) const;
>  
> +  SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
> +
>  public:
>    SITargetLowering(TargetMachine &tm);
>  
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -1544,23 +1544,36 @@
>    VOP_F64_F64_F64_F64, fma
>  >;
>  //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
> +
>  defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "V_ALIGNBIT_B32",
>    VOP_I32_I32_I32_I32
>  >;
>  defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f>, "V_ALIGNBYTE_B32",
>    VOP_I32_I32_I32_I32
>  >;
>  defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "V_MULLIT_F32",
>    VOP_F32_F32_F32_F32>;
> -////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
> -////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
> -////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
> -////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
> -////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
> -////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
> -////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
> -////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
> -////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
> +defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "V_MIN3_F32",
> +  VOP_F32_F32_F32_F32, AMDGPUfmin3>;
> +
> +defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "V_MIN3_I32",
> +  VOP_I32_I32_I32_I32, AMDGPUsmin3
> +>;
> +defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "V_MIN3_U32",
> +  VOP_I32_I32_I32_I32, AMDGPUumin3
> +>;
> +defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "V_MAX3_F32",
> +  VOP_F32_F32_F32_F32, AMDGPUfmax3
> +>;
> +defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "V_MAX3_I32",
> +  VOP_I32_I32_I32_I32, AMDGPUsmax3
> +>;
> +defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "V_MAX3_U32",
> +  VOP_I32_I32_I32_I32, AMDGPUumax3
> +>;
> +//def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
> +//def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
> +//def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
>  //def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
>  //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
>  //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
> Index: test/CodeGen/R600/fmax3.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/fmax3.ll
> @@ -0,0 +1,65 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +; SI-LABEL: @test_fmax3_ogt_0:
> +; SI: BUFFER_LOAD_DWORD [[REGA:v[0-9]+]]
> +; SI: BUFFER_LOAD_DWORD [[REGB:v[0-9]+]]
> +; SI: BUFFER_LOAD_DWORD [[REGC:v[0-9]+]]
> +; SI: V_MAX3_F32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @test_fmax3_ogt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp ogt float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp ogt float %f0, %c
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_fmax3_ogt_1:
> +; SI: V_MAX3_F32
> +; SI: S_ENDPGM
> +define void @test_fmax3_ogt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp ogt float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp ole float %c, %f0
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_fmax3_ogt_2:
> +; SI-NOT: V_MAX3_F32
> +; SI: S_ENDPGM
> +define void @test_fmax3_ogt_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp ogt float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp ogt float %c, %f0
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_fmax3_oge_0
> +; SI: V_MAX3_F32
> +; SI: S_ENDPGM
> +define void @test_fmax3_oge_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp oge float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp oge float %f0, %c
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> Index: test/CodeGen/R600/fmin3.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/fmin3.ll
> @@ -0,0 +1,65 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +; SI-LABEL: @test_fmin3_olt_0:
> +; SI: BUFFER_LOAD_DWORD [[REGA:v[0-9]+]]
> +; SI: BUFFER_LOAD_DWORD [[REGB:v[0-9]+]]
> +; SI: BUFFER_LOAD_DWORD [[REGC:v[0-9]+]]
> +; SI: V_MIN3_F32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp olt float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp olt float %f0, %c
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_fmin3_olt_1:
> +; SI: V_MIN3_F32
> +; SI: S_ENDPGM
> +define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp olt float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp oge float %c, %f0
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_fmin3_olt_2:
> +; SI-NOT: V_MIN3_F32
> +; SI: S_ENDPGM
> +define void @test_fmin3_olt_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp ole float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp olt float %c, %f0
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_fmin3_olt_3:
> +; SI-NOT: V_MIN3_F32
> +; SI: S_ENDPGM
> +define void @test_fmin3_olt_3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %c = load float addrspace(1)* %cptr, align 4
> +  %fcmp0 = fcmp olt float %a, %b
> +  %f0 = select i1 %fcmp0, float %a, float %b
> +  %fcmp1 = fcmp olt float %c, %f0
> +  %f1 = select i1 %fcmp1, float %f0, float %c
> +  store float %f1, float addrspace(1)* %out, align 4
> +  ret void
> +}
> Index: test/CodeGen/R600/max3.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/max3.ll
> @@ -0,0 +1,41 @@
> +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +
> +; FUNC-LABEL: @v_test_imax3_sgt_i32
> +; SI: V_MAX3_I32
> +define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
> +  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
> +  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
> +  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %a = load i32 addrspace(1)* %gep0, align 4
> +  %b = load i32 addrspace(1)* %gep1, align 4
> +  %c = load i32 addrspace(1)* %gep2, align 4
> +  %icmp0 = icmp sgt i32 %a, %b
> +  %i0 = select i1 %icmp0, i32 %a, i32 %b
> +  %icmp1 = icmp sgt i32 %i0, %c
> +  %i1 = select i1 %icmp1, i32 %i0, i32 %c
> +  store i32 %i1, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_test_umax3_ugt_i32
> +; SI: V_MAX3_U32
> +define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
> +  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
> +  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
> +  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %a = load i32 addrspace(1)* %gep0, align 4
> +  %b = load i32 addrspace(1)* %gep1, align 4
> +  %c = load i32 addrspace(1)* %gep2, align 4
> +  %icmp0 = icmp ugt i32 %a, %b
> +  %i0 = select i1 %icmp0, i32 %a, i32 %b
> +  %icmp1 = icmp ugt i32 %i0, %c
> +  %i1 = select i1 %icmp1, i32 %i0, i32 %c
> +  store i32 %i1, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> Index: test/CodeGen/R600/min3.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/min3.ll
> @@ -0,0 +1,111 @@
> +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +
> +; FUNC-LABEL: @v_test_imin3_slt_i32
> +; SI: V_MIN3_I32
> +define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
> +  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
> +  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
> +  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %a = load i32 addrspace(1)* %gep0, align 4
> +  %b = load i32 addrspace(1)* %gep1, align 4
> +  %c = load i32 addrspace(1)* %gep2, align 4
> +  %icmp0 = icmp slt i32 %a, %b
> +  %i0 = select i1 %icmp0, i32 %a, i32 %b
> +  %icmp1 = icmp slt i32 %i0, %c
> +  %i1 = select i1 %icmp1, i32 %i0, i32 %c
> +  store i32 %i1, i32 addrspace(1)* %outgep, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_test_umin3_ult_i32
> +; SI: V_MIN3_U32
> +define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
> +  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
> +  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
> +  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %a = load i32 addrspace(1)* %gep0, align 4
> +  %b = load i32 addrspace(1)* %gep1, align 4
> +  %c = load i32 addrspace(1)* %gep2, align 4
> +  %icmp0 = icmp ult i32 %a, %b
> +  %i0 = select i1 %icmp0, i32 %a, i32 %b
> +  %icmp1 = icmp ult i32 %i0, %c
> +  %i1 = select i1 %icmp1, i32 %i0, i32 %c
> +  store i32 %i1, i32 addrspace(1)* %outgep, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_test_umin_umin_umin
> +; SI: V_MIN_I32
> +; SI: V_MIN3_I32
> +define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %tid2 = mul i32 %tid, 2
> +  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
> +  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
> +  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
> +
> +  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
> +  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
> +  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
> +
> +  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
> +
> +  %a = load i32 addrspace(1)* %gep0, align 4
> +  %b = load i32 addrspace(1)* %gep1, align 4
> +  %c = load i32 addrspace(1)* %gep2, align 4
> +  %d = load i32 addrspace(1)* %gep3, align 4
> +
> +  %icmp0 = icmp slt i32 %a, %b
> +  %i0 = select i1 %icmp0, i32 %a, i32 %b
> +
> +  %icmp1 = icmp slt i32 %c, %d
> +  %i1 = select i1 %icmp1, i32 %c, i32 %d
> +
> +  %icmp2 = icmp slt i32 %i0, %i1
> +  %i2 = select i1 %icmp2, i32 %i0, i32 %i1
> +
> +  store i32 %i2, i32 addrspace(1)* %outgep1, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_test_umin3_2_uses
> +; SI-NOT: V_MIN3
> +define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %tid2 = mul i32 %tid, 2
> +  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
> +  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
> +  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
> +
> +  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
> +  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
> +  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
> +
> +  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
> +  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
> +
> +  %a = load i32 addrspace(1)* %gep0, align 4
> +  %b = load i32 addrspace(1)* %gep1, align 4
> +  %c = load i32 addrspace(1)* %gep2, align 4
> +  %d = load i32 addrspace(1)* %gep3, align 4
> +
> +  %icmp0 = icmp slt i32 %a, %b
> +  %i0 = select i1 %icmp0, i32 %a, i32 %b
> +
> +  %icmp1 = icmp slt i32 %c, %d
> +  %i1 = select i1 %icmp1, i32 %c, i32 %d
> +
> +  %icmp2 = icmp slt i32 %i0, %c
> +  %i2 = select i1 %icmp2, i32 %i0, i32 %c
> +
> +  store i32 %i2, i32 addrspace(1)* %outgep0, align 4
> +  store i32 %i0, i32 addrspace(1)* %outgep1, align 4
> +  ret void
> +}

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits