[PATCH] R600: Handle ctpop

Tom Stellard tom at stellard.net
Tue Jun 10 10:35:21 PDT 2014


LGTM.

On Thu, Jun 05, 2014 at 06:52:48PM +0000, Matt Arsenault wrote:
> http://reviews.llvm.org/D4035
> 
> Files:
>   lib/Target/R600/AMDGPUISelLowering.cpp
>   lib/Target/R600/AMDGPUSubtarget.h
>   lib/Target/R600/AMDILISelLowering.cpp
>   lib/Target/R600/EvergreenInstructions.td
>   lib/Target/R600/SIInstrInfo.cpp
>   lib/Target/R600/SIInstrInfo.h
>   lib/Target/R600/SIInstrInfo.td
>   lib/Target/R600/SIInstructions.td
>   test/CodeGen/R600/ctpop.ll
>   test/CodeGen/R600/ctpop64.ll

> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -214,12 +214,23 @@
>    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
>    setOperationAction(ISD::UREM, MVT::i32, Expand);
>  
> +  if (!Subtarget->hasBCNT(32))
> +    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
> +
> +  if (!Subtarget->hasBCNT(64))
> +    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
> +
> +  for (MVT VT : { MVT::i32, MVT::i64 }) {
> +    setOperationAction(ISD::CTTZ, VT, Expand);
> +    setOperationAction(ISD::CTLZ, VT, Expand);
> +  }
> +
>    static const MVT::SimpleValueType IntTypes[] = {
>      MVT::v2i32, MVT::v4i32
>    };
>  
>    for (MVT VT : IntTypes) {
> -    //Expand the following operations for the current type by default
> +    // Expand the following operations for the current type by default.
>      setOperationAction(ISD::ADD,  VT, Expand);
>      setOperationAction(ISD::AND,  VT, Expand);
>      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
> @@ -237,6 +248,9 @@
>      setOperationAction(ISD::SELECT, VT, Expand);
>      setOperationAction(ISD::VSELECT, VT, Expand);
>      setOperationAction(ISD::XOR,  VT, Expand);
> +    setOperationAction(ISD::CTPOP, VT, Expand);
> +    setOperationAction(ISD::CTTZ, VT, Expand);
> +    setOperationAction(ISD::CTLZ, VT, Expand);
>    }
>  
>    static const MVT::SimpleValueType FloatTypes[] = {
> Index: lib/Target/R600/AMDGPUSubtarget.h
> ===================================================================
> --- lib/Target/R600/AMDGPUSubtarget.h
> +++ lib/Target/R600/AMDGPUSubtarget.h
> @@ -76,6 +76,14 @@
>      return hasBFE();
>    }
>  
> +  bool hasBCNT(unsigned Size) const {
> +    if (Size == 32)
> +      return (getGeneration() >= EVERGREEN);
> +
> +    assert(Size == 64);
> +    return (getGeneration() >= SOUTHERN_ISLANDS);
> +  }
> +
>    bool hasMulU24() const {
>      return (getGeneration() >= EVERGREEN);
>    }
> Index: lib/Target/R600/AMDILISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDILISelLowering.cpp
> +++ lib/Target/R600/AMDILISelLowering.cpp
> @@ -125,11 +125,6 @@
>      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
>  
>      setOperationAction(ISD::BSWAP, VT, Expand);
> -
> -    // GPU doesn't have any counting operators
> -    setOperationAction(ISD::CTPOP, VT, Expand);
> -    setOperationAction(ISD::CTTZ, VT, Expand);
> -    setOperationAction(ISD::CTLZ, VT, Expand);
>    }
>  
>    for (MVT VT : VectorTypes) {
> Index: lib/Target/R600/EvergreenInstructions.td
> ===================================================================
> --- lib/Target/R600/EvergreenInstructions.td
> +++ lib/Target/R600/EvergreenInstructions.td
> @@ -326,6 +326,8 @@
>  def DOT4_eg : DOT4_Common<0xBE>;
>  defm CUBE_eg : CUBE_Common<0xC0>;
>  
> +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop>;
> +
>  let hasSideEffects = 1 in {
>    def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
>  }
> Index: lib/Target/R600/SIInstrInfo.cpp
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.cpp
> +++ lib/Target/R600/SIInstrInfo.cpp
> @@ -667,6 +667,7 @@
>    case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
>    case AMDGPU::S_LOAD_DWORDX4_IMM:
>    case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
> +  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
>    }
>  }
>  
> @@ -1176,6 +1177,11 @@
>        Inst->eraseFromParent();
>        continue;
>  
> +    case AMDGPU::S_BCNT1_I32_B64:
> +      splitScalar64BitBCNT(Worklist, Inst);
> +      Inst->eraseFromParent();
> +      continue;
> +
>      case AMDGPU::S_BFE_U64:
>      case AMDGPU::S_BFE_I64:
>      case AMDGPU::S_BFM_B64:
> @@ -1217,6 +1223,10 @@
>        // 3 to not hit an assertion later in MCInstLower.
>        Inst->addOperand(MachineOperand::CreateImm(0));
>        Inst->addOperand(MachineOperand::CreateImm(0));
> +    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
> +      // The VALU version adds the second operand to the result, so insert an
> +      // extra 0 operand.
> +      Inst->addOperand(MachineOperand::CreateImm(0));
>      }
>  
>      addDescImplicitUseDef(NewDesc, Inst);
> @@ -1360,6 +1370,46 @@
>    Worklist.push_back(HiHalf);
>  }
>  
> +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
> +                                       MachineInstr *Inst) const {
> +  MachineBasicBlock &MBB = *Inst->getParent();
> +  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
> +
> +  MachineBasicBlock::iterator MII = Inst;
> +  DebugLoc DL = Inst->getDebugLoc();
> +
> +  MachineOperand &Dest = Inst->getOperand(0);
> +  MachineOperand &Src = Inst->getOperand(1);
> +
> +  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
> +  const TargetRegisterClass *SrcRC = Src.isReg() ?
> +    MRI.getRegClass(Src.getReg()) :
> +    &AMDGPU::SGPR_32RegClass;
> +
> +  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> +  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
> +
> +  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> +                                                      AMDGPU::sub0, SrcSubRC);
> +  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> +                                                      AMDGPU::sub1, SrcSubRC);
> +
> +  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
> +    .addOperand(SrcRegSub0)
> +    .addImm(0);
> +
> +  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
> +    .addOperand(SrcRegSub1)
> +    .addReg(MidReg);
> +
> +  MRI.replaceRegWith(Dest.getReg(), ResultReg);
> +
> +  Worklist.push_back(First);
> +  Worklist.push_back(Second);
> +}
> +
>  void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
>                                          MachineInstr *Inst) const {
>    // Add the implict and explicit register definitions.
> Index: lib/Target/R600/SIInstrInfo.h
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.h
> +++ lib/Target/R600/SIInstrInfo.h
> @@ -44,9 +44,12 @@
>                           const TargetRegisterClass *RC,
>                           const MachineOperand &Op) const;
>  
> -  void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
> +  void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>                            MachineInstr *Inst, unsigned Opcode) const;
>  
> +  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
> +                            MachineInstr *Inst) const;
> +
>    void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
>  
>  public:
> Index: lib/Target/R600/SIInstrInfo.td
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.td
> +++ lib/Target/R600/SIInstrInfo.td
> @@ -187,6 +187,12 @@
>    opName#" $dst, $src0", pattern
>  >;
>  
> +// 64-bit input, 32-bit output.
> +class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
> +  op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
> +  opName#" $dst, $src0", pattern
> +>;
> +
>  class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
>    op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
>    opName#" $dst, $src0, $src1", pattern
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -105,8 +105,11 @@
>  
>  ////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
>  ////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
> -////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
> -////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
> +def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
> +  [(set i32:$dst, (ctpop i32:$src0))]
> +>;
> +def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
> +
>  ////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
>  ////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
>  ////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
> @@ -1223,7 +1226,7 @@
>  defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
>  defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
>  defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
> -//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
> +defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
>  defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
>  defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
>  
> @@ -2478,6 +2481,18 @@
>    (S_ADD_I32 $src0, $src1)
>  >;
>  
> +def : Pat <
> +  (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
> +  (V_BCNT_U32_B32_e32 $popcnt, $val)
> +>;
> +
> +def : Pat <
> +  (i64 (ctpop i64:$src)),
> +  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
> +    (S_BCNT1_I32_B64 $src), sub0),
> +    (S_MOV_B32 0), sub1)
> +>;
> +
>  //============================================================================//
>  // Miscellaneous Optimization Patterns
>  //============================================================================//
> Index: test/CodeGen/R600/ctpop.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/ctpop.ll
> @@ -0,0 +1,254 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
> +
> +declare i32 @llvm.ctpop.i32(i32) nounwind readnone
> +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
> +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
> +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
> +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
> +
> +; FUNC-LABEL: @s_ctpop_i32:
> +; SI: S_LOAD_DWORD [[SVAL:s[0-9]+]],
> +; SI: S_BCNT1_I32_B32 [[SRESULT:s[0-9]+]], [[SVAL]]
> +; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
> +; SI: BUFFER_STORE_DWORD [[VRESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  store i32 %ctpop, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; XXX - Why 0 in register?
> +; FUNC-LABEL: @v_ctpop_i32:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  store i32 %ctpop, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_add_chain_i32
> +; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
> +; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
> +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> +; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
> +; SI-NOT: ADD
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
> +  %val0 = load i32 addrspace(1)* %in0, align 4
> +  %val1 = load i32 addrspace(1)* %in1, align 4
> +  %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
> +  %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
> +  %add = add i32 %ctpop0, %ctpop1
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v2i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
> +  %val = load <2 x i32> addrspace(1)* %in, align 8
> +  %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
> +  store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v4i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
> +  %val = load <4 x i32> addrspace(1)* %in, align 16
> +  %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
> +  store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v8i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
> +  %val = load <8 x i32> addrspace(1)* %in, align 32
> +  %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
> +  store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v16i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
> +  %val = load <16 x i32> addrspace(1)* %in, align 32
> +  %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
> +  store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_inline_constant:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  %add = add i32 %ctpop, 4
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_inline_constant_inv:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  %add = add i32 4, %ctpop
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_literal:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_MOV_B32_e32 [[LIT:v[0-9]+]], 0x1869f
> +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  %add = add i32 %ctpop, 99999
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_var:
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  %add = add i32 %ctpop, %const
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_var_inv:
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  %add = add i32 %const, %ctpop
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], {{.*}} + 0x0
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} + 0x10
> +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
> +  %val = load i32 addrspace(1)* %in, align 4
> +  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> +  %gep = getelementptr i32 addrspace(1)* %constptr, i32 4
> +  %const = load i32 addrspace(1)* %gep, align 4
> +  %add = add i32 %const, %ctpop
> +  store i32 %add, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> Index: test/CodeGen/R600/ctpop64.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/ctpop64.ll
> @@ -0,0 +1,91 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare i64 @llvm.ctpop.i64(i64) nounwind readnone
> +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
> +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
> +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
> +declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
> +
> +; FUNC-LABEL: @s_ctpop_i64:
> +; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
> +; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
> +; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
> +; SI: BUFFER_STORE_DWORD [[VRESULT]],
> +; SI: S_ENDPGM
> +define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
> +  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
> +  %truncctpop = trunc i64 %ctpop to i32
> +  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i64:
> +; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
> +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> +; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
> +; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> +  %val = load i64 addrspace(1)* %in, align 8
> +  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
> +  %truncctpop = trunc i64 %ctpop to i32
> +  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @s_ctpop_v2i64:
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_ENDPGM
> +define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
> +  %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
> +  %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
> +  store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @s_ctpop_v4i64:
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_ENDPGM
> +define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
> +  %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
> +  %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
> +  store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v2i64:
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: S_ENDPGM
> +define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
> +  %val = load <2 x i64> addrspace(1)* %in, align 16
> +  %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
> +  %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
> +  store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v4i64:
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: S_ENDPGM
> +define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
> +  %val = load <4 x i64> addrspace(1)* %in, align 32
> +  %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
> +  %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
> +  store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
> +  ret void
> +}

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits




More information about the llvm-commits mailing list