[PATCH] R600: Handle ctpop
Tom Stellard
tom at stellard.net
Tue Jun 10 10:35:21 PDT 2014
LGTM.
On Thu, Jun 05, 2014 at 06:52:48PM +0000, Matt Arsenault wrote:
> http://reviews.llvm.org/D4035
>
> Files:
> lib/Target/R600/AMDGPUISelLowering.cpp
> lib/Target/R600/AMDGPUSubtarget.h
> lib/Target/R600/AMDILISelLowering.cpp
> lib/Target/R600/EvergreenInstructions.td
> lib/Target/R600/SIInstrInfo.cpp
> lib/Target/R600/SIInstrInfo.h
> lib/Target/R600/SIInstrInfo.td
> lib/Target/R600/SIInstructions.td
> test/CodeGen/R600/ctpop.ll
> test/CodeGen/R600/ctpop64.ll
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -214,12 +214,23 @@
> setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
> setOperationAction(ISD::UREM, MVT::i32, Expand);
>
> + if (!Subtarget->hasBCNT(32))
> + setOperationAction(ISD::CTPOP, MVT::i32, Expand);
> +
> + if (!Subtarget->hasBCNT(64))
> + setOperationAction(ISD::CTPOP, MVT::i64, Expand);
> +
> + for (MVT VT : { MVT::i32, MVT::i64 }) {
> + setOperationAction(ISD::CTTZ, VT, Expand);
> + setOperationAction(ISD::CTLZ, VT, Expand);
> + }
> +
> static const MVT::SimpleValueType IntTypes[] = {
> MVT::v2i32, MVT::v4i32
> };
>
> for (MVT VT : IntTypes) {
> - //Expand the following operations for the current type by default
> + // Expand the following operations for the current type by default.
> setOperationAction(ISD::ADD, VT, Expand);
> setOperationAction(ISD::AND, VT, Expand);
> setOperationAction(ISD::FP_TO_SINT, VT, Expand);
> @@ -237,6 +248,9 @@
> setOperationAction(ISD::SELECT, VT, Expand);
> setOperationAction(ISD::VSELECT, VT, Expand);
> setOperationAction(ISD::XOR, VT, Expand);
> + setOperationAction(ISD::CTPOP, VT, Expand);
> + setOperationAction(ISD::CTTZ, VT, Expand);
> + setOperationAction(ISD::CTLZ, VT, Expand);
> }
>
> static const MVT::SimpleValueType FloatTypes[] = {
> Index: lib/Target/R600/AMDGPUSubtarget.h
> ===================================================================
> --- lib/Target/R600/AMDGPUSubtarget.h
> +++ lib/Target/R600/AMDGPUSubtarget.h
> @@ -76,6 +76,14 @@
> return hasBFE();
> }
>
> + bool hasBCNT(unsigned Size) const {
> + if (Size == 32)
> + return (getGeneration() >= EVERGREEN);
> +
> + assert(Size == 64);
> + return (getGeneration() >= SOUTHERN_ISLANDS);
> + }
> +
> bool hasMulU24() const {
> return (getGeneration() >= EVERGREEN);
> }
> Index: lib/Target/R600/AMDILISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDILISelLowering.cpp
> +++ lib/Target/R600/AMDILISelLowering.cpp
> @@ -125,11 +125,6 @@
> setOperationAction(ISD::UMUL_LOHI, VT, Expand);
>
> setOperationAction(ISD::BSWAP, VT, Expand);
> -
> - // GPU doesn't have any counting operators
> - setOperationAction(ISD::CTPOP, VT, Expand);
> - setOperationAction(ISD::CTTZ, VT, Expand);
> - setOperationAction(ISD::CTLZ, VT, Expand);
> }
>
> for (MVT VT : VectorTypes) {
> Index: lib/Target/R600/EvergreenInstructions.td
> ===================================================================
> --- lib/Target/R600/EvergreenInstructions.td
> +++ lib/Target/R600/EvergreenInstructions.td
> @@ -326,6 +326,8 @@
> def DOT4_eg : DOT4_Common<0xBE>;
> defm CUBE_eg : CUBE_Common<0xC0>;
>
> +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop>;
> +
> let hasSideEffects = 1 in {
> def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
> }
> Index: lib/Target/R600/SIInstrInfo.cpp
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.cpp
> +++ lib/Target/R600/SIInstrInfo.cpp
> @@ -667,6 +667,7 @@
> case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
> case AMDGPU::S_LOAD_DWORDX4_IMM:
> case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
> + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
> }
> }
>
> @@ -1176,6 +1177,11 @@
> Inst->eraseFromParent();
> continue;
>
> + case AMDGPU::S_BCNT1_I32_B64:
> + splitScalar64BitBCNT(Worklist, Inst);
> + Inst->eraseFromParent();
> + continue;
> +
> case AMDGPU::S_BFE_U64:
> case AMDGPU::S_BFE_I64:
> case AMDGPU::S_BFM_B64:
> @@ -1217,6 +1223,10 @@
> // 3 to not hit an assertion later in MCInstLower.
> Inst->addOperand(MachineOperand::CreateImm(0));
> Inst->addOperand(MachineOperand::CreateImm(0));
> + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
> + // The VALU version adds the second operand to the result, so insert an
> + // extra 0 operand.
> + Inst->addOperand(MachineOperand::CreateImm(0));
> }
>
> addDescImplicitUseDef(NewDesc, Inst);
> @@ -1360,6 +1370,46 @@
> Worklist.push_back(HiHalf);
> }
>
> +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
> + MachineInstr *Inst) const {
> + MachineBasicBlock &MBB = *Inst->getParent();
> + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
> +
> + MachineBasicBlock::iterator MII = Inst;
> + DebugLoc DL = Inst->getDebugLoc();
> +
> + MachineOperand &Dest = Inst->getOperand(0);
> + MachineOperand &Src = Inst->getOperand(1);
> +
> + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
> + const TargetRegisterClass *SrcRC = Src.isReg() ?
> + MRI.getRegClass(Src.getReg()) :
> + &AMDGPU::SGPR_32RegClass;
> +
> + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
> +
> + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> + AMDGPU::sub0, SrcSubRC);
> + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> + AMDGPU::sub1, SrcSubRC);
> +
> + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
> + .addOperand(SrcRegSub0)
> + .addImm(0);
> +
> + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
> + .addOperand(SrcRegSub1)
> + .addReg(MidReg);
> +
> + MRI.replaceRegWith(Dest.getReg(), ResultReg);
> +
> + Worklist.push_back(First);
> + Worklist.push_back(Second);
> +}
> +
> void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
> MachineInstr *Inst) const {
> // Add the implict and explicit register definitions.
> Index: lib/Target/R600/SIInstrInfo.h
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.h
> +++ lib/Target/R600/SIInstrInfo.h
> @@ -44,9 +44,12 @@
> const TargetRegisterClass *RC,
> const MachineOperand &Op) const;
>
> - void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
> + void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
> MachineInstr *Inst, unsigned Opcode) const;
>
> + void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
> + MachineInstr *Inst) const;
> +
> void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
>
> public:
> Index: lib/Target/R600/SIInstrInfo.td
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.td
> +++ lib/Target/R600/SIInstrInfo.td
> @@ -187,6 +187,12 @@
> opName#" $dst, $src0", pattern
> >;
>
> +// 64-bit input, 32-bit output.
> +class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
> + op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
> + opName#" $dst, $src0", pattern
> +>;
> +
> class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
> op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
> opName#" $dst, $src0, $src1", pattern
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -105,8 +105,11 @@
>
> ////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
> ////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
> -////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
> -////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
> +def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
> + [(set i32:$dst, (ctpop i32:$src0))]
> +>;
> +def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
> +
> ////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
> ////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
> ////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
> @@ -1223,7 +1226,7 @@
> defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
> defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
> defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
> -//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
> +defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
> defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
> defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
>
> @@ -2478,6 +2481,18 @@
> (S_ADD_I32 $src0, $src1)
> >;
>
> +def : Pat <
> + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
> + (V_BCNT_U32_B32_e32 $popcnt, $val)
> +>;
> +
> +def : Pat <
> + (i64 (ctpop i64:$src)),
> + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
> + (S_BCNT1_I32_B64 $src), sub0),
> + (S_MOV_B32 0), sub1)
> +>;
> +
> //============================================================================//
> // Miscellaneous Optimization Patterns
> //============================================================================//
> Index: test/CodeGen/R600/ctpop.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/ctpop.ll
> @@ -0,0 +1,254 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
> +
> +declare i32 @llvm.ctpop.i32(i32) nounwind readnone
> +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
> +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
> +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
> +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
> +
> +; FUNC-LABEL: @s_ctpop_i32:
> +; SI: S_LOAD_DWORD [[SVAL:s[0-9]+]],
> +; SI: S_BCNT1_I32_B32 [[SRESULT:s[0-9]+]], [[SVAL]]
> +; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
> +; SI: BUFFER_STORE_DWORD [[VRESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + store i32 %ctpop, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; XXX - Why is the 0 materialized in a register (V_MOV_B32) instead of being used as an inline constant operand?
> +; FUNC-LABEL: @v_ctpop_i32:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + store i32 %ctpop, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_add_chain_i32
> +; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
> +; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
> +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> +; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
> +; SI-NOT: ADD
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
> + %val0 = load i32 addrspace(1)* %in0, align 4
> + %val1 = load i32 addrspace(1)* %in1, align 4
> + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
> + %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
> + %add = add i32 %ctpop0, %ctpop1
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v2i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
> + %val = load <2 x i32> addrspace(1)* %in, align 8
> + %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
> + store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v4i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
> + %val = load <4 x i32> addrspace(1)* %in, align 16
> + %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
> + store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v8i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
> + %val = load <8 x i32> addrspace(1)* %in, align 32
> + %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
> + store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v16i32:
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: V_BCNT_U32_B32_e32
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +; EG: BCNT_INT
> +define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
> + %val = load <16 x i32> addrspace(1)* %in, align 32
> + %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
> + store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_inline_constant:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + %add = add i32 %ctpop, 4
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_inline_constant_inv:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + %add = add i32 4, %ctpop
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_literal:
> +; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI: V_MOV_B32_e32 [[LIT:v[0-9]+]], 0x1869f
> +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + %add = add i32 %ctpop, 99999
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_var:
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + %add = add i32 %ctpop, %const
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_var_inv:
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
> +; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
> +; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + %add = add i32 %const, %ctpop
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], {{.*}} + 0x0
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} + 0x10
> +; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +
> +; EG: BCNT_INT
> +define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
> + %val = load i32 addrspace(1)* %in, align 4
> + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
> + %gep = getelementptr i32 addrspace(1)* %constptr, i32 4
> + %const = load i32 addrspace(1)* %gep, align 4
> + %add = add i32 %const, %ctpop
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> Index: test/CodeGen/R600/ctpop64.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/ctpop64.ll
> @@ -0,0 +1,91 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare i64 @llvm.ctpop.i64(i64) nounwind readnone
> +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
> +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
> +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
> +declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
> +
> +; FUNC-LABEL: @s_ctpop_i64:
> +; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
> +; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
> +; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
> +; SI: BUFFER_STORE_DWORD [[VRESULT]],
> +; SI: S_ENDPGM
> +define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
> + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
> + %truncctpop = trunc i64 %ctpop to i32
> + store i32 %truncctpop, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_i64:
> +; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
> +; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> +; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
> +; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> + %val = load i64 addrspace(1)* %in, align 8
> + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
> + %truncctpop = trunc i64 %ctpop to i32
> + store i32 %truncctpop, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @s_ctpop_v2i64:
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_ENDPGM
> +define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
> + %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
> + %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
> + store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @s_ctpop_v4i64:
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_BCNT1_I32_B64
> +; SI: S_ENDPGM
> +define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
> + %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
> + %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
> + store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v2i64:
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: S_ENDPGM
> +define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
> + %val = load <2 x i64> addrspace(1)* %in, align 16
> + %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
> + %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
> + store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctpop_v4i64:
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: V_BCNT_U32_B32
> +; SI: S_ENDPGM
> +define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
> + %val = load <4 x i64> addrspace(1)* %in, align 32
> + %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
> + %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
> + store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
> + ret void
> +}
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list