PATCH: R600: 64-bit division
Matt Arsenault
Matthew.Arsenault at amd.com
Fri Sep 19 11:47:13 PDT 2014
On 09/17/2014 11:21 AM, Tom Stellard wrote:
> Hi,
>
> The attached series adds a pass for lowering 64-bit division in the R600
> backend and also fixes some bugs uncovered along the way.
>
> This new pass replaces the old 64-bit div lowering used for Evergreen/NI
> subtargets, which was found to have some bugs.
>
> -Tom
>
> 0001-R600-SI-Use-ISD-MUL-instead-of-ISD-UMULO-when-loweri.patch
>
>
> From ede048f49e8e550176c567c0bfa1bd3679189c10 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 10:35:23 -0400
> Subject: [PATCH 1/8] R600/SI: Use ISD::MUL instead of ISD::UMULO when lowering
> division
>
> ISD::MUL and ISD::UMULO are the same except that UMULO sets an overflow
> bit. Since we aren't using the overflow bit, we should use ISD::MUL.
> ---
> lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index d7f12ef..293a89d 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -1510,8 +1510,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
> // e is rounding error.
> SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
>
> - // RCP_LO = umulo(RCP, Den) */
> - SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
> + // RCP_LO = mul(RCP, Den) */
> + SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
>
> // RCP_HI = mulhu (RCP, Den) */
> SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
> @@ -1542,7 +1542,7 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
> SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
>
> // Num_S_Remainder = Quotient * Den
> - SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
> + SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
>
> // Remainder = Num - Num_S_Remainder
> SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
> -- 1.8.5.5
>
> 0002-R600-Don-t-set-BypassSlowDiv-for-64-bit-division.patch
>
>
> From 74a83f33e1df72b942fdbe12701cad75750f50f7 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 15 Sep 2014 12:21:35 -0400
> Subject: [PATCH 2/8] R600: Don't set BypassSlowDiv for 64-bit division
>
> BypassSlowDiv is used by codegen prepare to insert a run-time
> check to see if the operands to a 64-bit division are really 32-bit
> values, and if they are, it will do 32-bit division instead.
>
> This is not useful for R600, which has predicated control flow, since
> both the 32-bit and 64-bit paths will be executed in most cases. It
> also increases code size, which can lead to more instruction cache
> misses.
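(For reference, the bypass transform effectively turns a 64-bit udiv
into something like this sketch, in C terms:

  // Runtime check inserted by codegen prepare (sketch):
  if (((a | b) >> 32) == 0)         // both operands fit in 32 bits
    q = (uint32_t)a / (uint32_t)b;  // fast 32-bit path
  else
    q = a / b;                      // full 64-bit path

so under predication you end up paying for both paths plus the check.)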
> ---
> lib/Target/R600/AMDGPUISelLowering.cpp | 3 ---
> 1 file changed, 3 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index 293a89d..f353c94 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -388,9 +388,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
> setIntDivIsCheap(false);
> setPow2SDivIsCheap(false);
>
> - // TODO: Investigate this when 64-bit divides are implemented.
> - addBypassSlowDiv(64, 32);
> -
> // FIXME: Need to really handle these.
> MaxStoresPerMemcpy = 4096;
> MaxStoresPerMemmove = 4096;
> -- 1.8.5.5
>
LGTM
> 0003-R600-SI-Use-isOperandLegal-to-simplify-legalization-.patch
>
>
> From 835d8f79d0491611566ac124fe072f8f85723cb3 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:19:11 -0400
> Subject: [PATCH 3/8] R600/SI: Use isOperandLegal() to simplify legalization of
> VOP3 instructions
>
> ---
> lib/Target/R600/SIInstrInfo.cpp | 27 +++------------------------
> 1 file changed, 3 insertions(+), 24 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 294aa70..1b90d41 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -1174,33 +1174,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
> // Legalize VOP3
> if (isVOP3(MI->getOpcode())) {
> int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
> - unsigned SGPRReg = AMDGPU::NoRegister;
> for (unsigned i = 0; i < 3; ++i) {
> int Idx = VOP3Idx[i];
> - if (Idx == -1)
> - continue;
> - MachineOperand &MO = MI->getOperand(Idx);
> -
> - if (MO.isReg()) {
> - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
> - continue; // VGPRs are legal
> -
> - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
> -
> - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
> - SGPRReg = MO.getReg();
> - // We can use one SGPR in each VOP3 instruction.
> - continue;
> - }
This looks like it checks that only one SGPR is used, but I don't
think isOperandLegal() can check that while only looking at one
operand at a time. This loop could also be improved to pick the
operand that requires the fewest moves (e.g. for "inst s0, s1, s1" I
think this would end up finding s0 first and inserting moves for both
copies of s1).
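Something like this might work instead (untested sketch; findUsedSGPR
is a hypothetical new helper that picks the SGPR appearing in the most
operands):

  // Keep one SGPR (ideally the most frequently used one); any other
  // SGPR operand, and any literal constant, still needs a move.
  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); // hypothetical helper
  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      continue;
    MachineOperand &MO = MI->getOperand(Idx);
    if (MO.isReg()) {
      if (MO.getReg() != SGPRReg &&
          RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
        legalizeOpWithMove(MI, Idx);
    } else if (isLiteralConstant(MO)) {
      // VOP3 can't encode literal constants either.
      legalizeOpWithMove(MI, Idx);
    }
  }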
> - } else if (!isLiteralConstant(MO)) {
> - // If it is not a register and not a literal constant, then it must be
> - // an inline constant which is always legal.
> - continue;
> - }
> - // If we make it this far, then the operand is not legal and we must
> - // legalize it.
> - legalizeOpWithMove(MI, Idx);
> + if (Idx != -1 && !isOperandLegal(MI, Idx))
> + legalizeOpWithMove(MI, Idx);
> }
> + return;
> }
>
> // Legalize REG_SEQUENCE and PHI
> -- 1.8.5.5
>
> 0004-R600-SI-Remove-modifier-operands-from-V_CNDMASK_B32_.patch
>
>
> From e3ee19a2997e7634991ee5950e8d4eb9e3448d97 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:19:50 -0400
> Subject: [PATCH 4/8] R600/SI: Remove modifier operands from V_CNDMASK_B32_e64
>
> Modifiers don't work for this instruction.
> ---
> lib/Target/R600/SIInstructions.td | 5 ++---
> lib/Target/R600/SILowerI1Copies.cpp | 6 +-----
> 2 files changed, 3 insertions(+), 8 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index d27ddf3..8cbdc55 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1239,9 +1239,8 @@ def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
> }
>
> def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
> - (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
> - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
> - "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
> + (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2),
> + "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2",
> [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
> > {
> let src0_modifiers = 0;
> diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
> index 1f0f24b..3ab0c2a 100644
> --- a/lib/Target/R600/SILowerI1Copies.cpp
> +++ b/lib/Target/R600/SILowerI1Copies.cpp
> @@ -127,11 +127,7 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
> .addOperand(MI.getOperand(0))
> .addImm(0)
> .addImm(-1)
> - .addOperand(MI.getOperand(1))
> - .addImm(0)
> - .addImm(0)
> - .addImm(0)
> - .addImm(0);
> + .addOperand(MI.getOperand(1));
> MI.eraseFromParent();
> } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
> SrcRC == &AMDGPU::VReg_1RegClass) {
> -- 1.8.5.5
LGTM
>
> 0005-R600-SI-Add-pattern-for-i64-ctlz_zero_undef.patch
>
>
> From 5600b629eb48753641ffac3cc73279b42e547e46 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:18:31 -0400
> Subject: [PATCH 5/8] R600/SI: Add pattern for i64 ctlz_zero_undef
>
> ---
> lib/Target/R600/SIInstrInfo.cpp | 127 +++++++++++++++++++++++++++++++++--
> lib/Target/R600/SIInstrInfo.h | 9 +++
> lib/Target/R600/SIInstructions.td | 24 +++++--
> test/CodeGen/R600/ctlz_zero_undef.ll | 33 +++++++++
> 4 files changed, 182 insertions(+), 11 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 1b90d41..ae9cbe9 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -510,7 +510,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
> // This is just a placeholder for register allocation.
> MI->eraseFromParent();
> break;
> +
> + case AMDGPU::S_CTLZ_ZERO_UNDEF_B32_B64:
> + MI->setDesc(get(AMDGPU::S_FLBIT_I32_B64));
> + return false;
> }
> +
> return true;
> }
>
> @@ -1556,6 +1561,19 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
> }
> }
>
> +void SIInstrInfo::getUsesToMoveToVALU(unsigned Reg,
> + const MachineRegisterInfo &MRI,
> + SmallVectorImpl<MachineInstr *> &Worklist) const {
> +
> + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
> + E = MRI.use_end(); I != E; ++I) {
I think you can use a range-based for loop over MRI.use_operands(Reg)
here instead, e.g. (untested):
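  for (MachineOperand &MO : MRI.use_operands(Reg)) {
    MachineInstr &UseMI = *MO.getParent();
    if (!canReadVGPR(UseMI, UseMI.getOperandNo(&MO)))
      Worklist.push_back(&UseMI);
  }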
> + MachineInstr &UseMI = *I->getParent();
> + if (!canReadVGPR(UseMI, I.getOperandNo())) {
> + Worklist.push_back(&UseMI);
> + }
> + }
> +}
> +
> void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
> SmallVector<MachineInstr *, 128> Worklist;
> Worklist.push_back(&TopInst);
> @@ -1624,6 +1642,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
> Inst->eraseFromParent();
> continue;
>
> + case AMDGPU::S_CTLZ_ZERO_UNDEF_B32_B64:
> + splitScalar64BitFLBIT(Worklist, Inst, true);
> + Inst->eraseFromParent();
> + continue;
> +
> + case AMDGPU::S_FLBIT_I32_B64:
> + splitScalar64BitFLBIT(Worklist, Inst);
> + Inst->eraseFromParent();
> + continue;
> +
> case AMDGPU::S_BFE_U64:
> case AMDGPU::S_BFE_I64:
> case AMDGPU::S_BFM_B64:
> @@ -1710,13 +1738,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
> // Legalize the operands
> legalizeOperands(Inst);
>
> - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
> - E = MRI.use_end(); I != E; ++I) {
> - MachineInstr &UseMI = *I->getParent();
> - if (!canReadVGPR(UseMI, I.getOperandNo())) {
> - Worklist.push_back(&UseMI);
> - }
> - }
> + getUsesToMoveToVALU(NewDstReg, MRI, Worklist);
> }
> }
>
> @@ -1890,6 +1912,97 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
> Worklist.push_back(Second);
> }
>
> +void SIInstrInfo::splitScalar64BitFLBIT(SmallVectorImpl<MachineInstr*> &Worklist,
> + MachineInstr *Inst,
> + bool IsZeroUndef) const {
> + MachineBasicBlock &MBB = *Inst->getParent();
> + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
> +
> + MachineBasicBlock::iterator MII = Inst;
> + DebugLoc DL = Inst->getDebugLoc();
> +
> + MachineOperand &Dest = Inst->getOperand(0);
> + MachineOperand &Src = Inst->getOperand(1);
> +
> + const TargetRegisterClass *SrcRC = Src.isReg() ?
> + MRI.getRegClass(Src.getReg()) :
> + &AMDGPU::SGPR_64RegClass;
> +
> + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
> +
> + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> + AMDGPU::sub0, SrcSubRC);
> + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> + AMDGPU::sub1, SrcSubRC);
> +
> +
> + unsigned HiResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> + unsigned LoResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> + unsigned IsHiZeroReg= MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
> + unsigned LHSReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> + unsigned RHSReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> + unsigned DstFinalReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> + // S_FLBIT_I32_B64 src0
> + //
> + // if (src0.hi == 0) {
> + // dst = V_FFBH_U32 src0.lo + 32
> + // } else {
> + // dst = V_FFBH_U32 src0.hi + 0;
> + // }
> + //
> + // if (src0 == 0) {
> + // dst = -1;
> + // } else {
> + // dst = dst;
> + // }
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_FFBH_U32_e32), HiResultReg)
> + .addReg(SrcRegSub1.getReg());
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_FFBH_U32_e32), LoResultReg)
> + .addReg(SrcRegSub0.getReg());
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_CMP_EQ_U32_e64), IsHiZeroReg)
> + .addImm(0)
> + .addReg(SrcRegSub1.getReg());
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), LHSReg)
> + .addReg(HiResultReg)
> + .addReg(LoResultReg)
> + .addReg(IsHiZeroReg);
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), RHSReg)
> + .addImm(0)
> + .addImm(32)
> + .addReg(IsHiZeroReg);
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_I32_e32), DstReg)
> + .addReg(LHSReg)
> + .addReg(RHSReg);
> +
> + if (!IsZeroUndef) {
> + unsigned IsSrcZeroReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_CMP_EQ_U64_e64), IsSrcZeroReg)
> + .addImm(0)
> + .addReg(Src.getReg());
> +
> + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstFinalReg)
> + .addReg(DstReg)
> + .addImm(-1)
> + .addReg(IsSrcZeroReg);
> + } else {
> + DstFinalReg = DstReg;
> + }
> +
> + MRI.replaceRegWith(Dest.getReg(), DstFinalReg);
> +
> + getUsesToMoveToVALU(DstFinalReg, MRI, Worklist);
> +}
> +
> void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
> MachineInstr *Inst) const {
> // Add the implict and explicit register definitions.
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index a32318a..be8776d 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -53,6 +53,10 @@ private:
> void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
> MachineInstr *Inst) const;
>
> + void splitScalar64BitFLBIT(SmallVectorImpl<MachineInstr *> &Worklist,
> + MachineInstr *Inst,
> + bool IsZeroUndef = false) const;
> +
> void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
>
> public:
> @@ -182,6 +186,11 @@ public:
>
> void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
>
> + /// \brief Look at all the uses of \p Reg and add use instructions that need
> + /// to be moved to the VALU to \p Worklist.
> + void getUsesToMoveToVALU(unsigned Reg, const MachineRegisterInfo &MRI,
> + SmallVectorImpl<MachineInstr *> &Worklist) const;
> +
> /// \brief Replace this instruction's opcode with the equivalent VALU
> /// opcode. This function will also move the users of \p MI to the
> /// VALU if necessary.
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 8cbdc55..88147bf 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -119,7 +119,7 @@ def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
> [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
> >;
>
> -//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
> +def S_FLBIT_I32_B64 : SOP1_32_64 <0x00000016, "S_FLBIT_I32_B64", []>;
> def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
> //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
> def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
> @@ -287,6 +287,19 @@ def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
> //def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
> def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
>
> +
> +let isPseudo = 1 in {
> +
> +// We select ctlz_zero_undef to this pseudo instruction rather
> +// than S_FLBIT_I32_B64, so that in the event we need to move it to
> +// the VGPR, we can produce a more optimized VALU version since we
Typo, VGPR -> VALU
> +// know that zero inputs are undefined.
> +def S_CTLZ_ZERO_UNDEF_B32_B64 : SOP2 <0,
> + (outs SReg_32:$dst), (ins SReg_64:$src0), "", []
> +>;
> +
> +}
> +
> //===----------------------------------------------------------------------===//
> // SOPC Instructions
> //===----------------------------------------------------------------------===//
> @@ -1845,13 +1858,16 @@ def : Pat <
> // SOP1 Patterns
> //===----------------------------------------------------------------------===//
>
> -def : Pat <
> - (i64 (ctpop i64:$src)),
> +class Sop3264Pat <SDNode node, Instruction inst> : Pat <
The naming convention elsewhere (e.g. SOP1_32_64 above) seems to be to
put an underscore between the numbers.
> + (i64 (node i64:$src)),
> (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
> - (S_BCNT1_I32_B64 $src), sub0),
> + (inst $src), sub0),
> (S_MOV_B32 0), sub1)
> >;
>
> +def : Sop3264Pat <ctpop, S_BCNT1_I32_B64>;
> +def : Sop3264Pat <ctlz_zero_undef, S_CTLZ_ZERO_UNDEF_B32_B64>;
> +
> //===----------------------------------------------------------------------===//
> // SOP2 Patterns
> //===----------------------------------------------------------------------===//
> diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
> index 1340ef9..1740bd9 100644
> --- a/test/CodeGen/R600/ctlz_zero_undef.ll
> +++ b/test/CodeGen/R600/ctlz_zero_undef.ll
> @@ -4,6 +4,9 @@
> declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
> declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
> declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
> +declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +
>
> ; FUNC-LABEL: @s_ctlz_zero_undef_i32:
> ; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
> @@ -68,3 +71,33 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
> store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
> ret void
> }
> +
> +; FUNC-LABEL: @v_ctlz_zero_undef_i64:
> +; SI: S_FLBIT_I32_B64
> +; EG: FFBH_UINT
> +; EG: FFBH_UINT
> +define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
> + %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
> + store i64 %ctlz, i64 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @v_ctlz_zero_undef_i64_vgpr:
> +; SI-DAG: V_FFBH_U32_e32
> +; SI-DAG: V_FFBH_U32_e32
> +; SI-DAG: V_CMP_EQ_U32_e64
> +; SI: V_CNDMASK_B32_e64
> +; SI: V_CNDMASK_B32_e64
> +; SI: V_ADD_I32_e32
> +; SI-NOT: V_CNDMASK_B32_e64
> +; SI: S_ENDPGM
> +; EG: FFBH_UINT
> +; EG: FFBH_UINT
> +define void @v_ctlz_zero_undef_i64_vgpr(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
> + %tidig = call i32 @llvm.r600.read.tidig.x()
> + %zext = zext i32 %tidig to i64
> + %input = add i64 %val, %zext
> + %ctlz = call i64 @llvm.ctlz.i64(i64 %input, i1 true) nounwind readnone
> + store i64 %ctlz, i64 addrspace(1)* %out, align 4
> + ret void
> +}
> -- 1.8.5.5
>
> 0006-IntegerDivision-Handle-vectors-in-expandDivision-and.patch
>
>
> From 6956e902b0aef46d6c71a6b58995aa22d62acd1f Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 14:23:42 -0400
> Subject: [PATCH 6/8] IntegerDivision: Handle vectors in expandDivision() and
> expandRemainder()
>
> This will be used and tested in a future commit to the R600 backend.
> ---
> lib/Transforms/Utils/IntegerDivision.cpp | 45 +++++++++++++++++++++++++++++---
> 1 file changed, 41 insertions(+), 4 deletions(-)
>
> diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
> index 9f91eeb..66133f6 100644
> --- a/lib/Transforms/Utils/IntegerDivision.cpp
> +++ b/lib/Transforms/Utils/IntegerDivision.cpp
> @@ -366,6 +366,31 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
> return Q_5;
> }
>
> +static void splitVector(BinaryOperator *I,
> + SmallVectorImpl<BinaryOperator*> &Scalars) {
> +
> + Type *Ty = I->getType();
> + unsigned NumElements = Ty->getVectorNumElements();
> +
> + IRBuilder<> Builder(I);
> +
> + Value *Op0 = I->getOperand(0);
> + Value *Op1 = I->getOperand(1);
> + Type *I32Ty = Type::getInt32Ty(I->getContext());
> + Value *Vec = UndefValue::get(Ty);
> + for (unsigned i = 0, e = NumElements; i != e; ++i) {
> + Value *Idx = Constant::getIntegerValue(I32Ty, APInt(32, i));
ConstantInt::get(I32Ty, i) is shorter, e.g.:
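  Value *Idx = ConstantInt::get(I32Ty, i);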
> + Value *LHS = Builder.CreateExtractElement(Op0, Idx);
> + Value *RHS = Builder.CreateExtractElement(Op1, Idx);
> + Value *Scalar = Builder.CreateBinOp(I->getOpcode(), LHS, RHS);
> + Vec = Builder.CreateInsertElement(Vec, Scalar, Idx);
> + Scalars.push_back(cast<BinaryOperator>(Scalar));
> + }
> + I->replaceAllUsesWith(Vec);
> + I->dropAllReferences();
> + I->eraseFromParent();
> +}
> +
> /// Generate code to calculate the remainder of two integers, replacing Rem with
> /// the generated code. This currently generates code using the udiv expansion,
> /// but future work includes generating more specialized code, e.g. when more
> @@ -381,8 +406,14 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {
> IRBuilder<> Builder(Rem);
>
> Type *RemTy = Rem->getType();
> - if (RemTy->isVectorTy())
> - llvm_unreachable("Div over vectors not supported");
> + if (RemTy->isVectorTy()) {
> + SmallVector<BinaryOperator*, 8> Scalars;
> + splitVector(Rem, Scalars);
> + for (BinaryOperator *ScalarRem : Scalars) {
> + expandRemainder(ScalarRem);
> + }
> + return true;
> + }
>
> unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
>
> @@ -439,8 +470,14 @@ bool llvm::expandDivision(BinaryOperator *Div) {
> IRBuilder<> Builder(Div);
>
> Type *DivTy = Div->getType();
> - if (DivTy->isVectorTy())
> - llvm_unreachable("Div over vectors not supported");
> + if (DivTy->isVectorTy()) {
> + SmallVector<BinaryOperator*, 8> Scalars;
> + splitVector(Div, Scalars);
> + for (BinaryOperator *ScalarDiv : Scalars) {
> + expandDivision(ScalarDiv);
> + }
> + return true;
> + }
>
> unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
>
> -- 1.8.5.5
>
> 0007-R600-Add-a-pass-for-expanding-64-bit-division.patch
>
>
> From 742ec95149d7c7d00438b80254b81ea0b0c1613f Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:21:56 -0400
> Subject: [PATCH 7/8] R600: Add a pass for expanding 64-bit division
Should this just be moved to the generic IR passes? The other
utilities already have a corresponding pass version; it's odd that
IntegerDivision doesn't, and there's nothing really AMDGPU-specific
here (other than needing a new target-provided check for whether a
type should be expanded).
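An untested sketch of what the generic pass could look like (all the
names here are made up, and the hardcoded i64 check is just a
placeholder for whatever target-provided query gets added):

  namespace {
  // Hypothetical generic wrapper around expandDivision()/expandRemainder().
  class ExpandIntegerDivision : public FunctionPass {
  public:
    static char ID;
    ExpandIntegerDivision() : FunctionPass(ID) {}

    bool runOnFunction(Function &F) override {
      // Collect first so the expansion doesn't invalidate iterators.
      SmallVector<BinaryOperator *, 8> Worklist;
      for (BasicBlock &BB : F)
        for (Instruction &I : BB)
          if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
            if (isDivRem(BO) && shouldExpand(BO))
              Worklist.push_back(BO);

      bool Changed = false;
      for (BinaryOperator *BO : Worklist) {
        if (BO->getOpcode() == Instruction::SDiv ||
            BO->getOpcode() == Instruction::UDiv)
          Changed |= expandDivision(BO);
        else
          Changed |= expandRemainder(BO);
      }
      return Changed;
    }

  private:
    static bool isDivRem(const BinaryOperator *BO) {
      switch (BO->getOpcode()) {
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::SRem:
      case Instruction::URem:
        return true;
      default:
        return false;
      }
    }

    // Placeholder for the target-provided check.
    static bool shouldExpand(const BinaryOperator *BO) {
      return BO->getType()->getScalarType()->isIntegerTy(64);
    }
  };
  }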
>
> ---
> lib/Target/R600/AMDGPU.h | 1 +
> lib/Target/R600/AMDGPUExpandDIVMOD.cpp | 107 ++++++++++++++++++++++++++++++++
> lib/Target/R600/AMDGPUTargetMachine.cpp | 1 +
> lib/Target/R600/CMakeLists.txt | 1 +
> test/CodeGen/R600/sdiv.ll | 43 +++++++------
> test/CodeGen/R600/sdiv_vec.ll | 46 ++++++++++++++
> test/CodeGen/R600/udiv.ll | 55 ++++++++++++----
> test/CodeGen/R600/udiv_vec.ll | 47 ++++++++++++++
> test/CodeGen/R600/udivrem64.ll | 82 ------------------------
> 9 files changed, 270 insertions(+), 113 deletions(-)
> create mode 100644 lib/Target/R600/AMDGPUExpandDIVMOD.cpp
> create mode 100644 test/CodeGen/R600/sdiv_vec.ll
> create mode 100644 test/CodeGen/R600/udiv_vec.ll
> delete mode 100644 test/CodeGen/R600/udivrem64.ll
>
> diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
> index ff4d6b4..e968eba 100644
> --- a/lib/Target/R600/AMDGPU.h
> +++ b/lib/Target/R600/AMDGPU.h
> @@ -50,6 +50,7 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
> extern char &SILowerI1CopiesID;
>
> // Passes common to R600 and SI
> +FunctionPass *createAMDGPUExpandDIVMODPass();
> FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
> Pass *createAMDGPUStructurizeCFGPass();
> FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
> diff --git a/lib/Target/R600/AMDGPUExpandDIVMOD.cpp b/lib/Target/R600/AMDGPUExpandDIVMOD.cpp
> new file mode 100644
> index 0000000..98d997d
> --- /dev/null
> +++ b/lib/Target/R600/AMDGPUExpandDIVMOD.cpp
> @@ -0,0 +1,107 @@
> +//===-- AMDGPUExpandDIVMOD.cpp - Expand div/mod instructions --------------===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +/// \file
> +//===----------------------------------------------------------------------===//
> +
> +#include "AMDGPU.h"
> +#include "llvm/IR/IRBuilder.h"
> +#include "llvm/IR/InstVisitor.h"
> +#include "llvm/Transforms/Utils/IntegerDivision.h"
> +
> +#include "llvm/Support/Debug.h"
> +using namespace llvm;
> +
> +namespace {
> +
> +class AMDGPUExpandDIVMOD : public FunctionPass,
> + public InstVisitor<AMDGPUExpandDIVMOD, bool> {
> +
> + static char ID;
> + std::vector<BinaryOperator *> Divs;
> + std::vector<BinaryOperator *> Rems;
> +
> +public:
> + AMDGPUExpandDIVMOD() : FunctionPass(ID) { }
> + bool doInitialization(Module &M) override;
> + bool runOnFunction(Function &F) override;
> + const char *getPassName() const override {
> + return "AMDGPU Expand div/mod";
> + }
> + bool visitInstruction(Instruction &I) { return false; }
> + bool visitSDiv(BinaryOperator &I);
> + bool visitUDiv(BinaryOperator &I);
> + bool visitSRem(BinaryOperator &I);
> + bool visitURem(BinaryOperator &I);
> +
> +};
> +
> +} // End anonymous namespace
> +
> +char AMDGPUExpandDIVMOD::ID = 0;
> +
> +bool AMDGPUExpandDIVMOD::doInitialization(Module &M) {
> + return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::runOnFunction(Function &F) {
> +
> + for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
> + BasicBlock *BB = BBI;
> + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) {
> + Instruction *I = II;
> + if (visit(*I)) {
> + BBI = F.begin();
> + break;
> + }
> + }
> + }
> +
> + return false;
> +}
> +
> +static bool shouldExpandDivMod(const BinaryOperator &I) {
> + return I.getType()->getScalarType() == Type::getInt64Ty(I.getContext());
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitSDiv(BinaryOperator &I) {
> + if (shouldExpandDivMod(I)) {
> + expandDivision(&I);
> + return true;
> + }
> + return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitUDiv(BinaryOperator &I) {
> + if (shouldExpandDivMod(I)) {
> + expandDivision(&I);
> + return true;
> + }
> + return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitSRem(BinaryOperator &I) {
> + if (shouldExpandDivMod(I)) {
> + expandRemainder(&I);
> + return true;
> + }
> + return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitURem(BinaryOperator &I) {
> + if (shouldExpandDivMod(I)) {
> + expandRemainder(&I);
> + return true;
> + }
> + return false;
> +}
> +
> +FunctionPass *llvm::createAMDGPUExpandDIVMODPass() {
> + return new AMDGPUExpandDIVMOD();
> +}
> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> index c95a941..1a95d86 100644
> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> @@ -119,6 +119,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
> bool
> AMDGPUPassConfig::addPreISel() {
> const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
> + addPass(createAMDGPUExpandDIVMODPass());
> addPass(createFlattenCFGPass());
> if (ST.IsIRStructurizerEnabled())
> addPass(createStructurizeCFGPass());
> diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
> index c5f4680..c94b4e3 100644
> --- a/lib/Target/R600/CMakeLists.txt
> +++ b/lib/Target/R600/CMakeLists.txt
> @@ -14,6 +14,7 @@ add_public_tablegen_target(AMDGPUCommonTableGen)
> add_llvm_target(R600CodeGen
> AMDILCFGStructurizer.cpp
> AMDGPUAsmPrinter.cpp
> + AMDGPUExpandDIVMOD.cpp
> AMDGPUFrameLowering.cpp
> AMDGPUIntrinsicInfo.cpp
> AMDGPUISelDAGToDAG.cpp
> diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
> index e922d5c..3d74e90 100644
> --- a/test/CodeGen/R600/sdiv.ll
> +++ b/test/CodeGen/R600/sdiv.ll
> @@ -81,23 +81,30 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
> ret void
> }
>
> -; Tests for 64-bit divide bypass.
> -; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> -; %result = sdiv i64 %a, %b
> -; store i64 %result, i64 addrspace(1)* %out, align 8
> -; ret void
> -; }
> +; For the 64-bit division, just make sure we don't crash with a 'cannot select'
> +; error.
> +; FUNC-LABEL: @test_get_quotient
> +; SI:S_ENDPGM
Missing space after the ':', and the same goes for the rest of these
tests.
> +define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> + %result = sdiv i64 %a, %b
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
>
> -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> -; %result = srem i64 %a, %b
> -; store i64 %result, i64 addrspace(1)* %out, align 8
> -; ret void
> -; }
> +; FUNC-LABEL: @test_get_remainder
> +; SI:S_ENDPGM
> +define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> + %result = srem i64 %a, %b
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
>
> -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> -; %resultdiv = sdiv i64 %a, %b
> -; %resultrem = srem i64 %a, %b
> -; %result = add i64 %resultdiv, %resultrem
> -; store i64 %result, i64 addrspace(1)* %out, align 8
> -; ret void
> -; }
> +; FUNC-LABEL: @test_get_quotient_and_remainder
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> + %resultdiv = sdiv i64 %a, %b
> + %resultrem = srem i64 %a, %b
> + %result = add i64 %resultdiv, %resultrem
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> diff --git a/test/CodeGen/R600/sdiv_vec.ll b/test/CodeGen/R600/sdiv_vec.ll
> new file mode 100644
> index 0000000..4e8ace4
> --- /dev/null
> +++ b/test/CodeGen/R600/sdiv_vec.ll
> @@ -0,0 +1,46 @@
> +;FIXME: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
> +
> +; FIXME: i64 vector kernel args don't work on Evergreen/NI.
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v2
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v2(<2 x i64> addrspace(1)* %out, <2 x i64> %a, <2 x i64> %b) nounwind {
> + %resultdiv = sdiv <2 x i64> %a, %b
> + %resultrem = srem <2 x i64> %a, %b
> + %result = add <2 x i64> %resultdiv, %resultrem
> + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v4
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v4(<4 x i64> addrspace(1)* %out, <4 x i64> %a, <4 x i64> %b) nounwind {
> + %resultdiv = sdiv <4 x i64> %a, %b
> + %resultrem = srem <4 x i64> %a, %b
> + %result = add <4 x i64> %resultdiv, %resultrem
> + store <4 x i64> %result, <4 x i64> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v8
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v8( <8 x i64> addrspace(1)* %out, <8 x i64> %a, <8 x i64> %b) nounwind {
> + %resultdiv = sdiv <8 x i64> %a, %b
> + %resultrem = srem <8 x i64> %a, %b
> + %result = add <8 x i64> %resultdiv, %resultrem
> + store <8 x i64> %result, <8 x i64> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FIXME: The v16 case causes machine verifier errors. I think this is related
> +; to register spilling.
> +; FIXME-FUNC-LABEL: @test_get_quotient_and_remainder_v16
> +; FIXME-SI:S_ENDPGM
> +;define void @test_get_quotient_and_remainder_v16(<16 x i64> addrspace(1)* %out, <16 x i64> %a, <16 x i64> %b) nounwind {
> +; %resultdiv = sdiv <16 x i64> %a, %b
> +; %resultrem = srem <16 x i64> %a, %b
> +; %result = add <16 x i64> %resultdiv, %resultrem
> +; store <16 x i64> %result, <16 x i64> addrspace(1)* %out, align 8
> +; ret void
> +;}
> diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
> index 5371321..b483e76 100644
> --- a/test/CodeGen/R600/udiv.ll
> +++ b/test/CodeGen/R600/udiv.ll
> @@ -1,9 +1,9 @@
> -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
> -;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
> +;FIXME: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
>
> -;EG-CHECK-LABEL: @test
> -;EG-CHECK-NOT: SETGE_INT
> -;EG-CHECK: CF_END
> +;EG-LABEL: @test
> +;EG-NOT: SETGE_INT
> +;EG: CF_END
>
> define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
> %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
> @@ -18,10 +18,10 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
> ;The goal of this test is to make sure the ISel doesn't fail when it gets
> ;a v4i32 udiv
>
> -;EG-CHECK-LABEL: @test2
> -;EG-CHECK: CF_END
> -;SI-CHECK-LABEL: @test2
> -;SI-CHECK: S_ENDPGM
> +;EG-LABEL: @test2
> +;EG: CF_END
> +;SI-LABEL: @test2
> +;SI: S_ENDPGM
>
> define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
> %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
> @@ -32,10 +32,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
> ret void
> }
>
> -;EG-CHECK-LABEL: @test4
> -;EG-CHECK: CF_END
> -;SI-CHECK-LABEL: @test4
> -;SI-CHECK: S_ENDPGM
> +;EG-LABEL: @test4
> +;EG: CF_END
> +;SI-LABEL: @test4
> +;SI: S_ENDPGM
>
> define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
> %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
> @@ -45,3 +45,32 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
> store <4 x i32> %result, <4 x i32> addrspace(1)* %out
> ret void
> }
> +
> +; For the 64-bit division, just make sure we don't crash with a 'cannot select'
> +; error.
> +; FUNC-LABEL: @test_get_quotient
> +; SI: S_ENDPGM
> +define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> + %result = udiv i64 %a, %b
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FIXME: The AMDILCFGStructurizer crashes on this function for redwood.
> +; FUNC-LABEL: @test_get_remainder
> +; SI: S_ENDPGM
> +define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> + %result = urem i64 %a, %b
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder
> +; SI: S_ENDPGM
> +define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> + %resultdiv = udiv i64 %a, %b
> + %resultrem = urem i64 %a, %b
> + %result = add i64 %resultdiv, %resultrem
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> diff --git a/test/CodeGen/R600/udiv_vec.ll b/test/CodeGen/R600/udiv_vec.ll
> new file mode 100644
> index 0000000..942c323
> --- /dev/null
> +++ b/test/CodeGen/R600/udiv_vec.ll
> @@ -0,0 +1,47 @@
> +;FIXME: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
> +
> +; FIXME: i64 vector kernel args don't work on Evergreen/NI.
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v2
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v2(<2 x i64> addrspace(1)* %out, <2 x i64> %a, <2 x i64> %b) nounwind {
> + %resultdiv = udiv <2 x i64> %a, %b
> + %resultrem = urem <2 x i64> %a, %b
> + %result = add <2 x i64> %resultdiv, %resultrem
> + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v4
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v4(<4 x i64> addrspace(1)* %out, <4 x i64> %a, <4 x i64> %b) nounwind {
> + %resultdiv = udiv <4 x i64> %a, %b
> + %resultrem = urem <4 x i64> %a, %b
> + %result = add <4 x i64> %resultdiv, %resultrem
> + store <4 x i64> %result, <4 x i64> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v8
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v8( <8 x i64> addrspace(1)* %out, <8 x i64> %a, <8 x i64> %b) nounwind {
> + %resultdiv = udiv <8 x i64> %a, %b
> + %resultrem = urem <8 x i64> %a, %b
> + %result = add <8 x i64> %resultdiv, %resultrem
> + store <8 x i64> %result, <8 x i64> addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FIXME: The v16 case causes machine verifier errors. I think this is related
> +; to register spilling.
> +
> +; FIXME-FUNC-LABEL: @test_get_quotient_and_remainder_v16
> +; FIXME-SI:S_ENDPGM
> +;define void @test_get_quotient_and_remainder_v16(<16 x i64> addrspace(1)* %out, <16 x i64> %a, <16 x i64> %b) nounwind {
> +; %resultdiv = udiv <16 x i64> %a, %b
> +; %resultrem = urem <16 x i64> %a, %b
> +; %result = add <16 x i64> %resultdiv, %resultrem
> +; store <16 x i64> %result, <16 x i64> addrspace(1)* %out, align 8
> +; ret void
> +;}
> diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
> deleted file mode 100644
> index a71315a..0000000
> --- a/test/CodeGen/R600/udivrem64.ll
> +++ /dev/null
> @@ -1,82 +0,0 @@
> -;XUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
> -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> -
> -;FUNC-LABEL: @test_udiv
> -;EG: RECIP_UINT
> -;EG: LSHL {{.*}}, 1,
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;SI: S_ENDPGM
> -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
> - %result = udiv i64 %x, %y
> - store i64 %result, i64 addrspace(1)* %out
> - ret void
> -}
> -
> -;FUNC-LABEL: @test_urem
> -;EG: RECIP_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: AND_INT {{.*}}, 1,
> -;SI: S_ENDPGM
> -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
> - %result = urem i64 %x, %y
> - store i64 %result, i64 addrspace(1)* %out
> - ret void
> -}
> -- 1.8.5.5
>
> 0008-R600-Factor-i64-UDIVREM-lowering-into-its-own-fuctio.patch
>
>
> From 1fbfb38377036de2a1e32ebcda911dadea252994 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 14:59:51 -0400
> Subject: [PATCH 8/8] R600: Factor i64 UDIVREM lowering into its own function
>
> This is so it could potentially be used by SI. However, the current
> implemtation does not always produce correct results, so the
Typo "implemtation"
> AMDGPUExpandDIVMOD pass is being used instead.
Is the output from this better than what you get from the expand
divmod pass? I would expect so, since I thought the pass inserts
branching. What kind of bugs are these? Does it not work for a certain
range of values?
> ---
> lib/Target/R600/AMDGPUISelLowering.cpp | 84 ++++++++++++++++++++++++++++++++++
> lib/Target/R600/AMDGPUISelLowering.h | 2 +
> lib/Target/R600/R600ISelLowering.cpp | 1 +
> 3 files changed, 87 insertions(+)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index f353c94..c66bb7e 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -1485,11 +1485,95 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
> return DAG.getMergeValues(Res, DL);
> }
>
> +/// XXX: FIXME This function appears to have some bugs and does not always
> +/// produce correct results. It is currently superseded by the
> +/// AMDGPUExpandDIVMOD pass.
> +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
> + SelectionDAG &DAG,
> + SmallVectorImpl<SDValue> &Results) const {
> + assert(Op.getValueType() == MVT::i64);
> +
> + SDLoc DL(Op);
> + EVT VT = Op.getValueType();
> + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
> +
> + SDValue one = DAG.getConstant(1, HalfVT);
> + SDValue zero = DAG.getConstant(0, HalfVT);
> +
> + //HiLo split
> + SDValue LHS = Op.getOperand(0);
> + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
> + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
> +
> + SDValue RHS = Op.getOperand(1);
> + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
> + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
> +
> + // Get Speculative values
> + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
> + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
> +
> + SDValue REM_Hi = zero;
> + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
> +
> + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
> + SDValue DIV_Lo = zero;
> +
> + const unsigned halfBitWidth = HalfVT.getSizeInBits();
> +
> + for (unsigned i = 0; i < halfBitWidth; ++i) {
> + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
> + // Get Value of high bit
> + SDValue HBit;
> + if (halfBitWidth == 32 && Subtarget->hasBFE()) {
> + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
> + } else {
> + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
> + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
> + }
> +
> + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
> + DAG.getConstant(halfBitWidth - 1, HalfVT));
> + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
> + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
> +
> + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
> + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
> +
> +
> + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
> +
> + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
> + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
> +
> + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
> +
> + // Update REM
> +
> + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
> +
> + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
> + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
> + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
> + }
> +
> + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
> + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
> + Results.push_back(DIV);
> + Results.push_back(REM);
> +}
> +
> SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
> SelectionDAG &DAG) const {
> SDLoc DL(Op);
> EVT VT = Op.getValueType();
>
> + if (VT == MVT::i64) {
> + SmallVector<SDValue, 2> Results;
> + LowerUDIVREM64(Op, DAG, Results);
> + return DAG.getMergeValues(Results, DL);
> + }
> +
> SDValue Num = Op.getOperand(0);
> SDValue Den = Op.getOperand(1);
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
> index fc4c006..e94c333 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.h
> +++ b/lib/Target/R600/AMDGPUISelLowering.h
> @@ -83,6 +83,8 @@ protected:
> SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
> SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
> SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
> + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
> + SmallVectorImpl<SDValue> &Results) const;
> bool isHWTrueValue(SDValue Op) const;
> bool isHWFalseValue(SDValue Op) const;
>
> diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> index 3bc8cb9..04ba910 100644
> --- a/lib/Target/R600/R600ISelLowering.cpp
> +++ b/lib/Target/R600/R600ISelLowering.cpp
> @@ -901,6 +901,7 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
> }
> case ISD::UDIVREM: {
> SDValue Op = SDValue(N, 0);
> + LowerUDIVREM64(Op, DAG, Results);
> SDLoc DL(Op);
> EVT VT = Op.getValueType();
> EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
> -- 1.8.5.5
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits