PATCH: R600: 64-bit division

Matt Arsenault Matthew.Arsenault at amd.com
Fri Sep 19 11:47:13 PDT 2014


On 09/17/2014 11:21 AM, Tom Stellard wrote:
> Hi,
>
> The attached series adds a pass for lowering 64-bit division in the R600
> backend and also fixes some bugs uncovered along the way.
>
> This new pass replaces the old 64-bit div lowering used for Evergreen/NI
> subtargets, which was found to have some bugs.
>
> -Tom
>
> 0001-R600-SI-Use-ISD-MUL-instead-of-ISD-UMULO-when-loweri.patch
>
>
>  From ede048f49e8e550176c567c0bfa1bd3679189c10 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 10:35:23 -0400
> Subject: [PATCH 1/8] R600/SI: Use ISD::MUL instead of ISD::UMULO when lowering
>   division
>
> ISD::MUL and ISD:UMULO are the same except that UMULO sets an overflow
> bit.  Since we aren't using the overflow bit, we should use ISD::MUL.
> ---
>   lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index d7f12ef..293a89d 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -1510,8 +1510,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
>     // e is rounding error.
>     SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
>   
> -  // RCP_LO = umulo(RCP, Den) */
> -  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
> +  // RCP_LO = mul(RCP, Den) */
> +  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
>   
>     // RCP_HI = mulhu (RCP, Den) */
>     SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
> @@ -1542,7 +1542,7 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
>     SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
>   
>     // Num_S_Remainder = Quotient * Den
> -  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
> +  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
>   
>     // Remainder = Num - Num_S_Remainder
>     SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
> -- 1.8.5.5
>
> 0002-R600-Don-t-set-BypassSlowDiv-for-64-bit-division.patch
>
>
>  From 74a83f33e1df72b942fdbe12701cad75750f50f7 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 15 Sep 2014 12:21:35 -0400
> Subject: [PATCH 2/8] R600: Don't set BypassSlowDiv for 64-bit division
>
> BypassSlowDiv is used by codegen prepare to insert a run-time
> check to see if the operands to a 64-bit division are really 32-bit
> values and if they are it will do 32-bit division instead.
>
> This is not useful for R600, which has predicated control flow since
> both the 32-bit and 64-bit paths will be executed in most cases.  It
> also increases code size which can lead to more instruction cache
> misses.
> ---
>   lib/Target/R600/AMDGPUISelLowering.cpp | 3 ---
>   1 file changed, 3 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index 293a89d..f353c94 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -388,9 +388,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
>     setIntDivIsCheap(false);
>     setPow2SDivIsCheap(false);
>   
> -  // TODO: Investigate this when 64-bit divides are implemented.
> -  addBypassSlowDiv(64, 32);
> -
>     // FIXME: Need to really handle these.
>     MaxStoresPerMemcpy  = 4096;
>     MaxStoresPerMemmove = 4096;
> -- 1.8.5.5
>

LGTM

> 0003-R600-SI-Use-isOperandLegal-to-simplify-legalization-.patch
>
>
>  From 835d8f79d0491611566ac124fe072f8f85723cb3 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:19:11 -0400
> Subject: [PATCH 3/8] R600/SI: Use isOperandLegal() to simplify legalization of
>   VOP3 instructions
>
> ---
>   lib/Target/R600/SIInstrInfo.cpp | 27 +++------------------------
>   1 file changed, 3 insertions(+), 24 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 294aa70..1b90d41 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -1174,33 +1174,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
>     // Legalize VOP3
>     if (isVOP3(MI->getOpcode())) {
>       int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
> -    unsigned SGPRReg = AMDGPU::NoRegister;
>       for (unsigned i = 0; i < 3; ++i) {
>         int Idx = VOP3Idx[i];
> -      if (Idx == -1)
> -        continue;
> -      MachineOperand &MO = MI->getOperand(Idx);
> -
> -      if (MO.isReg()) {
> -        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
> -          continue; // VGPRs are legal
> -
> -        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
> -
> -        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
> -          SGPRReg = MO.getReg();
> -          // We can use one SGPR in each VOP3 instruction.
> -          continue;
> -        }
This looks like it checks that only one SGPR is used, but I don't think 
isOperandLegal() can verify that while looking at only one operand at a 
time. This loop could also be improved to pick the operand that requires 
the fewest moves (e.g. for "inst s0, s1, s1" I think it would end up 
finding s0 first and then inserting moves for both copies of s1).

> -      } else if (!isLiteralConstant(MO)) {
> -        // If it is not a register and not a literal constant, then it must be
> -        // an inline constant which is always legal.
> -        continue;
> -      }
> -      // If we make it this far, then the operand is not legal and we must
> -      // legalize it.
> -      legalizeOpWithMove(MI, Idx);
> +      if (Idx != -1 && !isOperandLegal(MI, Idx))
> +        legalizeOpWithMove(MI, Idx);
>       }
> +    return;
>     }
>   
>     // Legalize REG_SEQUENCE and PHI
> -- 1.8.5.5
>
> 0004-R600-SI-Remove-modifier-operands-from-V_CNDMASK_B32_.patch
>
>
>  From e3ee19a2997e7634991ee5950e8d4eb9e3448d97 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:19:50 -0400
> Subject: [PATCH 4/8] R600/SI: Remove modifier operands from V_CNDMASK_B32_e64
>
> Modifiers don't work for this instruction.
> ---
>   lib/Target/R600/SIInstructions.td   | 5 ++---
>   lib/Target/R600/SILowerI1Copies.cpp | 6 +-----
>   2 files changed, 3 insertions(+), 8 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index d27ddf3..8cbdc55 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1239,9 +1239,8 @@ def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
>   }
>   
>   def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
> -  (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
> -   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
> -  "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
> +  (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2),
> +  "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2",
>     [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
>   > {
>     let src0_modifiers = 0;
> diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
> index 1f0f24b..3ab0c2a 100644
> --- a/lib/Target/R600/SILowerI1Copies.cpp
> +++ b/lib/Target/R600/SILowerI1Copies.cpp
> @@ -127,11 +127,7 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
>                   .addOperand(MI.getOperand(0))
>                   .addImm(0)
>                   .addImm(-1)
> -                .addOperand(MI.getOperand(1))
> -                .addImm(0)
> -                .addImm(0)
> -                .addImm(0)
> -                .addImm(0);
> +                .addOperand(MI.getOperand(1));
>           MI.eraseFromParent();
>         } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
>                    SrcRC == &AMDGPU::VReg_1RegClass) {
> -- 1.8.5.5
LGTM

>
> 0005-R600-SI-Add-pattern-for-i64-ctlz_zero_undef.patch
>
>
>  From 5600b629eb48753641ffac3cc73279b42e547e46 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:18:31 -0400
> Subject: [PATCH 5/8] R600/SI: Add pattern for i64 ctlz_zero_undef
>
> ---
>   lib/Target/R600/SIInstrInfo.cpp      | 127 +++++++++++++++++++++++++++++++++--
>   lib/Target/R600/SIInstrInfo.h        |   9 +++
>   lib/Target/R600/SIInstructions.td    |  24 +++++--
>   test/CodeGen/R600/ctlz_zero_undef.ll |  33 +++++++++
>   4 files changed, 182 insertions(+), 11 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 1b90d41..ae9cbe9 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -510,7 +510,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
>       // This is just a placeholder for register allocation.
>       MI->eraseFromParent();
>       break;
> +
> +  case AMDGPU::S_CTLZ_ZERO_UNDEF_B32_B64:
> +    MI->setDesc(get(AMDGPU::S_FLBIT_I32_B64));
> +    return false;
>     }
> +
>     return true;
>   }
>   
> @@ -1556,6 +1561,19 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
>     }
>   }
>   
> +void SIInstrInfo::getUsesToMoveToVALU(unsigned Reg,
> +               const MachineRegisterInfo &MRI,
> +               SmallVectorImpl<MachineInstr *> &Worklist) const {
> +
> +  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
> +       E = MRI.use_end(); I != E; ++I) {

I think you can use a range-based for loop over MRI.use_operands(Reg) here instead.
> +    MachineInstr &UseMI = *I->getParent();
> +    if (!canReadVGPR(UseMI, I.getOperandNo())) {
> +      Worklist.push_back(&UseMI);
> +    }
> +  }
> +}
> +
>   void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
>     SmallVector<MachineInstr *, 128> Worklist;
>     Worklist.push_back(&TopInst);
> @@ -1624,6 +1642,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
>         Inst->eraseFromParent();
>         continue;
>   
> +    case AMDGPU::S_CTLZ_ZERO_UNDEF_B32_B64:
> +      splitScalar64BitFLBIT(Worklist, Inst, true);
> +      Inst->eraseFromParent();
> +      continue;
> +
> +    case AMDGPU::S_FLBIT_I32_B64:
> +      splitScalar64BitFLBIT(Worklist, Inst);
> +      Inst->eraseFromParent();
> +      continue;
> +
>       case AMDGPU::S_BFE_U64:
>       case AMDGPU::S_BFE_I64:
>       case AMDGPU::S_BFM_B64:
> @@ -1710,13 +1738,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
>       // Legalize the operands
>       legalizeOperands(Inst);
>   
> -    for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
> -           E = MRI.use_end(); I != E; ++I) {
> -      MachineInstr &UseMI = *I->getParent();
> -      if (!canReadVGPR(UseMI, I.getOperandNo())) {
> -        Worklist.push_back(&UseMI);
> -      }
> -    }
> +    getUsesToMoveToVALU(NewDstReg, MRI, Worklist);
>     }
>   }
>   
> @@ -1890,6 +1912,97 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
>     Worklist.push_back(Second);
>   }
>   
> +void SIInstrInfo::splitScalar64BitFLBIT(SmallVectorImpl<MachineInstr*> &Worklist,
> +                                        MachineInstr *Inst,
> +                                        bool IsZeroUndef) const {
> +  MachineBasicBlock &MBB = *Inst->getParent();
> +  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
> +
> +  MachineBasicBlock::iterator MII = Inst;
> +  DebugLoc DL = Inst->getDebugLoc();
> +
> +  MachineOperand &Dest = Inst->getOperand(0);
> +  MachineOperand &Src = Inst->getOperand(1);
> +
> +  const TargetRegisterClass *SrcRC = Src.isReg() ?
> +    MRI.getRegClass(Src.getReg()) :
> +    &AMDGPU::SGPR_64RegClass;
> +
> +  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
> +
> +  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> +                                                      AMDGPU::sub0, SrcSubRC);
> +  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
> +                                                      AMDGPU::sub1, SrcSubRC);
> +
> +
> +  unsigned HiResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +  unsigned LoResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> +  unsigned IsHiZeroReg= MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
> +  unsigned LHSReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +  unsigned RHSReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> +  unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +  unsigned DstFinalReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> +  // S_FLBIT_I32_B64 src0
> +  //
> +  // if (src0.hi == 0) {
> +  //   dst = V_FFBH_U32 src0.lo + 32
> +  // } else {
> +  //   dst = V_FFBH_U32 src0.hi + 0;
> +  // }
> +  //
> +  // if (src0 == 0) {
> +  //   dst = -1;
> +  // } else {
> +  //   dst = dst;
> +  // }
> +
> +  BuildMI(MBB, MII, DL, get(AMDGPU::V_FFBH_U32_e32), HiResultReg)
> +          .addReg(SrcRegSub1.getReg());
> +
> +  BuildMI(MBB, MII, DL, get(AMDGPU::V_FFBH_U32_e32), LoResultReg)
> +          .addReg(SrcRegSub0.getReg());
> +
> +  BuildMI(MBB, MII, DL, get(AMDGPU::V_CMP_EQ_U32_e64), IsHiZeroReg)
> +          .addImm(0)
> +          .addReg(SrcRegSub1.getReg());
> +
> +  BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), LHSReg)
> +          .addReg(HiResultReg)
> +          .addReg(LoResultReg)
> +          .addReg(IsHiZeroReg);
> +
> +  BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), RHSReg)
> +          .addImm(0)
> +          .addImm(32)
> +          .addReg(IsHiZeroReg);
> +
> +  BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_I32_e32), DstReg)
> +          .addReg(LHSReg)
> +          .addReg(RHSReg);
> +
> +  if (!IsZeroUndef) {
> +    unsigned IsSrcZeroReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
> +    BuildMI(MBB, MII, DL, get(AMDGPU::V_CMP_EQ_U64_e64), IsSrcZeroReg)
> +            .addImm(0)
> +            .addReg(Src.getReg());
> +
> +    BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstFinalReg)
> +            .addReg(DstReg)
> +            .addImm(-1)
> +            .addReg(IsSrcZeroReg);
> +  } else {
> +    DstFinalReg = DstReg;
> +  }
> +
> +  MRI.replaceRegWith(Dest.getReg(), DstFinalReg);
> +
> +  getUsesToMoveToVALU(DstFinalReg, MRI, Worklist);
> +}
> +
>   void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
>                                           MachineInstr *Inst) const {
>     // Add the implict and explicit register definitions.
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index a32318a..be8776d 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -53,6 +53,10 @@ private:
>     void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
>                               MachineInstr *Inst) const;
>   
> +  void splitScalar64BitFLBIT(SmallVectorImpl<MachineInstr *> &Worklist,
> +                             MachineInstr *Inst,
> +                             bool IsZeroUndef = false) const;
> +
>     void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
>   
>   public:
> @@ -182,6 +186,11 @@ public:
>   
>     void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
>   
> +  /// \brief Look at all the uses of \p Reg and add use instructions that need
> +  /// to be moved to the VALU to \p Worklist.
> +  void getUsesToMoveToVALU(unsigned Reg, const MachineRegisterInfo &MRI,
> +                           SmallVectorImpl<MachineInstr *> &Worklist) const;
> +
>     /// \brief Replace this instruction's opcode with the equivalent VALU
>     /// opcode.  This function will also move the users of \p MI to the
>     /// VALU if necessary.
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 8cbdc55..88147bf 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -119,7 +119,7 @@ def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
>     [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
>   >;
>   
> -//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
> +def S_FLBIT_I32_B64 : SOP1_32_64 <0x00000016, "S_FLBIT_I32_B64", []>;
>   def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
>   //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
>   def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
> @@ -287,6 +287,19 @@ def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
>   //def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
>   def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
>   
> +
> +let isPseudo = 1 in {
> +
> +// We select ctlz_zero_undef to this pseudo instruction rather
> +// than S_FLBIT_I32_B64, so that in the event we need to move it to
> +// the VGPR, we can produce a more optimized VALU version since we
Typo in the comment: VGPR -> VALU.
> +// know that zero inputs are undefined.
> +def S_CTLZ_ZERO_UNDEF_B32_B64 : SOP2 <0,
> +  (outs SReg_32:$dst), (ins SReg_64:$src0), "", []
> +>;
> +
> +}
> +
>   //===----------------------------------------------------------------------===//
>   // SOPC Instructions
>   //===----------------------------------------------------------------------===//
> @@ -1845,13 +1858,16 @@ def : Pat <
>   // SOP1 Patterns
>   //===----------------------------------------------------------------------===//
>   
> -def : Pat <
> -  (i64 (ctpop i64:$src)),
> +class Sop3264Pat <SDNode node, Instruction inst> : Pat <
The naming convention used elsewhere seems to be to add underscores between the numbers.
> +  (i64 (node i64:$src)),
>     (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
> -    (S_BCNT1_I32_B64 $src), sub0),
> +    (inst $src), sub0),
>       (S_MOV_B32 0), sub1)
>   >;
>   
> +def : Sop3264Pat <ctpop, S_BCNT1_I32_B64>;
> +def : Sop3264Pat <ctlz_zero_undef, S_CTLZ_ZERO_UNDEF_B32_B64>;
> +
>   //===----------------------------------------------------------------------===//
>   // SOP2 Patterns
>   //===----------------------------------------------------------------------===//
> diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
> index 1340ef9..1740bd9 100644
> --- a/test/CodeGen/R600/ctlz_zero_undef.ll
> +++ b/test/CodeGen/R600/ctlz_zero_undef.ll
> @@ -4,6 +4,9 @@
>   declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
>   declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
>   declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
> +declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +
>   
>   ; FUNC-LABEL: @s_ctlz_zero_undef_i32:
>   ; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
> @@ -68,3 +71,33 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
>     store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
>     ret void
>   }
> +
> +; FUNC-LABEL: @v_ctlz_zero_undef_i64:
> +; SI: S_FLBIT_I32_B64
> +; EG: FFBH_UINT
> +; EG: FFBH_UINT
> +define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
> +  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
> +  store i64 %ctlz, i64 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @v_ctlz_zero_undef_i64_vgpr:
> +; SI-DAG: V_FFBH_U32_e32
> +; SI-DAG: V_FFBH_U32_e32
> +; SI-DAG: V_CMP_EQ_U32_e64
> +; SI: V_CNDMASK_B32_e64
> +; SI: V_CNDMASK_B32_e64
> +; SI: V_ADD_I32_e32
> +; SI-NOT: V_CNDMASK_B32_e64
> +; SI: S_ENDPGM
> +; EG: FFBH_UINT
> +; EG: FFBH_UINT
> +define void @v_ctlz_zero_undef_i64_vgpr(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
> +  %tidig = call i32 @llvm.r600.read.tidig.x()
> +  %zext = zext i32 %tidig to i64
> +  %input = add i64 %val, %zext
> +  %ctlz = call i64 @llvm.ctlz.i64(i64 %input, i1 true) nounwind readnone
> +  store i64 %ctlz, i64 addrspace(1)* %out, align 4
> +  ret void
> +}
> -- 1.8.5.5
>
> 0006-IntegerDivision-Handle-vectors-in-expandDivision-and.patch
>
>
>  From 6956e902b0aef46d6c71a6b58995aa22d62acd1f Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 14:23:42 -0400
> Subject: [PATCH 6/8] IntegerDivision: Handle vectors in expandDivision() and
>   expandRemainder()
>
> This will be used and tested in a future commit to the R600 backend.
> ---
>   lib/Transforms/Utils/IntegerDivision.cpp | 45 +++++++++++++++++++++++++++++---
>   1 file changed, 41 insertions(+), 4 deletions(-)
>
> diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
> index 9f91eeb..66133f6 100644
> --- a/lib/Transforms/Utils/IntegerDivision.cpp
> +++ b/lib/Transforms/Utils/IntegerDivision.cpp
> @@ -366,6 +366,31 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
>     return Q_5;
>   }
>   
> +static void splitVector(BinaryOperator *I,
> +                        SmallVectorImpl<BinaryOperator*> &Scalars) {
> +
> +  Type *Ty = I->getType();
> +  unsigned NumElements = Ty->getVectorNumElements();
> +
> +  IRBuilder<> Builder(I);
> +
> +  Value *Op0 = I->getOperand(0);
> +  Value *Op1 = I->getOperand(1);
> +  Type *I32Ty = Type::getInt32Ty(I->getContext());
> +  Value *Vec = UndefValue::get(Ty);
> +  for (unsigned i = 0, e = NumElements; i != e; ++i) {
> +    Value *Idx = Constant::getIntegerValue(I32Ty, APInt(32, i));
ConstantInt::get(I32Ty, i) is shorter
> +    Value *LHS = Builder.CreateExtractElement(Op0, Idx);
> +    Value *RHS = Builder.CreateExtractElement(Op1, Idx);
> +    Value *Scalar = Builder.CreateBinOp(I->getOpcode(), LHS, RHS);
> +    Vec = Builder.CreateInsertElement(Vec, Scalar, Idx);
> +    Scalars.push_back(cast<BinaryOperator>(Scalar));
> +  }
> +  I->replaceAllUsesWith(Vec);
> +  I->dropAllReferences();
> +  I->eraseFromParent();
> +}
> +
>   /// Generate code to calculate the remainder of two integers, replacing Rem with
>   /// the generated code. This currently generates code using the udiv expansion,
>   /// but future work includes generating more specialized code, e.g. when more
> @@ -381,8 +406,14 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {
>     IRBuilder<> Builder(Rem);
>   
>     Type *RemTy = Rem->getType();
> -  if (RemTy->isVectorTy())
> -    llvm_unreachable("Div over vectors not supported");
> +  if (RemTy->isVectorTy()) {
> +    SmallVector<BinaryOperator*, 8> Scalars;
> +    splitVector(Rem, Scalars);
> +    for (BinaryOperator *ScalarRem : Scalars) {
> +      expandRemainder(ScalarRem);
> +    }
> +    return true;
> +  }
>   
>     unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
>   
> @@ -439,8 +470,14 @@ bool llvm::expandDivision(BinaryOperator *Div) {
>     IRBuilder<> Builder(Div);
>   
>     Type *DivTy = Div->getType();
> -  if (DivTy->isVectorTy())
> -    llvm_unreachable("Div over vectors not supported");
> +  if (DivTy->isVectorTy()) {
> +    SmallVector<BinaryOperator*, 8> Scalars;
> +    splitVector(Div, Scalars);
> +    for (BinaryOperator *ScalarDiv : Scalars) {
> +      expandDivision(ScalarDiv);
> +    }
> +    return true;
> +  }
>   
>     unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
>   
> -- 1.8.5.5
>
> 0007-R600-Add-a-pass-for-expanding-64-bit-division.patch
>
>
>  From 742ec95149d7c7d00438b80254b81ea0b0c1613f Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 09:21:56 -0400
> Subject: [PATCH 7/8] R600: Add a pass for expanding 64-bit division
Should this just be moved to the generic IR passes? The other utilities 
already have a corresponding pass version; it's odd that IntegerDivision 
doesn't, and there's nothing really AMDGPU-specific here (other than a 
new target-provided check for whether a type should be expanded).

>
> ---
>   lib/Target/R600/AMDGPU.h                |   1 +
>   lib/Target/R600/AMDGPUExpandDIVMOD.cpp  | 107 ++++++++++++++++++++++++++++++++
>   lib/Target/R600/AMDGPUTargetMachine.cpp |   1 +
>   lib/Target/R600/CMakeLists.txt          |   1 +
>   test/CodeGen/R600/sdiv.ll               |  43 +++++++------
>   test/CodeGen/R600/sdiv_vec.ll           |  46 ++++++++++++++
>   test/CodeGen/R600/udiv.ll               |  55 ++++++++++++----
>   test/CodeGen/R600/udiv_vec.ll           |  47 ++++++++++++++
>   test/CodeGen/R600/udivrem64.ll          |  82 ------------------------
>   9 files changed, 270 insertions(+), 113 deletions(-)
>   create mode 100644 lib/Target/R600/AMDGPUExpandDIVMOD.cpp
>   create mode 100644 test/CodeGen/R600/sdiv_vec.ll
>   create mode 100644 test/CodeGen/R600/udiv_vec.ll
>   delete mode 100644 test/CodeGen/R600/udivrem64.ll
>
> diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
> index ff4d6b4..e968eba 100644
> --- a/lib/Target/R600/AMDGPU.h
> +++ b/lib/Target/R600/AMDGPU.h
> @@ -50,6 +50,7 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
>   extern char &SILowerI1CopiesID;
>   
>   // Passes common to R600 and SI
> +FunctionPass *createAMDGPUExpandDIVMODPass();
>   FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
>   Pass *createAMDGPUStructurizeCFGPass();
>   FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
> diff --git a/lib/Target/R600/AMDGPUExpandDIVMOD.cpp b/lib/Target/R600/AMDGPUExpandDIVMOD.cpp
> new file mode 100644
> index 0000000..98d997d
> --- /dev/null
> +++ b/lib/Target/R600/AMDGPUExpandDIVMOD.cpp
> @@ -0,0 +1,107 @@
> +//===-- AMDGPUExpandDIVMOD.cpp - Expand div/mod instructions --------------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +/// \file
> +//===----------------------------------------------------------------------===//
> +
> +#include "AMDGPU.h"
> +#include "llvm/IR/IRBuilder.h"
> +#include "llvm/IR/InstVisitor.h"
> +#include "llvm/Transforms/Utils/IntegerDivision.h"
> +
> +#include "llvm/Support/Debug.h"
> +using namespace llvm;
> +
> +namespace {
> +
> +class AMDGPUExpandDIVMOD : public FunctionPass,
> +                           public InstVisitor<AMDGPUExpandDIVMOD, bool> {
> +
> +  static char ID;
> +  std::vector<BinaryOperator *> Divs;
> +  std::vector<BinaryOperator *> Rems;
> +
> +public:
> +  AMDGPUExpandDIVMOD() : FunctionPass(ID) { }
> +  bool doInitialization(Module &M) override;
> +  bool runOnFunction(Function &F) override;
> +  const char *getPassName() const override {
> +    return "AMDGPU Expand div/mod";
> +  }
> +  bool visitInstruction(Instruction &I) { return false; }
> +  bool visitSDiv(BinaryOperator &I);
> +  bool visitUDiv(BinaryOperator &I);
> +  bool visitSRem(BinaryOperator &I);
> +  bool visitURem(BinaryOperator &I);
> +
> +};
> +
> +} // End anonymous namespace
> +
> +char AMDGPUExpandDIVMOD::ID = 0;
> +
> +bool AMDGPUExpandDIVMOD::doInitialization(Module &M) {
> +  return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::runOnFunction(Function &F) {
> +
> +  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
> +       BasicBlock *BB = BBI;
> +    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) {
> +        Instruction *I = II;
> +        if (visit(*I)) {
> +          BBI = F.begin();
> +          break;
> +        }
> +    }
> +  }
> +
> +  return false;
> +}
> +
> +static bool shouldExpandDivMod(const BinaryOperator &I) {
> +  return I.getType()->getScalarType() == Type::getInt64Ty(I.getContext());
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitSDiv(BinaryOperator &I) {
> +  if (shouldExpandDivMod(I)) {
> +    expandDivision(&I);
> +    return true;
> +  }
> +  return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitUDiv(BinaryOperator &I) {
> +  if (shouldExpandDivMod(I)) {
> +    expandDivision(&I);
> +    return true;
> +  }
> +  return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitSRem(BinaryOperator &I) {
> +  if (shouldExpandDivMod(I)) {
> +    expandRemainder(&I);
> +    return true;
> +  }
> +  return false;
> +}
> +
> +bool AMDGPUExpandDIVMOD::visitURem(BinaryOperator &I) {
> +  if (shouldExpandDivMod(I)) {
> +    expandRemainder(&I);
> +    return true;
> +  }
> +  return false;
> +}
> +
> +FunctionPass *llvm::createAMDGPUExpandDIVMODPass() {
> +  return new AMDGPUExpandDIVMOD();
> +}
> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> index c95a941..1a95d86 100644
> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> @@ -119,6 +119,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
>   bool
>   AMDGPUPassConfig::addPreISel() {
>     const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
> +  addPass(createAMDGPUExpandDIVMODPass());
>     addPass(createFlattenCFGPass());
>     if (ST.IsIRStructurizerEnabled())
>       addPass(createStructurizeCFGPass());
> diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
> index c5f4680..c94b4e3 100644
> --- a/lib/Target/R600/CMakeLists.txt
> +++ b/lib/Target/R600/CMakeLists.txt
> @@ -14,6 +14,7 @@ add_public_tablegen_target(AMDGPUCommonTableGen)
>   add_llvm_target(R600CodeGen
>     AMDILCFGStructurizer.cpp
>     AMDGPUAsmPrinter.cpp
> +  AMDGPUExpandDIVMOD.cpp
>     AMDGPUFrameLowering.cpp
>     AMDGPUIntrinsicInfo.cpp
>     AMDGPUISelDAGToDAG.cpp
> diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
> index e922d5c..3d74e90 100644
> --- a/test/CodeGen/R600/sdiv.ll
> +++ b/test/CodeGen/R600/sdiv.ll
> @@ -81,23 +81,30 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
>     ret void
>   }
>   
> -; Tests for 64-bit divide bypass.
> -; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> -;   %result = sdiv i64 %a, %b
> -;   store i64 %result, i64 addrspace(1)* %out, align 8
> -;   ret void
> -; }
> +; For the 64-bit division, just make sure we don't crash with a 'cannot select'
> +; error.
> +; FUNC-LABEL: @test_get_quotient
> +; SI:S_ENDPGM
Missing space after the ':', and the same applies to the rest of these tests.
> +define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> +  %result = sdiv i64 %a, %b
> +  store i64 %result, i64 addrspace(1)* %out, align 8
> +  ret void
> +}
>   
> -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> -;   %result = srem i64 %a, %b
> -;   store i64 %result, i64 addrspace(1)* %out, align 8
> -;   ret void
> -; }
> +; FUNC-LABEL: @test_get_remainder
> +; SI:S_ENDPGM
> +define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> +  %result = srem i64 %a, %b
> +  store i64 %result, i64 addrspace(1)* %out, align 8
> +  ret void
> +}
>   
> -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> -;   %resultdiv = sdiv i64 %a, %b
> -;   %resultrem = srem i64 %a, %b
> -;   %result = add i64 %resultdiv, %resultrem
> -;   store i64 %result, i64 addrspace(1)* %out, align 8
> -;   ret void
> -; }
> +; FUNC-LABEL: @test_get_quotient_and_remainder
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> +  %resultdiv = sdiv i64 %a, %b
> +  %resultrem = srem i64 %a, %b
> +  %result = add i64 %resultdiv, %resultrem
> +  store i64 %result, i64 addrspace(1)* %out, align 8
> +  ret void
> +}
> diff --git a/test/CodeGen/R600/sdiv_vec.ll b/test/CodeGen/R600/sdiv_vec.ll
> new file mode 100644
> index 0000000..4e8ace4
> --- /dev/null
> +++ b/test/CodeGen/R600/sdiv_vec.ll
> @@ -0,0 +1,46 @@
> +;FIXME: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
> +
> +; FIXME: i64 vector kernel args don't work on Evergreen/NI.
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v2
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v2(<2 x i64> addrspace(1)* %out, <2 x i64> %a, <2 x i64> %b) nounwind {
> +  %resultdiv = sdiv <2 x i64> %a, %b
> +  %resultrem = srem <2 x i64> %a, %b
> +  %result = add <2 x i64> %resultdiv, %resultrem
> +  store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v4
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v4(<4 x i64> addrspace(1)* %out, <4 x i64> %a, <4 x i64> %b) nounwind {
> +  %resultdiv = sdiv <4 x i64> %a, %b
> +  %resultrem = srem <4 x i64> %a, %b
> +  %result = add <4 x i64> %resultdiv, %resultrem
> +  store <4 x i64> %result, <4 x i64> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v8
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v8( <8 x i64> addrspace(1)* %out, <8 x i64> %a, <8 x i64> %b) nounwind {
> +  %resultdiv = sdiv <8 x i64> %a, %b
> +  %resultrem = srem <8 x i64> %a, %b
> +  %result = add <8 x i64> %resultdiv, %resultrem
> +  store <8 x i64> %result, <8 x i64> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FIXME: The v16 case causes machine verifier errors.  I think this is related
> +; to register spilling.
> +; FIXME-FUNC-LABEL: @test_get_quotient_and_remainder_v16
> +; FIXME-SI:S_ENDPGM
> +;define void @test_get_quotient_and_remainder_v16(<16 x i64> addrspace(1)* %out, <16 x i64> %a, <16 x i64> %b) nounwind {
> +;  %resultdiv = sdiv <16 x i64> %a, %b
> +;  %resultrem = srem <16 x i64> %a, %b
> +;  %result = add <16 x i64> %resultdiv, %resultrem
> +;  store <16 x i64> %result, <16 x i64> addrspace(1)* %out, align 8
> +;  ret void
> +;}
> diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
> index 5371321..b483e76 100644
> --- a/test/CodeGen/R600/udiv.ll
> +++ b/test/CodeGen/R600/udiv.ll
> @@ -1,9 +1,9 @@
> -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
> -;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
> +;FIXME: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
>   
> -;EG-CHECK-LABEL: @test
> -;EG-CHECK-NOT: SETGE_INT
> -;EG-CHECK: CF_END
> +;EG-LABEL: @test
> +;EG-NOT: SETGE_INT
> +;EG: CF_END
>   
>   define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
>     %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
> @@ -18,10 +18,10 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
>   ;The goal of this test is to make sure the ISel doesn't fail when it gets
>   ;a v4i32 udiv
>   
> -;EG-CHECK-LABEL: @test2
> -;EG-CHECK: CF_END
> -;SI-CHECK-LABEL: @test2
> -;SI-CHECK: S_ENDPGM
> +;EG-LABEL: @test2
> +;EG: CF_END
> +;SI-LABEL: @test2
> +;SI: S_ENDPGM
>   
>   define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
>     %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
> @@ -32,10 +32,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
>     ret void
>   }
>   
> -;EG-CHECK-LABEL: @test4
> -;EG-CHECK: CF_END
> -;SI-CHECK-LABEL: @test4
> -;SI-CHECK: S_ENDPGM
> +;EG-LABEL: @test4
> +;EG: CF_END
> +;SI-LABEL: @test4
> +;SI: S_ENDPGM
>   
>   define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
>     %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
> @@ -45,3 +45,32 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
>     store <4 x i32> %result, <4 x i32> addrspace(1)* %out
>     ret void
>   }
> +
> +; For the 64-bit division, just make sure we don't crash with a 'cannot select'
> +; error.
> +; FUNC-LABEL: @test_get_quotient
> +; SI: S_ENDPGM
> +define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> +  %result = udiv i64 %a, %b
> +  store i64 %result, i64 addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FIXME: The AMDILCFGStructurizer crashes on this function for redwood.
> +; FUNC-LABEL: @test_get_remainder
> +; SI: S_ENDPGM
> +define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> +  %result = urem i64 %a, %b
> +  store i64 %result, i64 addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder
> +; SI: S_ENDPGM
> +define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> +  %resultdiv = udiv i64 %a, %b
> +  %resultrem = urem i64 %a, %b
> +  %result = add i64 %resultdiv, %resultrem
> +  store i64 %result, i64 addrspace(1)* %out, align 8
> +  ret void
> +}
> diff --git a/test/CodeGen/R600/udiv_vec.ll b/test/CodeGen/R600/udiv_vec.ll
> new file mode 100644
> index 0000000..942c323
> --- /dev/null
> +++ b/test/CodeGen/R600/udiv_vec.ll
> @@ -0,0 +1,47 @@
> +;FIXME: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
> +
> +; FIXME: i64 vector kernel args don't work on Evergreen/NI.
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v2
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v2(<2 x i64> addrspace(1)* %out, <2 x i64> %a, <2 x i64> %b) nounwind {
> +  %resultdiv = udiv <2 x i64> %a, %b
> +  %resultrem = urem <2 x i64> %a, %b
> +  %result = add <2 x i64> %resultdiv, %resultrem
> +  store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v4
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v4(<4 x i64> addrspace(1)* %out, <4 x i64> %a, <4 x i64> %b) nounwind {
> +  %resultdiv = udiv <4 x i64> %a, %b
> +  %resultrem = urem <4 x i64> %a, %b
> +  %result = add <4 x i64> %resultdiv, %resultrem
> +  store <4 x i64> %result, <4 x i64> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_get_quotient_and_remainder_v8
> +; SI:S_ENDPGM
> +define void @test_get_quotient_and_remainder_v8( <8 x i64> addrspace(1)* %out, <8 x i64> %a, <8 x i64> %b) nounwind {
> +  %resultdiv = udiv <8 x i64> %a, %b
> +  %resultrem = urem <8 x i64> %a, %b
> +  %result = add <8 x i64> %resultdiv, %resultrem
> +  store <8 x i64> %result, <8 x i64> addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FIXME: The v16 case causes machine verifier errors.  I think this is related
> +; to register spilling.
> +
> +; FIXME-FUNC-LABEL: @test_get_quotient_and_remainder_v16
> +; FIXME-SI:S_ENDPGM
> +;define void @test_get_quotient_and_remainder_v16(<16 x i64> addrspace(1)* %out, <16 x i64> %a, <16 x i64> %b) nounwind {
> +;  %resultdiv = udiv <16 x i64> %a, %b
> +;  %resultrem = urem <16 x i64> %a, %b
> +;  %result = add <16 x i64> %resultdiv, %resultrem
> +;  store <16 x i64> %result, <16 x i64> addrspace(1)* %out, align 8
> +;  ret void
> +;}
> diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
> deleted file mode 100644
> index a71315a..0000000
> --- a/test/CodeGen/R600/udivrem64.ll
> +++ /dev/null
> @@ -1,82 +0,0 @@
> -;XUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
> -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> -
> -;FUNC-LABEL: @test_udiv
> -;EG: RECIP_UINT
> -;EG: LSHL {{.*}}, 1,
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;SI: S_ENDPGM
> -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
> -  %result = udiv i64 %x, %y
> -  store i64 %result, i64 addrspace(1)* %out
> -  ret void
> -}
> -
> -;FUNC-LABEL: @test_urem
> -;EG: RECIP_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: BFE_UINT
> -;EG: AND_INT {{.*}}, 1,
> -;SI: S_ENDPGM
> -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
> -  %result = urem i64 %x, %y
> -  store i64 %result, i64 addrspace(1)* %out
> -  ret void
> -}
> -- 1.8.5.5
>
> 0008-R600-Factor-i64-UDIVREM-lowering-into-its-own-fuctio.patch
>
>
>  From 1fbfb38377036de2a1e32ebcda911dadea252994 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 16 Sep 2014 14:59:51 -0400
> Subject: [PATCH 8/8] R600: Factor i64 UDIVREM lowering into its own fuction
>
> This is so it could potentially be used by SI.  Howerver, the current
> implemtation does not always produce correct results, so the
Typo: "implemtation" should be "implementation".
> AMDGPUExpandDIVMOD pass is being used instead.
Is the output from this better than what you get from the expand divmod 
pass? I would expect so since I thought the pass inserts branching.

What kind of bugs? Does it not work for a certain range of values?

> ---
>   lib/Target/R600/AMDGPUISelLowering.cpp | 84 ++++++++++++++++++++++++++++++++++
>   lib/Target/R600/AMDGPUISelLowering.h   |  2 +
>   lib/Target/R600/R600ISelLowering.cpp   |  1 +
>   3 files changed, 87 insertions(+)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index f353c94..c66bb7e 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -1485,11 +1485,95 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
>     return DAG.getMergeValues(Res, DL);
>   }
>   
> +/// XXX: FIXME This function appears to have some bugs and does not always
> +/// produce correct results.  It is currently superseded by the
> +/// AMDGPUExpandDIVREM pass.
> +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
> +                                      SelectionDAG &DAG,
> +                                      SmallVectorImpl<SDValue> &Results) const {
> +  assert(Op.getValueType() == MVT::i64);
> +
> +  SDLoc DL(Op);
> +  EVT VT = Op.getValueType();
> +  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
> +
> +  SDValue one = DAG.getConstant(1, HalfVT);
> +  SDValue zero = DAG.getConstant(0, HalfVT);
> +
> +  //HiLo split
> +  SDValue LHS = Op.getOperand(0);
> +  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
> +  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
> +
> +  SDValue RHS = Op.getOperand(1);
> +  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
> +  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
> +
> +  // Get Speculative values
> +  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
> +  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
> +
> +  SDValue REM_Hi = zero;
> +  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
> +
> +  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
> +  SDValue DIV_Lo = zero;
> +
> +  const unsigned halfBitWidth = HalfVT.getSizeInBits();
> +
> +  for (unsigned i = 0; i < halfBitWidth; ++i) {
> +    SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
> +    // Get Value of high bit
> +    SDValue HBit;
> +    if (halfBitWidth == 32 && Subtarget->hasBFE()) {
> +      HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
> +    } else {
> +      HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
> +      HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
> +    }
> +
> +    SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
> +      DAG.getConstant(halfBitWidth - 1, HalfVT));
> +    REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
> +    REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
> +
> +    REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
> +    REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
> +
> +
> +    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
> +
> +    SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
> +    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
> +
> +    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
> +
> +    // Update REM
> +
> +    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
> +
> +    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
> +    REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
> +    REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
> +  }
> +
> +  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
> +  SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
> +  Results.push_back(DIV);
> +  Results.push_back(REM);
> +}
> +
>   SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
>                                              SelectionDAG &DAG) const {
>     SDLoc DL(Op);
>     EVT VT = Op.getValueType();
>   
> +  if (VT == MVT::i64) {
> +    SmallVector<SDValue, 2> Results;
> +    LowerUDIVREM64(Op, DAG, Results);
> +    return DAG.getMergeValues(Results, DL);
> +  }
> +
>     SDValue Num = Op.getOperand(0);
>     SDValue Den = Op.getOperand(1);
>   
> diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
> index fc4c006..e94c333 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.h
> +++ b/lib/Target/R600/AMDGPUISelLowering.h
> @@ -83,6 +83,8 @@ protected:
>     SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
> +  void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
> +                                    SmallVectorImpl<SDValue> &Results) const;
>     bool isHWTrueValue(SDValue Op) const;
>     bool isHWFalseValue(SDValue Op) const;
>   
> diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> index 3bc8cb9..04ba910 100644
> --- a/lib/Target/R600/R600ISelLowering.cpp
> +++ b/lib/Target/R600/R600ISelLowering.cpp
> @@ -901,6 +901,7 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
>     }
>     case ISD::UDIVREM: {
>       SDValue Op = SDValue(N, 0);
> +    LowerUDIVREM64(Op, DAG, Results);
>       SDLoc DL(Op);
>       EVT VT = Op.getValueType();
>       EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
> -- 1.8.5.5
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140919/95933f1a/attachment.html>


More information about the llvm-commits mailing list