R600: Implement Bottom Up scheduling

Tom Stellard tom at stellard.net
Fri May 17 07:43:34 PDT 2013


Hi Vincent,

The updated patches look good:

Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

On Fri, May 17, 2013 at 07:03:40AM -0700, Vincent Lejeune wrote:
> 
> 
> 
> 
> > ----- Original Message -----
> > > From: Tom Stellard <tom at stellard.net>
> > > To: Vincent Lejeune <vljn at ovi.com>
> > > Cc: "llvm-commits at cs.uiuc.edu" <llvm-commits at cs.uiuc.edu>
> > > Sent: Thursday, May 16, 2013, 7:05 PM
> > > Subject: Re: R600: Implement Bottom Up scheduling
> > 
> > On Tue, May 14, 2013 at 04:14:02PM -0700, Vincent Lejeune wrote:
> >>  Hi,
> >> 
> >>  This series implements bottom-up scheduling, which is less register
> >>  hungry in most situations and improves packetization.
> >> 
> >>  The new heuristic, however, performs badly when there are a lot of vector
> >>  operations, and thus crashes in Unigine when the Ambient Occlusion shader
> >>  is compiled: that shader indeed uses a lot of dot4 operations.
> >>  The purpose of the first patch is to avoid such situations.
> >>  I still need to improve the handling of copies from physical registers
> >>  (they currently translate to a MOV) in a future patch.
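> >> 
> >>  To make the direction switch concrete: the generic machine scheduler asks
> >>  the strategy which end to schedule from through pickNode()'s IsTopNode
> >>  flag, and feeds ready nodes through releaseBottomNode(). A minimal purely
> >>  bottom-up strategy would look roughly like the sketch below (illustrative
> >>  only, not the code in these patches):
> >> 
> >>    class BottomUpStrategy : public MachineSchedStrategy {
> >>      // SUnits whose successors have all been scheduled already.
> >>      std::vector<SUnit *> Ready;
> >>    public:
> >>      virtual void initialize(ScheduleDAGMI *DAG) { Ready.clear(); }
> >>      virtual SUnit *pickNode(bool &IsTopNode) {
> >>        IsTopNode = false; // place the picked node at the region's bottom
> >>        if (Ready.empty())
> >>          return NULL;
> >>        SUnit *SU = Ready.back(); // LIFO pick approximates depth-first
> >>        Ready.pop_back();
> >>        return SU;
> >>      }
> >>      virtual void schedNode(SUnit *SU, bool IsTopNode) {}
> >>      virtual void releaseTopNode(SUnit *SU) {} // unused when bottom-up
> >>      virtual void releaseBottomNode(SUnit *SU) { Ready.push_back(SU); }
> >>    };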
> >> 
> >> 
> > 
> > Hi Vincent,
> > 
> > Just a few small comments, but otherwise these patches are:
> > 
> > Reviewed-by: Tom Stellard <thomas.stellard at amd.com>
> > 
> > 
> >>  Vincent
> > 
> >>  From 40b235966b3771a446d2b720f96e58b4b09f92b6 Mon Sep 17 00:00:00 2001
> >>  From: Vincent Lejeune <vljn at ovi.com>
> >>  Date: Sun, 24 Feb 2013 16:31:32 +0100
> >>  Subject: [PATCH 1/5] R600: Relax some vector constraints on Dot4.
> >> 
> >>  Dot4 now uses 8 scalar operands instead of 2 vector ones, which allows the
> >>  register coalescer to remove some unneeded COPYs.
> >>  This patch also defines some structures/functions that can be used to
> >>  handle every vector instruction (CUBE, Cayman special instructions...) in
> >>  a similar fashion.
> >>  ---
> >>   lib/Target/R600/AMDGPUISelLowering.h        |  1 +
> >>   lib/Target/R600/R600Defines.h               | 74 ++++++++++++++++++++++++
> >>   lib/Target/R600/R600EmitClauseMarkers.cpp   |  6 +-
> >>   lib/Target/R600/R600ExpandSpecialInstrs.cpp | 34 +++++++++++
> >>   lib/Target/R600/R600ISelLowering.cpp        | 21 +++++++
> >>   lib/Target/R600/R600InstrInfo.cpp           | 89 +++++++++++++++++++++++++++++
> >>   lib/Target/R600/R600InstrInfo.h             |  5 ++
> >>   lib/Target/R600/R600Instructions.td         | 51 ++++++++++++++++-
> >>   lib/Target/R600/R600MachineScheduler.cpp    |  2 +
> >>   test/CodeGen/R600/pv.ll                     |  2 +-
> >>   10 files changed, 279 insertions(+), 6 deletions(-)
> >> 
> >>  diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
> >>  index f108fbc..4c25886 100644
> >>  --- a/lib/Target/R600/AMDGPUISelLowering.h
> >>  +++ b/lib/Target/R600/AMDGPUISelLowering.h
> >>  @@ -126,6 +126,7 @@ enum {
> >>     SMIN,
> >>     UMIN,
> >>     URECIP,
> >>  +  DOT4,
> >>     TEXTURE_FETCH,
> >>     EXPORT,
> >>     CONST_ADDRESS,
> >>  diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
> >>  index 36bfb18..aebe581 100644
> >>  --- a/lib/Target/R600/R600Defines.h
> >>  +++ b/lib/Target/R600/R600Defines.h
> >>  @@ -98,6 +98,80 @@ namespace R600Operands {
> >>       {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18}
> >>     };
> >>   
> >>  +  enum VecOps {
> >>  +    UPDATE_EXEC_MASK_X,
> >>  +    UPDATE_PREDICATE_X,
> >>  +    WRITE_X,
> >>  +    OMOD_X,
> >>  +    DST_REL_X,
> >>  +    CLAMP_X,
> >>  +    SRC0_X,
> >>  +    SRC0_NEG_X,
> >>  +    SRC0_REL_X,
> >>  +    SRC0_ABS_X,
> >>  +    SRC0_SEL_X,
> >>  +    SRC1_X,
> >>  +    SRC1_NEG_X,
> >>  +    SRC1_REL_X,
> >>  +    SRC1_ABS_X,
> >>  +    SRC1_SEL_X,
> >>  +    PRED_SEL_X,
> >>  +    UPDATE_EXEC_MASK_Y,
> >>  +    UPDATE_PREDICATE_Y,
> >>  +    WRITE_Y,
> >>  +    OMOD_Y,
> >>  +    DST_REL_Y,
> >>  +    CLAMP_Y,
> >>  +    SRC0_Y,
> >>  +    SRC0_NEG_Y,
> >>  +    SRC0_REL_Y,
> >>  +    SRC0_ABS_Y,
> >>  +    SRC0_SEL_Y,
> >>  +    SRC1_Y,
> >>  +    SRC1_NEG_Y,
> >>  +    SRC1_REL_Y,
> >>  +    SRC1_ABS_Y,
> >>  +    SRC1_SEL_Y,
> >>  +    PRED_SEL_Y,
> >>  +    UPDATE_EXEC_MASK_Z,
> >>  +    UPDATE_PREDICATE_Z,
> >>  +    WRITE_Z,
> >>  +    OMOD_Z,
> >>  +    DST_REL_Z,
> >>  +    CLAMP_Z,
> >>  +    SRC0_Z,
> >>  +    SRC0_NEG_Z,
> >>  +    SRC0_REL_Z,
> >>  +    SRC0_ABS_Z,
> >>  +    SRC0_SEL_Z,
> >>  +    SRC1_Z,
> >>  +    SRC1_NEG_Z,
> >>  +    SRC1_REL_Z,
> >>  +    SRC1_ABS_Z,
> >>  +    SRC1_SEL_Z,
> >>  +    PRED_SEL_Z,
> >>  +    UPDATE_EXEC_MASK_W,
> >>  +    UPDATE_PREDICATE_W,
> >>  +    WRITE_W,
> >>  +    OMOD_W,
> >>  +    DST_REL_W,
> >>  +    CLAMP_W,
> >>  +    SRC0_W,
> >>  +    SRC0_NEG_W,
> >>  +    SRC0_REL_W,
> >>  +    SRC0_ABS_W,
> >>  +    SRC0_SEL_W,
> >>  +    SRC1_W,
> >>  +    SRC1_NEG_W,
> >>  +    SRC1_REL_W,
> >>  +    SRC1_ABS_W,
> >>  +    SRC1_SEL_W,
> >>  +    PRED_SEL_W,
> >>  +    IMM_0,
> >>  +    IMM_1,
> >>  +    VEC_COUNT
> >>  + };
> >>  +
> >>   }
> >>   
> >>   //===----------------------------------------------------------------------===//
> >>  diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
> >>  index bae39c5..f45e958 100644
> >>  --- a/lib/Target/R600/R600EmitClauseMarkers.cpp
> >>  +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
> >>  @@ -36,8 +36,7 @@ private:
> >>       case AMDGPU::INTERP_PAIR_XY:
> >>       case AMDGPU::INTERP_PAIR_ZW:
> >>       case AMDGPU::INTERP_VEC_LOAD:
> >>  -    case AMDGPU::DOT4_eg_pseudo:
> >>  -    case AMDGPU::DOT4_r600_pseudo:
> >>  +    case AMDGPU::DOT_4:
> >>         return 4;
> >>       case AMDGPU::KILL:
> >>         return 0;
> >>  @@ -71,8 +70,7 @@ private:
> >>       case AMDGPU::INTERP_PAIR_ZW:
> >>       case AMDGPU::INTERP_VEC_LOAD:
> >>       case AMDGPU::COPY:
> >>  -    case AMDGPU::DOT4_eg_pseudo:
> >>  -    case AMDGPU::DOT4_r600_pseudo:
> >>  +    case AMDGPU::DOT_4:
> >>         return true;
> >>       default:
> >>         return false;
> >>  diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
> >>  index f8c900f..da48bba 100644
> >>  --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
> >>  +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
> >>  @@ -182,6 +182,40 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
> >>           MI.eraseFromParent();
> >>           continue;
> >>           }
> >>  +      case AMDGPU::DOT_4: {
> >>  +
> >>  +        const R600RegisterInfo &TRI = TII->getRegisterInfo();
> >>  +
> >>  +        unsigned DstReg = MI.getOperand(0).getReg();
> >>  +        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
> >>  +
> >>  +        for (unsigned Chan = 0; Chan < 4; ++Chan) {
> >>  +          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
> >>  +          unsigned SubDstReg =
> >>  +              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
> >>  +          MachineInstr *BMI =
> >>  +              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
> >>  +          if (Chan > 0) {
> >>  +            BMI->bundleWithPred();
> >>  +          }
> >>  +          if (Mask) {
> >>  +            TII->addFlag(BMI, 0, MO_FLAG_MASK);
> >>  +          }
> >>  +          if (Chan != 3)
> >>  +            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
> >>  +          // While not strictly necessary from hw point of view, we force
> >>  +          // all src operands of a dot4 inst to belong to the same slot.
> >>  +          unsigned Src0 = BMI->getOperand(
> >>  +              TII->getOperandIdx(AMDGPU::DOT4_eg_real, R600Operands::SRC0))
> >>  +              .getReg();
> >>  +          unsigned Src1 = BMI->getOperand(
> >>  +              TII->getOperandIdx(AMDGPU::DOT4_eg_real, R600Operands::SRC1))
> >>  +              .getReg();
> >>  +          assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
> >>  +        }
> >>  +        MI.eraseFromParent();
> >>  +        continue;
> >>  +      }
> >>         }
> >>   
> >>         bool IsReduction = TII->isReductionOp(MI.getOpcode());
> >>  diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> >>  index d232022..728769b 100644
> >>  --- a/lib/Target/R600/R600ISelLowering.cpp
> >>  +++ b/lib/Target/R600/R600ISelLowering.cpp
> >>  @@ -622,6 +622,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
> >>         };
> >>         return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
> >>       }
> >>  +    case AMDGPUIntrinsic::AMDGPU_dp4: {
> >>  +      SDValue Args[8] = {
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
> >>  +          DAG.getConstant(0, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
> >>  +          DAG.getConstant(0, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
> >>  +          DAG.getConstant(1, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
> >>  +          DAG.getConstant(1, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
> >>  +          DAG.getConstant(2, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
> >>  +          DAG.getConstant(2, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
> >>  +          DAG.getConstant(3, MVT::i32)),
> >>  +      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
> >>  +          DAG.getConstant(3, MVT::i32))
> >>  +      };
> >>  +      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
> >>  +    }
> >>   
> >>       case r600_read_ngroups_x:
> >>         return LowerImplicitParameter(DAG, VT, DL, 0);
> >>  diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
> >>  index 53f1ce6..66207ba 100644
> >>  --- a/lib/Target/R600/R600InstrInfo.cpp
> >>  +++ b/lib/Target/R600/R600InstrInfo.cpp
> >>  @@ -865,6 +865,95 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
> >>     return MIB;
> >>   }
> >>   
> >>  +#define OPERAND_CASE(Label) \
> >>  +  case Label: { \
> >>  +    static const R600Operands::VecOps Ops[] = \
> >>  +    { \
> >>  +      Label##_X, \
> >>  +      Label##_Y, \
> >>  +      Label##_Z, \
> >>  +      Label##_W \
> >>  +    }; \
> >>  +    return Ops[Slot]; \
> >>  +  }
> >>  +
> >>  +static R600Operands::VecOps
> >>  +getSlotedOps(R600Operands::Ops Op, unsigned Slot) {
> >>  +  switch (Op) {
> >>  +  OPERAND_CASE(R600Operands::UPDATE_EXEC_MASK)
> >>  +  OPERAND_CASE(R600Operands::UPDATE_PREDICATE)
> >>  +  OPERAND_CASE(R600Operands::WRITE)
> >>  +  OPERAND_CASE(R600Operands::OMOD)
> >>  +  OPERAND_CASE(R600Operands::DST_REL)
> >>  +  OPERAND_CASE(R600Operands::CLAMP)
> >>  +  OPERAND_CASE(R600Operands::SRC0)
> >>  +  OPERAND_CASE(R600Operands::SRC0_NEG)
> >>  +  OPERAND_CASE(R600Operands::SRC0_REL)
> >>  +  OPERAND_CASE(R600Operands::SRC0_ABS)
> >>  +  OPERAND_CASE(R600Operands::SRC0_SEL)
> >>  +  OPERAND_CASE(R600Operands::SRC1)
> >>  +  OPERAND_CASE(R600Operands::SRC1_NEG)
> >>  +  OPERAND_CASE(R600Operands::SRC1_REL)
> >>  +  OPERAND_CASE(R600Operands::SRC1_ABS)
> >>  +  OPERAND_CASE(R600Operands::SRC1_SEL)
> >>  +  OPERAND_CASE(R600Operands::PRED_SEL)
> >>  +  default:
> >>  +    llvm_unreachable("Wrong Operand");
> >>  +  }
> >>  +}
> >>  +
> >>  +#undef OPERAND_CASE
> >>  +
> >>  +static int
> >>  +getVecOperandIdx(R600Operands::VecOps Op) {
> >>  +  return 1 + Op;
> >>  +}
> >>  +
> >>  +
> >>  +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
> >>  +    MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
> >>  +    const {
> >>  +  assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
> >>  +  unsigned Opcode;
> >>  +  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> >>  +  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
> >>  +    Opcode = AMDGPU::DOT4_r600_real;
> >>  +  else
> >>  +    Opcode = AMDGPU::DOT4_eg_real;
> >>  +  MachineBasicBlock::iterator I = MI;
> >>  +  MachineOperand &Src0 = MI->getOperand(
> >>  +      getVecOperandIdx(getSlotedOps(R600Operands::SRC0, Slot)));
> >>  +  MachineOperand &Src1 = MI->getOperand(
> >>  +      getVecOperandIdx(getSlotedOps(R600Operands::SRC1, Slot)));
> >>  +  MachineInstr *MIB = buildDefaultInstruction(
> >>  +      MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
> >>  +  static const R600Operands::Ops Operands[14] = {
> >>  +    R600Operands::UPDATE_EXEC_MASK,
> >>  +    R600Operands::UPDATE_PREDICATE,
> >>  +    R600Operands::WRITE,
> >>  +    R600Operands::OMOD,
> >>  +    R600Operands::DST_REL,
> >>  +    R600Operands::CLAMP,
> >>  +    R600Operands::SRC0_NEG,
> >>  +    R600Operands::SRC0_REL,
> >>  +    R600Operands::SRC0_ABS,
> >>  +    R600Operands::SRC0_SEL,
> >>  +    R600Operands::SRC1_NEG,
> >>  +    R600Operands::SRC1_REL,
> >>  +    R600Operands::SRC1_ABS,
> >>  +    R600Operands::SRC1_SEL,
> >>  +  };
> >>  +
> >>  +  for (unsigned i = 0; i < 14; i++) {
> >>  +    MachineOperand &MO = MI->getOperand(
> >>  +        getVecOperandIdx(getSlotedOps(Operands[i], Slot)));
> >>  +    assert (MO.isImm());
> >>  +    setImmOperand(MIB, Operands[i], MO.getImm());
> >>  +  }
> >>  +  MIB->getOperand(20).setImm(0);
> >>  +  return MIB;
> >>  +}
> >>  +
> >>   MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
> >>                                            MachineBasicBlock::iterator I,
> >>                                            unsigned DstReg,
> >>  diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
> >>  index 5a84cd5..f9ccf4f 100644
> >>  --- a/lib/Target/R600/R600InstrInfo.h
> >>  +++ b/lib/Target/R600/R600InstrInfo.h
> >>  @@ -198,6 +198,11 @@ namespace llvm {
> >>                                                 unsigned Src0Reg,
> >>                                                 unsigned Src1Reg = 0) const;
> >>   
> >>  +  MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
> >>  +                                             MachineInstr *MI,
> >>  +                                             unsigned Slot,
> >>  +                                             unsigned DstReg) const;
> >>  +
> >>     MachineInstr *buildMovImm(MachineBasicBlock &BB,
> >>                                     MachineBasicBlock::iterator I,
> >>                                     unsigned DstReg,
> >>  diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
> >>  index 83bbab1..80a15b3 100644
> >>  --- a/lib/Target/R600/R600Instructions.td
> >>  +++ b/lib/Target/R600/R600Instructions.td
> >>  @@ -593,6 +593,13 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
> >>     [SDNPVariadic]
> >>   >;
> >>   
> >>  +def DOT4 : SDNode<"AMDGPUISD::DOT4",
> >>  +  SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
> >>  +      SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
> >>  +      SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
> >>  +  []
> >>  +>;
> >>  +
> >>   def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
> >>   
> >>   def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
> >>  @@ -1229,12 +1236,54 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
> >>     [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
> >>   >;
> >>   
> >>  +
> >>  +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
> >>  +class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
> >>  +// Slot X
> >>  +   UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
> >>  +   OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
> >>  +   R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
> >>  +   R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
> >>  +   R600_Pred:$pred_sel_X,
> >>  +// Slot Y
> >>  +   UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
> >>  +   OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
> >>  +   R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
> >>  +   R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
> >>  +   R600_Pred:$pred_sel_Y,
> >>  +// Slot Z
> >>  +   UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
> >>  +   OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
> >>  +   R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
> >>  +   R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
> >>  +   R600_Pred:$pred_sel_Z,
> >>  +// Slot W
> >>  +   UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
> >>  +   OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
> >>  +   R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
> >>  +   R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
> >>  +   R600_Pred:$pred_sel_W,
> >>  +   LITERAL:$literal0, LITERAL:$literal1),
> >>  +  "",
> >>  +  pattern,
> >>  +  AnyALU> {}
> >>  +}
> >>  +
> >>  +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
> >>  +  R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
> >>  +  R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
> >>  +  R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
> >>  +  R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
> >>  +
> >>  +
> >>  +
> >>  +
> >>   multiclass DOT4_Common <bits<11> inst> {
> > 
> > Are the DOT4_pseudo and DOT4_real instructions still being used?  If
> > not, we should delete this multiclass.
> >>   
> >>     def _pseudo : R600_REDUCTION <inst,
> >>       (ins R600_Reg128:$src0, R600_Reg128:$src1),
> >>       "DOT4 $dst $src0, $src1",
> >>  -    [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))]
> >>  +    []
> >>     >;
> >>   
> >>     def _real : R600_2OP <inst, "DOT4", []>;
> >>  diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> >>  index c6709a8..e2cab1e 100644
> >>  --- a/lib/Target/R600/R600MachineScheduler.cpp
> >>  +++ b/lib/Target/R600/R600MachineScheduler.cpp
> >>  @@ -185,6 +185,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
> >>       case AMDGPU::INTERP_PAIR_XY:
> >>       case AMDGPU::INTERP_PAIR_ZW:
> >>       case AMDGPU::INTERP_VEC_LOAD:
> >>  +    case AMDGPU::DOT_4:
> >>         return AluT_XYZW;
> >>       case AMDGPU::COPY:
> >>         if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
> >>  @@ -254,6 +255,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
> >>     case AMDGPU::INTERP_VEC_LOAD:
> >>     case AMDGPU::DOT4_eg_pseudo:
> >>     case AMDGPU::DOT4_r600_pseudo:
> >>  +  case AMDGPU::DOT_4:
> >>       return IDAlu;
> >>     case AMDGPU::TEX_VTX_CONSTBUF:
> >>     case AMDGPU::TEX_VTX_TEXBUF:
> >>  diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
> >>  index 37c3d9d..062b741 100644
> >>  --- a/test/CodeGen/R600/pv.ll
> >>  +++ b/test/CodeGen/R600/pv.ll
> >>  @@ -1,7 +1,7 @@
> >>   ; RUN: llc < %s -march=r600 | FileCheck %s
> >>   
> >>   ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
> >>  -;CHECK-NEXT: CNDGE T{{[0-9].[XYZW]}}, PV.x
> >>  +;CHECK: CNDGE * T{{[0-9].[XYZW]}}, PV.x
> >>   
> >>   define void @main() #0 {
> >>   main_body:
> >>  -- 
> >>  1.8.2.1
> >> 
> > 
> >>  From 8ae345f0b0b32a5696389abcec8fcc6a08dfe834 Mon Sep 17 00:00:00 2001
> >>  From: Vincent Lejeune <vljn at ovi.com>
> >>  Date: Sun, 12 May 2013 23:17:59 +0200
> >>  Subject: [PATCH 2/5] R600: Replace big texture opcode switch in scheduler by
> >>   usesTC/usesVC
> >> 
> >>  ---
> >>   lib/Target/R600/R600MachineScheduler.cpp | 26 +++-----------------------
> >>   1 file changed, 3 insertions(+), 23 deletions(-)
> >> 
> >>  diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> >>  index e2cab1e..d86d37a 100644
> >>  --- a/lib/Target/R600/R600MachineScheduler.cpp
> >>  +++ b/lib/Target/R600/R600MachineScheduler.cpp
> >>  @@ -243,6 +243,9 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
> >>   int R600SchedStrategy::getInstKind(SUnit* SU) {
> >>     int Opcode = SU->getInstr()->getOpcode();
> >>   
> >>  +  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
> >>  +    return IDFetch;
> >>  +
> >>     if (TII->isALUInstr(Opcode)) {
> >>       return IDAlu;
> >>     }
> >>  @@ -257,30 +260,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
> >>     case AMDGPU::DOT4_r600_pseudo:
> >>     case AMDGPU::DOT_4:
> >>       return IDAlu;
> >>  -  case AMDGPU::TEX_VTX_CONSTBUF:
> >>  -  case AMDGPU::TEX_VTX_TEXBUF:
> >>  -  case AMDGPU::TEX_LD:
> >>  -  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
> >>  -  case AMDGPU::TEX_GET_GRADIENTS_H:
> >>  -  case AMDGPU::TEX_GET_GRADIENTS_V:
> >>  -  case AMDGPU::TEX_SET_GRADIENTS_H:
> >>  -  case AMDGPU::TEX_SET_GRADIENTS_V:
> >>  -  case AMDGPU::TEX_SAMPLE:
> >>  -  case AMDGPU::TEX_SAMPLE_C:
> >>  -  case AMDGPU::TEX_SAMPLE_L:
> >>  -  case AMDGPU::TEX_SAMPLE_C_L:
> >>  -  case AMDGPU::TEX_SAMPLE_LB:
> >>  -  case AMDGPU::TEX_SAMPLE_C_LB:
> >>  -  case AMDGPU::TEX_SAMPLE_G:
> >>  -  case AMDGPU::TEX_SAMPLE_C_G:
> >>  -  case AMDGPU::TXD:
> >>  -  case AMDGPU::TXD_SHADOW:
> >>  -    return IDFetch;
> >>     default:
> >>  -    DEBUG(
> >>  -        dbgs() << "other inst: ";
> >>  -        SU->dump(DAG);
> >>  -    );
> >>       return IDOther;
> >>     }
> >>   }
> >>  -- 
> >>  1.8.2.1
> >> 
> > 
> >>  From ee3d76a3e7defa85cb2081b6724942ee42336085 Mon Sep 17 00:00:00 2001
> >>  From: Vincent Lejeune <vljn at ovi.com>
> >>  Date: Mon, 13 May 2013 17:33:45 +0200
> >>  Subject: [PATCH 3/5] R600: Use depth first scheduling algorithm
> >> 
> >>  It should increase PV substitution opportunities and lower GPR usage
> >>  (pending computation paths are "flushed" sooner, so a consumer is more
> >>  likely to land right after its producer, where it can read the result
> >>  from PV instead of a GPR).
> >>  ---
> >>   lib/Target/R600/R600MachineScheduler.cpp | 78 +++++++++++---------------------
> >>   lib/Target/R600/R600MachineScheduler.h   | 32 ++-----------
> >>   test/CodeGen/R600/bfi_int.ll             |  2 +-
> >>   test/CodeGen/R600/pv.ll                  |  2 +-
> >>   4 files changed, 33 insertions(+), 81 deletions(-)
> >> 
> >>  diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> >>  index d86d37a..830fc4a 100644
> >>  --- a/lib/Target/R600/R600MachineScheduler.cpp
> >>  +++ b/lib/Target/R600/R600MachineScheduler.cpp
> >>  @@ -21,7 +21,6 @@
> >>   #include "llvm/Pass.h"
> >>   #include "llvm/PassManager.h"
> >>   #include "llvm/Support/raw_ostream.h"
> >>  -#include <set>
> >>   
> >>   using namespace llvm;
> >>   
> >>  @@ -31,9 +30,6 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
> >>     TII = static_cast<const R600InstrInfo*>(DAG->TII);
> >>     TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
> >>     MRI = &DAG->MRI;
> >>  -  Available[IDAlu]->clear();
> >>  -  Available[IDFetch]->clear();
> >>  -  Available[IDOther]->clear();
> >>     CurInstKind = IDOther;
> >>     CurEmitted = 0;
> >>     OccupedSlotsMask = 15;
> >>  @@ -44,16 +40,11 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
> >>     InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
> >>   }
> >>   
> >>  -void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst)
> >>  +void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
> >>  +                                  std::vector<SUnit *> &QDst)
> >>   {
> >>  -  if (QSrc->empty())
> >>  -    return;
> >>  -  for (ReadyQueue::iterator I = QSrc->begin(),
> >>  -      E = QSrc->end(); I != E; ++I) {
> >>  -    (*I)->NodeQueueId &= ~QSrc->getID();
> >>  -    QDst->push(*I);
> >>  -  }
> >>  -  QSrc->clear();
> >>  +  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
> >>  +  QSrc.clear();
> >>   }
> >>   
> >>   SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> >>  @@ -64,9 +55,9 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> >>     // check if we might want to switch current clause type
> >>     bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
> >>         (CurEmitted >= InstKindLimit[CurInstKind]) ||
> >>  -      (Available[CurInstKind]->empty());
> >>  +      (Available[CurInstKind].empty());
> >>     bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
> >>  -      (!Available[IDFetch]->empty() || !Available[IDOther]->empty());
> >>  +      (!Available[IDFetch].empty() || !Available[IDOther].empty());
> >>   
> >>     if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
> >>         (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
> >>  @@ -99,10 +90,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> >>           SU->dump(DAG);
> >>         } else {
> >>           dbgs() << "NO NODE ";
> >>  -        for (int i = 0; i < IDLast; ++i) {
> >>  -          Available[i]->dump();
> >>  -          Pending[i]->dump();
> >>  -        }
> >>           for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
> >>             const SUnit &S = DAG->SUnits[i];
> >>             if (!S.isScheduled)
> >>  @@ -163,7 +150,7 @@ void R600SchedStrategy::releaseTopNode(SUnit *SU) {
> >>     DEBUG(dbgs() << IK << " <= ");
> >>     DEBUG(SU->dump(DAG));
> >>   
> >>  -  Pending[IK]->push(SU);
> >>  +  Pending[IK].push_back(SU);
> >>   }
> >>   
> >>   void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
> >>  @@ -265,16 +252,16 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
> >>     }
> >>   }
> >>   
> >>  -SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
> >>  +SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
> >>     if (Q.empty())
> >>       return NULL;
> >>  -  for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();
> >>  +  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
> >>         It != E; ++It) {
> >>       SUnit *SU = *It;
> >>       InstructionsGroupCandidate.push_back(SU->getInstr());
> >>       if (TII->canBundle(InstructionsGroupCandidate)) {
> >>         InstructionsGroupCandidate.pop_back();
> >>  -      Q.erase(It);
> >>  +      Q.erase((It + 1).base());
> >>         return SU;
> >>       } else {
> >>         InstructionsGroupCandidate.pop_back();
> >>  @@ -284,14 +271,12 @@ SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
> >>   }
> >>   
> >>   void R600SchedStrategy::LoadAlu() {
> >>  -  ReadyQueue *QSrc = Pending[IDAlu];
> >>  -  for (ReadyQueue::iterator I = QSrc->begin(),
> >>  -        E = QSrc->end(); I != E; ++I) {
> >>  -      (*I)->NodeQueueId &= ~QSrc->getID();
> >>  -      AluKind AK = getAluKind(*I);
> >>  -      AvailableAlus[AK].insert(*I);
> >>  -    }
> >>  -    QSrc->clear();
> >>  +  std::vector<SUnit *> &QSrc = Pending[IDAlu];
> >>  +  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
> >>  +    AluKind AK = getAluKind(QSrc[i]);
> >>  +    AvailableAlus[AK].push_back(QSrc[i]);
> >>  +  }
> >>  +  QSrc.clear();
> >>   }
> >>   
> >>   void R600SchedStrategy::PrepareNextSlot() {
> >>  @@ -333,27 +318,16 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
> >>   SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
> >>     static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
> >>     SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
> >>  -  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
> >>  -  if (!UnslotedSU) {
> >>  +  if (SlotedSU)
> >>       return SlotedSU;
> >>  -  } else if (!SlotedSU) {
> >>  +  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
> >>  +  if (UnslotedSU)
> >>       AssignSlot(UnslotedSU->getInstr(), Slot);
> >>  -    return UnslotedSU;
> >>  -  } else {
> >>  -    //Determine which one to pick (the lesser one)
> >>  -    if (CompareSUnit()(SlotedSU, UnslotedSU)) {
> >>  -      AvailableAlus[AluAny].insert(UnslotedSU);
> >>  -      return SlotedSU;
> >>  -    } else {
> >>  -      AvailableAlus[IndexToID[Slot]].insert(SlotedSU);
> >>  -      AssignSlot(UnslotedSU->getInstr(), Slot);
> >>  -      return UnslotedSU;
> >>  -    }
> >>  -  }
> >>  +  return UnslotedSU;
> >>   }
> >>   
> >>   bool R600SchedStrategy::isAvailablesAluEmpty() const {
> >>  -  return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
> >>  +  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
> >>         AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
> >>         AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
> >>         AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
> >>  @@ -391,14 +365,14 @@ SUnit* R600SchedStrategy::pickAlu() {
> >>   
> >>   SUnit* R600SchedStrategy::pickOther(int QID) {
> >>     SUnit *SU = 0;
> >>  -  ReadyQueue *AQ = Available[QID];
> >>  +  std::vector<SUnit *> &AQ = Available[QID];
> >>   
> >>  -  if (AQ->empty()) {
> >>  +  if (AQ.empty()) {
> >>       MoveUnits(Pending[QID], AQ);
> >>     }
> >>  -  if (!AQ->empty()) {
> >>  -    SU = *AQ->begin();
> >>  -    AQ->remove(AQ->begin());
> >>  +  if (!AQ.empty()) {
> >>  +    SU = AQ.back();
> >>  +    AQ.resize(AQ.size() - 1);
> >>     }
> >>     return SU;
> >>   }
> >>  diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> >>  index 3d0367f..c82ee49 100644
> >>  --- a/lib/Target/R600/R600MachineScheduler.h
> >>  +++ b/lib/Target/R600/R600MachineScheduler.h
> >>  @@ -24,13 +24,6 @@ using namespace llvm;
> >>   
> >>   namespace llvm {
> >>   
> >>  -class CompareSUnit {
> >>  -public:
> >>  -  bool operator()(const SUnit *S1, const SUnit *S2) {
> >>  -    return S1->getDepth() > S2->getDepth();
> >>  -  }
> >>  -};
> >>  -
> >>   class R600SchedStrategy : public MachineSchedStrategy {
> >>   
> >>     const ScheduleDAGMI *DAG;
> >>  @@ -38,12 +31,6 @@ class R600SchedStrategy : public MachineSchedStrategy {
> >>     const R600RegisterInfo *TRI;
> >>     MachineRegisterInfo *MRI;
> >>   
> >>  -  enum InstQueue {
> >>  -    QAlu = 1,
> >>  -    QFetch = 2,
> >>  -    QOther = 4
> >>  -  };
> >>  -
> >>     enum InstKind {
> >>       IDAlu,
> >>       IDFetch,
> >>  @@ -62,8 +49,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
> >>       AluLast
> >>     };
> >>   
> >>  -  ReadyQueue *Available[IDLast], *Pending[IDLast];
> >>  -  std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast];
> >>  +  std::vector<SUnit *> Available[IDLast], Pending[IDLast];
> >>  +  std::vector<SUnit *> AvailableAlus[AluLast];
> >>  +  std::vector<SUnit *> FakeCopy;
> >>   
> >>     InstKind CurInstKind;
> >>     int CurEmitted;
> >>  @@ -76,19 +64,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
> >>   public:
> >>     R600SchedStrategy() :
> >>       DAG(0), TII(0), TRI(0), MRI(0) {
> >>  -    Available[IDAlu] = new ReadyQueue(QAlu, "AAlu");
> >>  -    Available[IDFetch] = new ReadyQueue(QFetch, "AFetch");
> >>  -    Available[IDOther] = new ReadyQueue(QOther, "AOther");
> >>  -    Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu");
> >>  -    Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch");
> >>  -    Pending[IDOther] = new ReadyQueue(QOther<<4, "POther");
> >>     }
> >>   
> >>     virtual ~R600SchedStrategy() {
> >>  -    for (unsigned I = 0; I < IDLast; ++I) {
> >>  -      delete Available[I];
> >>  -      delete Pending[I];
> >>  -    }
> >>     }
> >>   
> >>     virtual void initialize(ScheduleDAGMI *dag);
> >>  @@ -107,12 +85,12 @@ private:
> >>     bool isAvailablesAluEmpty() const;
> >>     SUnit *AttemptFillSlot (unsigned Slot);
> >>     void PrepareNextSlot();
> >>  -  SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q);
> >>  +  SUnit *PopInst(std::vector<SUnit*> &Q);
> >>   
> >>     void AssignSlot(MachineInstr *MI, unsigned Slot);
> >>     SUnit* pickAlu();
> >>     SUnit* pickOther(int QID);
> >>  -  void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
> >>  +  void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
> >>   };
> >>   
> >>   } // namespace llvm
> >>  diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
> >>  index 4244dcf..f51060f 100644
> >>  --- a/test/CodeGen/R600/bfi_int.ll
> >>  +++ b/test/CodeGen/R600/bfi_int.ll
> >>  @@ -37,7 +37,7 @@ entry:
> >>   ; ((x & z) | (y & (x | z)))
> >>   ; R600-CHECK: @bfi_sha256_ma
> >>   ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  -; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV.x}}, {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[xyzw]}}, {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> > 
> > A TODO for another patch is to use capital letters for the channel of
> > the PV registers.  This way we could use FileCheck variables to match the
> > correct PV channel, e.g.:
> > 
> > ; R600-CHECK: XOR_INT * T[0-9]+\.[[DST_CHAN:[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> > ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{PV.[[DST_CHAN]]}}, {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> > 
> > 
> >>   ; SI-CHECK: V_XOR_B32_e32 [[DST:VGPR[0-9]+]], {{VGPR[0-9]+, VGPR[0-9]+}}
> >>   ; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{VGPR[0-9]+, VGPR[0-9]+}}
> >>   
> >>  diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
> >>  index 062b741..37c3d9d 100644
> >>  --- a/test/CodeGen/R600/pv.ll
> >>  +++ b/test/CodeGen/R600/pv.ll
> >>  @@ -1,7 +1,7 @@
> >>   ; RUN: llc < %s -march=r600 | FileCheck %s
> >>   
> >>   ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
> >>  -;CHECK: CNDGE * T{{[0-9].[XYZW]}}, PV.x
> >>  +;CHECK-NEXT: CNDGE T{{[0-9].[XYZW]}}, PV.x
> >>   
> >>   define void @main() #0 {
> >>   main_body:
> >>  -- 
> >>  1.8.2.1
> >> 
> > 
> >>  From fa9839686c30077ef55b95066d31217fb53942a6 Mon Sep 17 00:00:00 2001
> >>  From: Vincent Lejeune <vljn at ovi.com>
> >>  Date: Mon, 13 May 2013 18:33:15 +0200
> >>  Subject: [PATCH 4/5] R600: Use bottom up scheduling algorithm
> >> 
> >>  ---
> >>   lib/Target/R600/R600MachineScheduler.cpp | 92 ++++++++++++++++++++++----------
> >>   lib/Target/R600/R600MachineScheduler.h   |  1 +
> >>   lib/Target/R600/R600RegisterInfo.cpp     |  6 ++-
> >>   lib/Target/R600/R600RegisterInfo.h       |  3 ++
> >>   test/CodeGen/R600/fabs.ll                |  2 +-
> >>   test/CodeGen/R600/fadd.ll                |  2 +-
> >>   test/CodeGen/R600/fdiv.ll                |  4 +-
> >>   test/CodeGen/R600/floor.ll               |  2 +-
> >>   test/CodeGen/R600/fmad.ll                |  2 +-
> >>   test/CodeGen/R600/fmax.ll                |  2 +-
> >>   test/CodeGen/R600/fmin.ll                |  2 +-
> >>   test/CodeGen/R600/fmul.ll                |  2 +-
> >>   test/CodeGen/R600/fsub.ll                |  2 +-
> >>   test/CodeGen/R600/llvm.AMDGPU.mul.ll     |  2 +-
> >>   test/CodeGen/R600/llvm.AMDGPU.trunc.ll   |  2 +-
> >>   test/CodeGen/R600/llvm.pow.ll            |  2 +-
> >>   test/CodeGen/R600/pv.ll                  |  1 +
> >>   test/CodeGen/R600/selectcc-opt.ll        |  2 +
> >>   18 files changed, 88 insertions(+), 43 deletions(-)
> >> 
> >>  diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> >>  index 830fc4a..36e982a 100644
> >>  --- a/lib/Target/R600/R600MachineScheduler.cpp
> >>  +++ b/lib/Target/R600/R600MachineScheduler.cpp
> >>  @@ -24,6 +24,10 @@
> >>   
> >>   using namespace llvm;
> >>   
> >>  +cl::opt<bool> TopDown("r600-topdown", cl::Hidden,
> >>  +    cl::desc("Switch top-down scheduling"), cl::init(false));
> >>  +
> >>  +
> > 
> > Is there any particular reason to keep the top-down scheduler?  It seems
> > like removing it would simplify the code.
> > 
> >>   void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
> >>   
> >>     DAG = dag;
> >>  @@ -34,7 +38,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
> >>     CurEmitted = 0;
> >>     OccupedSlotsMask = 15;
> >>     InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
> >>  -
> >>  +  InstKindLimit[IDOther] = 32;
> >>   
> >>     const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
> >>     InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
> >>  @@ -49,9 +53,10 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
> >>   
> >>   SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> >>     SUnit *SU = 0;
> >>  -  IsTopNode = true;
> >>     NextInstKind = IDOther;
> >>   
> >>  +  IsTopNode = TopDown;
> >>  +
> >>     // check if we might want to switch current clause type
> >>     bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
> >>         (CurEmitted >= InstKindLimit[CurInstKind]) ||
> >>  @@ -86,10 +91,10 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> >>   
> >>     DEBUG(
> >>         if (SU) {
> >>  -        dbgs() << "picked node: ";
> >>  +        dbgs() << " ** Pick node **\n";
> >>           SU->dump(DAG);
> >>         } else {
> >>  -        dbgs() << "NO NODE ";
> >>  +        dbgs() << "NO NODE \n";
> >>           for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
> >>             const SUnit &S = DAG->SUnits[i];
> >>             if (!S.isScheduled)
> >>  @@ -103,9 +108,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> >>   
> >>   void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
> >>   
> >>  -  DEBUG(dbgs() << "scheduled: ");
> >>  -  DEBUG(SU->dump(DAG));
> >>  -
> >>     if (NextInstKind != CurInstKind) {
> >>       DEBUG(dbgs() << "Instruction Type Switch\n");
> >>       if (NextInstKind != IDAlu)
> >>  @@ -141,19 +143,32 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
> >>     if (CurInstKind != IDFetch) {
> >>       MoveUnits(Pending[IDFetch], Available[IDFetch]);
> >>     }
> >>  -  MoveUnits(Pending[IDOther], Available[IDOther]);
> >>   }
> >>   
> >>   void R600SchedStrategy::releaseTopNode(SUnit *SU) {
> >>  -  int IK = getInstKind(SU);
> >>  -
> >>  -  DEBUG(dbgs() << IK << " <= ");
> >>  -  DEBUG(SU->dump(DAG));
> >>  -
> >>  -  Pending[IK].push_back(SU);
> >>  +  DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
> >>  +
> >>  +  if (TopDown) {
> >>  +    int IK = getInstKind(SU);
> >>  +    // There is no export clause, we can schedule one as soon as it's ready
> >>  +    if (IK == IDOther)
> >>  +      Available[IDOther].push_back(SU);
> >>  +    else
> >>  +      Pending[IK].push_back(SU);
> >>  +  }
> >>   }
> >>   
> >>   void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
> >>  +  DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
> >>  +
> >>  +  if (!TopDown) {
> >>  +    int IK = getInstKind(SU);
> >>  +    // There is no export clause, we can schedule one as soon as it's ready
> >>  +    if (IK == IDOther)
> >>  +      Available[IDOther].push_back(SU);
> >>  +    else
> >>  +      Pending[IK].push_back(SU);
> >>  +  }
> >>   }
> >>   
> >>   bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
> >>  @@ -169,18 +184,16 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
> >>     MachineInstr *MI = SU->getInstr();
> >>   
> >>       switch (MI->getOpcode()) {
> >>  +    case AMDGPU::PRED_X:
> >>  +      if (!TopDown)
> >>  +        return AluPredX;
> >>       case AMDGPU::INTERP_PAIR_XY:
> >>       case AMDGPU::INTERP_PAIR_ZW:
> >>       case AMDGPU::INTERP_VEC_LOAD:
> >>       case AMDGPU::DOT_4:
> >>         return AluT_XYZW;
> >>       case AMDGPU::COPY:
> >>  -      if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
> >>  -        // %vregX = COPY Tn_X is likely to be discarded in favor of an
> >>  -        // assignement of Tn_X to %vregX, don't considers it in scheduling
> >>  -        return AluDiscarded;
> >>  -      }
> >>  -        return AluDiscarded;
> >>  -      }
> >>  -      else if (MI->getOperand(1).isUndef()) {
> >>  +      if (MI->getOperand(1).isUndef()) {
> >>           // MI will become a KILL, don't considers it in scheduling
> >>           return AluDiscarded;
> >>         }
> >>  @@ -238,6 +251,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
> >>     }
> >>   
> >>     switch (Opcode) {
> >>  +  case AMDGPU::PRED_X:
> >>     case AMDGPU::COPY:
> >>     case AMDGPU::CONST_COPY:
> >>     case AMDGPU::INTERP_PAIR_XY:
> >>  @@ -330,12 +344,18 @@ bool R600SchedStrategy::isAvailablesAluEmpty() const {
> >>     return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
> >>         AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
> >>         AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
> >>  -      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
> >>  +      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
> >>  +      AvailableAlus[AluPredX].empty();
> >>   }
> >>   
> >>   SUnit* R600SchedStrategy::pickAlu() {
> >>     while (!isAvailablesAluEmpty()) {
> >>       if (!OccupedSlotsMask) {
> >>  +      // If we do bottom-up scheduling, predX must come first
> >>  +      if (!TopDown && !AvailableAlus[AluPredX].empty()) {
> >>  +        OccupedSlotsMask = 15;
> >>  +        return PopInst(AvailableAlus[AluPredX]);
> >>  +      }
> >>         // Flush physical reg copies (RA will discard them)
> >>         if (!AvailableAlus[AluDiscarded].empty()) {
> >>           OccupedSlotsMask = 15;
> >>  @@ -347,14 +367,28 @@ SUnit* R600SchedStrategy::pickAlu() {
> >>           return PopInst(AvailableAlus[AluT_XYZW]);
> >>         }
> >>       }
> >>  -    for (unsigned Chan = 0; Chan < 4; ++Chan) {
> >>  -      bool isOccupied = OccupedSlotsMask & (1 << Chan);
> >>  -      if (!isOccupied) {
> >>  -        SUnit *SU = AttemptFillSlot(Chan);
> >>  -        if (SU) {
> >>  -          OccupedSlotsMask |= (1 << Chan);
> >>  -          InstructionsGroupCandidate.push_back(SU->getInstr());
> >>  -          return SU;
> >>  +    if (TopDown) {
> >>  +      for (int Chan = 0; Chan < 4; ++Chan) {
> >>  +        bool isOccupied = OccupedSlotsMask & (1 << Chan);
> >>  +        if (!isOccupied) {
> >>  +          SUnit *SU = AttemptFillSlot(Chan);
> >>  +          if (SU) {
> >>  +            OccupedSlotsMask |= (1 << Chan);
> >>  +            InstructionsGroupCandidate.push_back(SU->getInstr());
> >>  +            return SU;
> >>  +          }
> >>  +        }
> >>  +      }
> >>  +    } else {
> >>  +      for (int Chan = 3; Chan > -1; --Chan) {
> >>  +        bool isOccupied = OccupedSlotsMask & (1 << Chan);
> >>  +        if (!isOccupied) {
> >>  +          SUnit *SU = AttemptFillSlot(Chan);
> >>  +          if (SU) {
> >>  +            OccupedSlotsMask |= (1 << Chan);
> >>  +            InstructionsGroupCandidate.push_back(SU->getInstr());
> >>  +            return SU;
> >>  +          }
> >>           }
> >>         }
> >>       }
> >>  diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> >>  index c82ee49..da99dbd 100644
> >>  --- a/lib/Target/R600/R600MachineScheduler.h
> >>  +++ b/lib/Target/R600/R600MachineScheduler.h
> >>  @@ -45,6 +45,7 @@ class R600SchedStrategy : public MachineSchedStrategy {
> >>       AluT_Z,
> >>       AluT_W,
> >>       AluT_XYZW,
> >>  +    AluPredX,
> >>       AluDiscarded, // LLVM Instructions that are going to be eliminated
> >>       AluLast
> >>     };
> >>  diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
> >>  index bbd7995..7d13420 100644
> >>  --- a/lib/Target/R600/R600RegisterInfo.cpp
> >>  +++ b/lib/Target/R600/R600RegisterInfo.cpp
> >>  @@ -25,7 +25,7 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
> >>   : AMDGPURegisterInfo(tm, tii),
> >>     TM(tm),
> >>     TII(tii)
> >>  -  { }
> >>  +  { RCW.RegWeight = 0; RCW.WeightLimit = 0;}
> >>   
> >>   BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
> >>     BitVector Reserved(getNumRegs());
> >>  @@ -97,3 +97,7 @@ unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
> >>     }
> >>   }
> >>   
> >>  +const RegClassWeight &R600RegisterInfo::getRegClassWeight(
> >>  +  const TargetRegisterClass *RC) const {
> >>  +  return RCW;
> >>  +}
> >>  diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
> >>  index f9ca918..1270a1e 100644
> >>  --- a/lib/Target/R600/R600RegisterInfo.h
> >>  +++ b/lib/Target/R600/R600RegisterInfo.h
> >>  @@ -26,6 +26,7 @@ class TargetInstrInfo;
> >>   struct R600RegisterInfo : public AMDGPURegisterInfo {
> >>     AMDGPUTargetMachine &TM;
> >>     const TargetInstrInfo &TII;
> >>  +  RegClassWeight RCW;
> >>   
> >>     R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
> >>   
> >>  @@ -48,6 +49,8 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
> >>     /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
> >>     unsigned getSubRegFromChannel(unsigned Channel) const;
> >>   
> >>  +  virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
> >>  +
> >>   };
> >>   
> >>   } // End namespace llvm
> >>  diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
> >>  index 85f2882..17ac895 100644
> >>  --- a/test/CodeGen/R600/fabs.ll
> >>  +++ b/test/CodeGen/R600/fabs.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: MOV * T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
> >>  +;CHECK: MOV * T{{[0-9]+\.[XYZW], \|PV\.[xyzw]\|}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
> >>  index 9a67232..821d329 100644
> >>  --- a/test/CodeGen/R600/fadd.ll
> >>  +++ b/test/CodeGen/R600/fadd.ll
> >>  @@ -1,7 +1,7 @@
> >>   ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>   ; CHECK: @fadd_f32
> >>  -; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +; CHECK: ADD * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> >>   
> >>   define void @fadd_f32() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
> >>  index 2e68e36..003590b 100644
> >>  --- a/test/CodeGen/R600/fdiv.ll
> >>  +++ b/test/CodeGen/R600/fdiv.ll
> >>  @@ -1,12 +1,12 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>   ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  -;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   ;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   ;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   ;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   
> >>   define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
> >>  diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
> >>  index 877d69a..0a807b1 100644
> >>  --- a/test/CodeGen/R600/floor.ll
> >>  +++ b/test/CodeGen/R600/floor.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: FLOOR * T{{[0-9]+\.[XYZW], PV\.[xyzw]}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll
> >>  index 62001ed..8614115 100644
> >>  --- a/test/CodeGen/R600/fmad.ll
> >>  +++ b/test/CodeGen/R600/fmad.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: MULADD_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], PV\.[xyzw], PV.[xyzw], PV\.[xyzw]}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
> >>  index 8b704e5..ef3daad 100644
> >>  --- a/test/CodeGen/R600/fmax.ll
> >>  +++ b/test/CodeGen/R600/fmax.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: MAX * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
> >>  index 5e34b7c..026481c 100644
> >>  --- a/test/CodeGen/R600/fmin.ll
> >>  +++ b/test/CodeGen/R600/fmin.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: MIN * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
> >>  index c292946..dbb6424 100644
> >>  --- a/test/CodeGen/R600/fmul.ll
> >>  +++ b/test/CodeGen/R600/fmul.ll
> >>  @@ -1,7 +1,7 @@
> >>   ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>   ; CHECK: @fmul_f32
> >>  -; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> >>   
> >>   define void @fmul_f32() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
> >>  index f784cde..f88729e 100644
> >>  --- a/test/CodeGen/R600/fsub.ll
> >>  +++ b/test/CodeGen/R600/fsub.ll
> >>  @@ -1,7 +1,7 @@
> >>   ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>   ; CHECK: @fsub_f32
> >>  -; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
> >>  +; CHECK: ADD * T{{[0-9]+\.[XYZW], PV\.[xyzw], -PV\.[xyzw]}}
> >>   
> >>   define void @fsub_f32() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll 
> > b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> >>  index cc0732b..69fbe58 100644
> >>  --- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> >>  +++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll 
> > b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
> >>  index ff22a69..ae4df21 100644
> >>  --- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
> >>  +++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -;CHECK: TRUNC * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: TRUNC * T{{[0-9]+\.[XYZW], PV\.[xyzw]}}
> >>   
> >>   define void @test() {
> >>      %r0 = call float @llvm.R600.load.input(i32 0)
> >>  diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
> >>  index 91b7742..3800abf 100644
> >>  --- a/test/CodeGen/R600/llvm.pow.ll
> >>  +++ b/test/CodeGen/R600/llvm.pow.ll
> >>  @@ -1,7 +1,7 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>   ;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  -;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>  +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW]}}
> >>   ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> >>   
> >>   define void @test() {
> >>  diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
> >>  index 37c3d9d..f7617bb 100644
> >>  --- a/test/CodeGen/R600/pv.ll
> >>  +++ b/test/CodeGen/R600/pv.ll
> >>  @@ -1,5 +1,6 @@
> >>   ; RUN: llc < %s -march=r600 | FileCheck %s
> >>   
> >>  +;CHECK: LOG_IEEE
> >>   ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
> >>   ;CHECK-NEXT: CNDGE T{{[0-9].[XYZW]}}, PV.x
> >>   
> >>  diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
> >>  index 02d9353..7f568fc 100644
> >>  --- a/test/CodeGen/R600/selectcc-opt.ll
> >>  +++ b/test/CodeGen/R600/selectcc-opt.ll
> >>  @@ -29,8 +29,10 @@ ENDIF:
> >>   ; for the icmp instruction
> >>   
> >>   ; CHECK: @test_b
> >>  +; CHECK: VTX_READ
> >>   ; CHECK: SET{{[GTEQN]+}}_DX10
> >>   ; CHECK-NEXT: PRED_
> >>  +; CHECK-NEXT: ALU clause starting
> >>   define void @test_b(i32 addrspace(1)* %out, float %in) {
> >>   entry:
> >>     %0 = fcmp ult float %in, 0.0
> >>  -- 
> >>  1.8.2.1
> >> 
> > 
> >>  From 8c2add6f72573de0efa091c01679a6eb90eae486 Mon Sep 17 00:00:00 2001
> >>  From: Vincent Lejeune <vljn at ovi.com>
> >>  Date: Tue, 14 May 2013 01:13:27 +0200
> >>  Subject: [PATCH 5/5] R600: Lower int_load_input to copyFromReg instead of
> >>   Register node
> >> 
> >>  It solves a bug uncovered by the dot4 patch, where the register class of
> >>  the int_load_input use was ignored.
> >>  ---
> >>   lib/Target/R600/R600ISelLowering.cpp |   6 +-
> >>   test/CodeGen/R600/load-input-fold.ll | 462 +++++++++++++++++++++++++++++
> >>   2 files changed, 467 insertions(+), 1 deletion(-)
> >>   create mode 100644 test/CodeGen/R600/load-input-fold.ll
> >> 
> >>  diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> >>  index 728769b..c89206b 100644
> >>  --- a/lib/Target/R600/R600ISelLowering.cpp
> >>  +++ b/lib/Target/R600/R600ISelLowering.cpp
> >>  @@ -521,7 +521,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
> >>       case AMDGPUIntrinsic::R600_load_input: {
> >>         int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
> >>         unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
> >>  -      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
> >>  +      MachineFunction &MF = DAG.getMachineFunction();
> >>  +      MachineRegisterInfo &MRI = MF.getRegInfo();
> >>  +      MRI.addLiveIn(Reg);
> >>  +      return DAG.getCopyFromReg(DAG.getEntryNode(),
> >>  +          DAG.getEntryNode().getDebugLoc(), Reg, VT);
> >>       }
> >>   
> >>       case AMDGPUIntrinsic::R600_interp_input: {
> >>  diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll
> >>  new file mode 100644
> >>  index 0000000..8d2ece2
> >>  --- /dev/null
> >>  +++ b/test/CodeGen/R600/load-input-fold.ll
> >>  @@ -0,0 +1,462 @@
> >>  +;RUN: llc < %s -march=r600 -mcpu=cayman
> >>  +;REQUIRES: asserts
> >>  +
> > 
> > Is this test still effective if we only use one llvm.AMDGPU.dp4
> > intrinsic call?
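> > 
> > A reduced version along these lines might be worth trying (an untested
> > sketch, assuming the usual load.input/store.output test pattern; it would
> > need to be checked against the original failure before swapping it in):
> > 
> >   define void @main() {
> >   main_body:
> >     %0 = call float @llvm.R600.load.input(i32 4)
> >     %1 = call float @llvm.R600.load.input(i32 5)
> >     %2 = call float @llvm.R600.load.input(i32 6)
> >     %3 = call float @llvm.R600.load.input(i32 7)
> >     %vec0 = insertelement <4 x float> undef, float %0, i32 0
> >     %vec1 = insertelement <4 x float> %vec0, float %1, i32 1
> >     %vec2 = insertelement <4 x float> %vec1, float %2, i32 2
> >     %vec3 = insertelement <4 x float> %vec2, float %3, i32 3
> >     %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %vec3, <4 x float> %vec3)
> >     call void @llvm.AMDGPU.store.output(float %dp4, i32 0)
> >     ret void
> >   }
> > 
> >   declare float @llvm.R600.load.input(i32) readnone
> >   declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) readnone
> >   declare void @llvm.AMDGPU.store.output(float, i32)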
> > 
> >>  +define void @main() #0 {
> >>  +main_body:
> >>  +  %0 = call float @llvm.R600.load.input(i32 4)
> >>  +  %1 = call float @llvm.R600.load.input(i32 5)
> >>  +  %2 = call float @llvm.R600.load.input(i32 6)
> >>  +  %3 = call float @llvm.R600.load.input(i32 7)
> >>  +  %4 = call float @llvm.R600.load.input(i32 8)
> >>  +  %5 = call float @llvm.R600.load.input(i32 9)
> >>  +  %6 = call float @llvm.R600.load.input(i32 10)
> >>  +  %7 = call float @llvm.R600.load.input(i32 11)
> >>  +  %8 = call float @llvm.R600.load.input(i32 12)
> >>  +  %9 = call float @llvm.R600.load.input(i32 13)
> >>  +  %10 = call float @llvm.R600.load.input(i32 14)
> >>  +  %11 = call float @llvm.R600.load.input(i32 15)
> >>  +  %12 = load <4 x float> addrspace(8)* null
> >>  +  %13 = extractelement <4 x float> %12, i32 0
> >>  +  %14 = fmul float %0, %13
> >>  +  %15 = load <4 x float> addrspace(8)* null
> >>  +  %16 = extractelement <4 x float> %15, i32 1
> >>  +  %17 = fmul float %0, %16
> >>  +  %18 = load <4 x float> addrspace(8)* null
> >>  +  %19 = extractelement <4 x float> %18, i32 2
> >>  +  %20 = fmul float %0, %19
> >>  +  %21 = load <4 x float> addrspace(8)* null
> >>  +  %22 = extractelement <4 x float> %21, i32 3
> >>  +  %23 = fmul float %0, %22
> >>  +  %24 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> >>  +  %25 = extractelement <4 x float> %24, i32 0
> >>  +  %26 = fmul float %1, %25
> >>  +  %27 = fadd float %26, %14
> >>  +  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> >>  +  %29 = extractelement <4 x float> %28, i32 1
> >>  +  %30 = fmul float %1, %29
> >>  +  %31 = fadd float %30, %17
> >>  +  %32 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> >>  +  %33 = extractelement <4 x float> %32, i32 2
> >>  +  %34 = fmul float %1, %33
> >>  +  %35 = fadd float %34, %20
> >>  +  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> >>  +  %37 = extractelement <4 x float> %36, i32 3
> >>  +  %38 = fmul float %1, %37
> >>  +  %39 = fadd float %38, %23
> >>  +  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> >>  +  %41 = extractelement <4 x float> %40, i32 0
> >>  +  %42 = fmul float %2, %41
> >>  +  %43 = fadd float %42, %27
> >>  +  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> >>  +  %45 = extractelement <4 x float> %44, i32 1
> >>  +  %46 = fmul float %2, %45
> >>  +  %47 = fadd float %46, %31
> >>  +  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> >>  +  %49 = extractelement <4 x float> %48, i32 2
> >>  +  %50 = fmul float %2, %49
> >>  +  %51 = fadd float %50, %35
> >>  +  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> >>  +  %53 = extractelement <4 x float> %52, i32 3
> >>  +  %54 = fmul float %2, %53
> >>  +  %55 = fadd float %54, %39
> >>  +  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> >>  +  %57 = extractelement <4 x float> %56, i32 0
> >>  +  %58 = fmul float %3, %57
> >>  +  %59 = fadd float %58, %43
> >>  +  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> >>  +  %61 = extractelement <4 x float> %60, i32 1
> >>  +  %62 = fmul float %3, %61
> >>  +  %63 = fadd float %62, %47
> >>  +  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> >>  +  %65 = extractelement <4 x float> %64, i32 2
> >>  +  %66 = fmul float %3, %65
> >>  +  %67 = fadd float %66, %51
> >>  +  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> >>  +  %69 = extractelement <4 x float> %68, i32 3
> >>  +  %70 = fmul float %3, %69
> >>  +  %71 = fadd float %70, %55
> >>  +  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> >>  +  %73 = extractelement <4 x float> %72, i32 0
> >>  +  %74 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> >>  +  %75 = extractelement <4 x float> %74, i32 1
> >>  +  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> >>  +  %77 = extractelement <4 x float> %76, i32 2
> >>  +  %78 = insertelement <4 x float> undef, float %4, i32 0
> >>  +  %79 = insertelement <4 x float> %78, float %5, i32 1
> >>  +  %80 = insertelement <4 x float> %79, float %6, i32 2
> >>  +  %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3
> >>  +  %82 = insertelement <4 x float> undef, float %73, i32 0
> >>  +  %83 = insertelement <4 x float> %82, float %75, i32 1
> >>  +  %84 = insertelement <4 x float> %83, float %77, i32 2
> >>  +  %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
> >>  +  %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85)
> >>  +  %87 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> >>  +  %88 = extractelement <4 x float> %87, i32 0
> >>  +  %89 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> >>  +  %90 = extractelement <4 x float> %89, i32 1
> >>  +  %91 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> >>  +  %92 = extractelement <4 x float> %91, i32 2
> >>  +  %93 = insertelement <4 x float> undef, float %4, i32 0
> >>  +  %94 = insertelement <4 x float> %93, float %5, i32 1
> >>  +  %95 = insertelement <4 x float> %94, float %6, i32 2
> >>  +  %96 = insertelement <4 x float> %95, float 0.000000e+00, i32 3
> >>  +  %97 = insertelement <4 x float> undef, float %88, i32 0
> >>  +  %98 = insertelement <4 x float> %97, float %90, i32 1
> >>  +  %99 = insertelement <4 x float> %98, float %92, i32 2
> >>  +  %100 = insertelement <4 x float> %99, float 0.000000e+00, i32 3
> >>  +  %101 = call float @llvm.AMDGPU.dp4(<4 x float> %96, <4 x float> %100)
> >>  +  %102 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
> >>  +  %103 = extractelement <4 x float> %102, i32 0
> >>  +  %104 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
> >>  +  %105 = extractelement <4 x float> %104, i32 1
> >>  +  %106 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
> >>  +  %107 = extractelement <4 x float> %106, i32 2
> >>  +  %108 = insertelement <4 x float> undef, float %4, i32 0
> >>  +  %109 = insertelement <4 x float> %108, float %5, i32 1
> >>  +  %110 = insertelement <4 x float> %109, float %6, i32 2
> >>  +  %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3
> >>  +  %112 = insertelement <4 x float> undef, float %103, i32 0
> >>  +  %113 = insertelement <4 x float> %112, float %105, i32 1
> >>  +  %114 = insertelement <4 x float> %113, float %107, i32 2
> >>  +  %115 = insertelement <4 x float> %114, float 0.000000e+00, i32 3
> >>  +  %116 = call float @llvm.AMDGPU.dp4(<4 x float> %111, <4 x float> %115)
> >>  +  %117 = insertelement <4 x float> undef, float %86, i32 0
> >>  +  %118 = insertelement <4 x float> %117, float %101, i32 1
> >>  +  %119 = insertelement <4 x float> %118, float %116, i32 2
> >>  +  %120 = insertelement <4 x float> %119, float 0.000000e+00, i32 3
> >>  +  %121 = insertelement <4 x float> undef, float %86, i32 0
> >>  +  %122 = insertelement <4 x float> %121, float %101, i32 1
> >>  +  %123 = insertelement <4 x float> %122, float %116, i32 2
> >>  +  %124 = insertelement <4 x float> %123, float 0.000000e+00, i32 3
> >>  +  %125 = call float @llvm.AMDGPU.dp4(<4 x float> %120, <4 x float> %124)
> >>  +  %126 = call float @fabs(float %125)
> >>  +  %127 = call float @llvm.AMDGPU.rsq(float %126)
> >>  +  %128 = fmul float %86, %127
> >>  +  %129 = fmul float %101, %127
> >>  +  %130 = fmul float %116, %127
> >>  +  %131 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> >>  +  %132 = extractelement <4 x float> %131, i32 0
> >>  +  %133 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> >>  +  %134 = extractelement <4 x float> %133, i32 0
> >>  +  %135 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> >>  +  %136 = extractelement <4 x float> %135, i32 1
> >>  +  %137 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> >>  +  %138 = extractelement <4 x float> %137, i32 2
> >>  +  %139 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> >>  +  %140 = extractelement <4 x float> %139, i32 3
> >>  +  %141 = call float @llvm.AMDIL.clamp.(float %134, float 0.000000e+00, float 1.000000e+00)
> >>  +  %142 = call float @llvm.AMDIL.clamp.(float %136, float 0.000000e+00, float 1.000000e+00)
> >>  +  %143 = call float @llvm.AMDIL.clamp.(float %138, float 0.000000e+00, float 1.000000e+00)
> >>  +  %144 = call float @llvm.AMDIL.clamp.(float %140, float 0.000000e+00, float 1.000000e+00)
> >>  +  %145 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
> >>  +  %146 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
> >>  +  %147 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
> >>  +  %148 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
> >>  +  %149 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
> >>  +  %150 = extractelement <4 x float> %149, i32 0
> >>  +  %151 = fmul float %0, %150
> >>  +  %152 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
> >>  +  %153 = extractelement <4 x float> %152, i32 1
> >>  +  %154 = fmul float %0, %153
> >>  +  %155 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
> >>  +  %156 = extractelement <4 x float> %155, i32 2
> >>  +  %157 = fmul float %0, %156
> >>  +  %158 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
> >>  +  %159 = extractelement <4 x float> %158, i32 0
> >>  +  %160 = fmul float %1, %159
> >>  +  %161 = fadd float %160, %151
> >>  +  %162 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
> >>  +  %163 = extractelement <4 x float> %162, i32 1
> >>  +  %164 = fmul float %1, %163
> >>  +  %165 = fadd float %164, %154
> >>  +  %166 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
> >>  +  %167 = extractelement <4 x float> %166, i32 2
> >>  +  %168 = fmul float %1, %167
> >>  +  %169 = fadd float %168, %157
> >>  +  %170 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
> >>  +  %171 = extractelement <4 x float> %170, i32 0
> >>  +  %172 = fmul float %2, %171
> >>  +  %173 = fadd float %172, %161
> >>  +  %174 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
> >>  +  %175 = extractelement <4 x float> %174, i32 1
> >>  +  %176 = fmul float %2, %175
> >>  +  %177 = fadd float %176, %165
> >>  +  %178 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
> >>  +  %179 = extractelement <4 x float> %178, i32 2
> >>  +  %180 = fmul float %2, %179
> >>  +  %181 = fadd float %180, %169
> >>  +  %182 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
> >>  +  %183 = extractelement <4 x float> %182, i32 0
> >>  +  %184 = fmul float %3, %183
> >>  +  %185 = fadd float %184, %173
> >>  +  %186 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
> >>  +  %187 = extractelement <4 x float> %186, i32 1
> >>  +  %188 = fmul float %3, %187
> >>  +  %189 = fadd float %188, %177
> >>  +  %190 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
> >>  +  %191 = extractelement <4 x float> %190, i32 2
> >>  +  %192 = fmul float %3, %191
> >>  +  %193 = fadd float %192, %181
> >>  +  %194 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> >>  +  %195 = extractelement <4 x float> %194, i32 0
> >>  +  %196 = fsub float %195, %185
> >>  +  %197 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> >>  +  %198 = extractelement <4 x float> %197, i32 1
> >>  +  %199 = fsub float %198, %189
> >>  +  %200 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> >>  +  %201 = extractelement <4 x float> %200, i32 2
> >>  +  %202 = fsub float %201, %193
> >>  +  %203 = insertelement <4 x float> undef, float %196, i32 0
> >>  +  %204 = insertelement <4 x float> %203, float %199, i32 1
> >>  +  %205 = insertelement <4 x float> %204, float %202, i32 2
> >>  +  %206 = insertelement <4 x float> %205, float 0.000000e+00, i32 3
> >>  +  %207 = insertelement <4 x float> undef, float %196, i32 0
> >>  +  %208 = insertelement <4 x float> %207, float %199, i32 1
> >>  +  %209 = insertelement <4 x float> %208, float %202, i32 2
> >>  +  %210 = insertelement <4 x float> %209, float 0.000000e+00, i32 3
> >>  +  %211 = call float @llvm.AMDGPU.dp4(<4 x float> %206, <4 x float> %210)
> >>  +  %212 = call float @fabs(float %211)
> >>  +  %213 = call float @llvm.AMDGPU.rsq(float %212)
> >>  +  %214 = fmul float %196, %213
> >>  +  %215 = fmul float %199, %213
> >>  +  %216 = fmul float %202, %213
> >>  +  %217 = insertelement <4 x float> undef, float %185, i32 0
> >>  +  %218 = insertelement <4 x float> %217, float %189, i32 1
> >>  +  %219 = insertelement <4 x float> %218, float %193, i32 2
> >>  +  %220 = insertelement <4 x float> %219, float 0.000000e+00, i32 3
> >>  +  %221 = insertelement <4 x float> undef, float %185, i32 0
> >>  +  %222 = insertelement <4 x float> %221, float %189, i32 1
> >>  +  %223 = insertelement <4 x float> %222, float %193, i32 2
> >>  +  %224 = insertelement <4 x float> %223, float 0.000000e+00, i32 3
> >>  +  %225 = call float @llvm.AMDGPU.dp4(<4 x float> %220, <4 x float> %224)
> >>  +  %226 = call float @fabs(float %225)
> >>  +  %227 = call float @llvm.AMDGPU.rsq(float %226)
> >>  +  %228 = fmul float %185, %227
> >>  +  %229 = fmul float %189, %227
> >>  +  %230 = fmul float %193, %227
> >>  +  %231 = fsub float %214, %228
> >>  +  %232 = fsub float %215, %229
> >>  +  %233 = fsub float %216, %230
> >>  +  %234 = insertelement <4 x float> undef, float %231, i32 0
> >>  +  %235 = insertelement <4 x float> %234, float %232, i32 1
> >>  +  %236 = insertelement <4 x float> %235, float %233, i32 2
> >>  +  %237 = insertelement <4 x float> %236, float 0.000000e+00, i32 3
> >>  +  %238 = insertelement <4 x float> undef, float %231, i32 0
> >>  +  %239 = insertelement <4 x float> %238, float %232, i32 1
> >>  +  %240 = insertelement <4 x float> %239, float %233, i32 2
> >>  +  %241 = insertelement <4 x float> %240, float 0.000000e+00, i32 3
> >>  +  %242 = call float @llvm.AMDGPU.dp4(<4 x float> %237, <4 x float> %241)
> >>  +  %243 = call float @fabs(float %242)
> >>  +  %244 = call float @llvm.AMDGPU.rsq(float %243)
> >>  +  %245 = fmul float %231, %244
> >>  +  %246 = fmul float %232, %244
> >>  +  %247 = fmul float %233, %244
> >>  +  %248 = insertelement <4 x float> undef, float %128, i32 0
> >>  +  %249 = insertelement <4 x float> %248, float %129, i32 1
> >>  +  %250 = insertelement <4 x float> %249, float %130, i32 2
> >>  +  %251 = insertelement <4 x float> %250, float 0.000000e+00, i32 3
> >>  +  %252 = insertelement <4 x float> undef, float %214, i32 0
> >>  +  %253 = insertelement <4 x float> %252, float %215, i32 1
> >>  +  %254 = insertelement <4 x float> %253, float %216, i32 2
> >>  +  %255 = insertelement <4 x float> %254, float 0.000000e+00, i32 3
> >>  +  %256 = call float @llvm.AMDGPU.dp4(<4 x float> %251, <4 x float> %255)
> >>  +  %257 = insertelement <4 x float> undef, float %128, i32 0
> >>  +  %258 = insertelement <4 x float> %257, float %129, i32 1
> >>  +  %259 = insertelement <4 x float> %258, float %130, i32 2
> >>  +  %260 = insertelement <4 x float> %259, float 0.000000e+00, i32 3
> >>  +  %261 = insertelement <4 x float> undef, float %245, i32 0
> >>  +  %262 = insertelement <4 x float> %261, float %246, i32 1
> >>  +  %263 = insertelement <4 x float> %262, float %247, i32 2
> >>  +  %264 = insertelement <4 x float> %263, float 0.000000e+00, i32 3
> >>  +  %265 = call float @llvm.AMDGPU.dp4(<4 x float> %260, <4 x float> %264)
> >>  +  %266 = fcmp uge float %256, 0.000000e+00
> >>  +  %267 = select i1 %266, float %256, float 0.000000e+00
> >>  +  %268 = fcmp uge float %265, 0.000000e+00
> >>  +  %269 = select i1 %268, float %265, float 0.000000e+00
> >>  +  %270 = call float @llvm.pow.f32(float %269, float %132)
> >>  +  %271 = fcmp ult float %256, 0.000000e+00
> >>  +  %272 = select i1 %271, float 0.000000e+00, float %270
> >>  +  %273 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
> >>  +  %274 = extractelement <4 x float> %273, i32 0
> >>  +  %275 = fadd float %274, %134
> >>  +  %276 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
> >>  +  %277 = extractelement <4 x float> %276, i32 1
> >>  +  %278 = fadd float %277, %136
> >>  +  %279 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
> >>  +  %280 = extractelement <4 x float> %279, i32 2
> >>  +  %281 = fadd float %280, %138
> >>  +  %282 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
> >>  +  %283 = extractelement <4 x float> %282, i32 0
> >>  +  %284 = fmul float %267, %283
> >>  +  %285 = fadd float %284, %275
> >>  +  %286 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
> >>  +  %287 = extractelement <4 x float> %286, i32 1
> >>  +  %288 = fmul float %267, %287
> >>  +  %289 = fadd float %288, %278
> >>  +  %290 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
> >>  +  %291 = extractelement <4 x float> %290, i32 2
> >>  +  %292 = fmul float %267, %291
> >>  +  %293 = fadd float %292, %281
> >>  +  %294 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
> >>  +  %295 = extractelement <4 x float> %294, i32 0
> >>  +  %296 = fmul float %272, %295
> >>  +  %297 = fadd float %296, 0.000000e+00
> >>  +  %298 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
> >>  +  %299 = extractelement <4 x float> %298, i32 1
> >>  +  %300 = fmul float %272, %299
> >>  +  %301 = fadd float %300, 0.000000e+00
> >>  +  %302 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
> >>  +  %303 = extractelement <4 x float> %302, i32 2
> >>  +  %304 = fmul float %272, %303
> >>  +  %305 = fadd float %304, 0.000000e+00
> >>  +  %306 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
> >>  +  %307 = extractelement <4 x float> %306, i32 0
> >>  +  %308 = fsub float %307, %185
> >>  +  %309 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
> >>  +  %310 = extractelement <4 x float> %309, i32 1
> >>  +  %311 = fsub float %310, %189
> >>  +  %312 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
> >>  +  %313 = extractelement <4 x float> %312, i32 2
> >>  +  %314 = fsub float %313, %193
> >>  +  %315 = insertelement <4 x float> undef, float %308, i32 0
> >>  +  %316 = insertelement <4 x float> %315, float %311, i32 1
> >>  +  %317 = insertelement <4 x float> %316, float %314, i32 2
> >>  +  %318 = insertelement <4 x float> %317, float 0.000000e+00, i32 3
> >>  +  %319 = insertelement <4 x float> undef, float %308, i32 0
> >>  +  %320 = insertelement <4 x float> %319, float %311, i32 1
> >>  +  %321 = insertelement <4 x float> %320, float %314, i32 2
> >>  +  %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3
> >>  +  %323 = call float @llvm.AMDGPU.dp4(<4 x float> %318, <4 x float> %322)
> >>  +  %324 = call float @fabs(float %323)
> >>  +  %325 = call float @llvm.AMDGPU.rsq(float %324)
> >>  +  %326 = fmul float %308, %325
> >>  +  %327 = fmul float %311, %325
> >>  +  %328 = fmul float %314, %325
> >>  +  %329 = fsub float %326, %228
> >>  +  %330 = fsub float %327, %229
> >>  +  %331 = fsub float %328, %230
> >>  +  %332 = insertelement <4 x float> undef, float %329, i32 0
> >>  +  %333 = insertelement <4 x float> %332, float %330, i32 1
> >>  +  %334 = insertelement <4 x float> %333, float %331, i32 2
> >>  +  %335 = insertelement <4 x float> %334, float 0.000000e+00, i32 3
> >>  +  %336 = insertelement <4 x float> undef, float %329, i32 0
> >>  +  %337 = insertelement <4 x float> %336, float %330, i32 1
> >>  +  %338 = insertelement <4 x float> %337, float %331, i32 2
> >>  +  %339 = insertelement <4 x float> %338, float 0.000000e+00, i32 3
> >>  +  %340 = call float @llvm.AMDGPU.dp4(<4 x float> %335, <4 x float> %339)
> >>  +  %341 = call float @fabs(float %340)
> >>  +  %342 = call float @llvm.AMDGPU.rsq(float %341)
> >>  +  %343 = fmul float %329, %342
> >>  +  %344 = fmul float %330, %342
> >>  +  %345 = fmul float %331, %342
> >>  +  %346 = insertelement <4 x float> undef, float %128, i32 0
> >>  +  %347 = insertelement <4 x float> %346, float %129, i32 1
> >>  +  %348 = insertelement <4 x float> %347, float %130, i32 2
> >>  +  %349 = insertelement <4 x float> %348, float 0.000000e+00, i32 3
> >>  +  %350 = insertelement <4 x float> undef, float %326, i32 0
> >>  +  %351 = insertelement <4 x float> %350, float %327, i32 1
> >>  +  %352 = insertelement <4 x float> %351, float %328, i32 2
> >>  +  %353 = insertelement <4 x float> %352, float 0.000000e+00, i32 3
> >>  +  %354 = call float @llvm.AMDGPU.dp4(<4 x float> %349, <4 x float> %353)
> >>  +  %355 = insertelement <4 x float> undef, float %128, i32 0
> >>  +  %356 = insertelement <4 x float> %355, float %129, i32 1
> >>  +  %357 = insertelement <4 x float> %356, float %130, i32 2
> >>  +  %358 = insertelement <4 x float> %357, float 0.000000e+00, i32 3
> >>  +  %359 = insertelement <4 x float> undef, float %343, i32 0
> >>  +  %360 = insertelement <4 x float> %359, float %344, i32 1
> >>  +  %361 = insertelement <4 x float> %360, float %345, i32 2
> >>  +  %362 = insertelement <4 x float> %361, float 0.000000e+00, i32 3
> >>  +  %363 = call float @llvm.AMDGPU.dp4(<4 x float> %358, <4 x float> %362)
> >>  +  %364 = fcmp uge float %354, 0.000000e+00
> >>  +  %365 = select i1 %364, float %354, float 0.000000e+00
> >>  +  %366 = fcmp uge float %363, 0.000000e+00
> >>  +  %367 = select i1 %366, float %363, float 0.000000e+00
> >>  +  %368 = call float @llvm.pow.f32(float %367, float %132)
> >>  +  %369 = fcmp ult float %354, 0.000000e+00
> >>  +  %370 = select i1 %369, float 0.000000e+00, float %368
> >>  +  %371 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
> >>  +  %372 = extractelement <4 x float> %371, i32 0
> >>  +  %373 = fadd float %372, %285
> >>  +  %374 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
> >>  +  %375 = extractelement <4 x float> %374, i32 1
> >>  +  %376 = fadd float %375, %289
> >>  +  %377 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
> >>  +  %378 = extractelement <4 x float> %377, i32 2
> >>  +  %379 = fadd float %378, %293
> >>  +  %380 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
> >>  +  %381 = extractelement <4 x float> %380, i32 0
> >>  +  %382 = fmul float %365, %381
> >>  +  %383 = fadd float %382, %373
> >>  +  %384 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
> >>  +  %385 = extractelement <4 x float> %384, i32 1
> >>  +  %386 = fmul float %365, %385
> >>  +  %387 = fadd float %386, %376
> >>  +  %388 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
> >>  +  %389 = extractelement <4 x float> %388, i32 2
> >>  +  %390 = fmul float %365, %389
> >>  +  %391 = fadd float %390, %379
> >>  +  %392 = call float @llvm.AMDIL.clamp.(float %383, float 0.000000e+00, float 1.000000e+00)
> >>  +  %393 = call float @llvm.AMDIL.clamp.(float %387, float 0.000000e+00, float 1.000000e+00)
> >>  +  %394 = call float @llvm.AMDIL.clamp.(float %391, float 0.000000e+00, float 1.000000e+00)
> >>  +  %395 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
> >>  +  %396 = extractelement <4 x float> %395, i32 0
> >>  +  %397 = fmul float %370, %396
> >>  +  %398 = fadd float %397, %297
> >>  +  %399 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
> >>  +  %400 = extractelement <4 x float> %399, i32 1
> >>  +  %401 = fmul float %370, %400
> >>  +  %402 = fadd float %401, %301
> >>  +  %403 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
> >>  +  %404 = extractelement <4 x float> %403, i32 2
> >>  +  %405 = fmul float %370, %404
> >>  +  %406 = fadd float %405, %305
> >>  +  %407 = call float @llvm.AMDIL.clamp.(float %398, float 0.000000e+00, float 1.000000e+00)
> >>  +  %408 = call float @llvm.AMDIL.clamp.(float %402, float 0.000000e+00, float 1.000000e+00)
> >>  +  %409 = call float @llvm.AMDIL.clamp.(float %406, float 0.000000e+00, float 1.000000e+00)
> >>  +  %410 = insertelement <4 x float> undef, float %59, i32 0
> >>  +  %411 = insertelement <4 x float> %410, float %63, i32 1
> >>  +  %412 = insertelement <4 x float> %411, float %67, i32 2
> >>  +  %413 = insertelement <4 x float> %412, float %71, i32 3
> >>  +  call void @llvm.R600.store.swizzle(<4 x float> %413, i32 60, i32 1)
> >>  +  %414 = insertelement <4 x float> undef, float %392, i32 0
> >>  +  %415 = insertelement <4 x float> %414, float %393, i32 1
> >>  +  %416 = insertelement <4 x float> %415, float %394, i32 2
> >>  +  %417 = insertelement <4 x float> %416, float %144, i32 3
> >>  +  call void @llvm.R600.store.swizzle(<4 x float> %417, i32 0, i32 2)
> >>  +  %418 = insertelement <4 x float> undef, float %407, i32 0
> >>  +  %419 = insertelement <4 x float> %418, float %408, i32 1
> >>  +  %420 = insertelement <4 x float> %419, float %409, i32 2
> >>  +  %421 = insertelement <4 x float> %420, float %148, i32 3
> >>  +  call void @llvm.R600.store.swizzle(<4 x float> %421, i32 1, i32 2)
> >>  +  %422 = insertelement <4 x float> undef, float %8, i32 0
> >>  +  %423 = insertelement <4 x float> %422, float %9, i32 1
> >>  +  %424 = insertelement <4 x float> %423, float %10, i32 2
> >>  +  %425 = insertelement <4 x float> %424, float %11, i32 3
> >>  +  call void @llvm.R600.store.swizzle(<4 x float> %425, i32 2, i32 2)
> >>  +  ret void
> >>  +}
> >>  +
> >>  +; Function Attrs: readnone
> >>  +declare float @llvm.R600.load.input(i32) #1
> >>  +
> >>  +; Function Attrs: readnone
> >>  +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
> >>  +
> >>  +; Function Attrs: readonly
> >>  +declare float @fabs(float) #2
> >>  +
> >>  +; Function Attrs: readnone
> >>  +declare float @llvm.AMDGPU.rsq(float) #1
> >>  +
> >>  +; Function Attrs: readnone
> >>  +declare float @llvm.AMDIL.clamp.(float, float, float) #1
> >>  +
> >>  +; Function Attrs: nounwind readonly
> >>  +declare float @llvm.pow.f32(float, float) #3
> >>  +
> >>  +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
> >>  +
> >>  +attributes #0 = { "ShaderType"="1" }
> >>  +attributes #1 = { readnone }
> >>  +attributes #2 = { readonly }
> >>  +attributes #3 = { nounwind readonly }
> >>  -- 
> >>  1.8.2.1
> >> 
> > 
> >>  _______________________________________________
> >>  llvm-commits mailing list
> >>  llvm-commits at cs.uiuc.edu
> >>  http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> > 