R600 optimizations
Vincent Lejeune
vljn at ovi.com
Tue Jun 4 15:01:43 PDT 2013
I updated the patches.
Vincent
----- Original Message -----
> From: Tom Stellard <tom at stellard.net>
> To: Vincent Lejeune <vljn at ovi.com>
> Cc: "llvm-commits at cs.uiuc.edu" <llvm-commits at cs.uiuc.edu>
> Sent: Monday, June 3, 2013, 10:40 PM
> Subject: Re: R600 optimizations
>
> On Fri, May 31, 2013 at 02:59:59PM -0700, Vincent Lejeune wrote:
>> Hi,
>>
>> these patches improve code generation for the R600 backend.
>> The first one allows the dot4 instruction (a pseudo instruction representing
>> 4 dot4_eg/r600 instructions) to be folded with neg/abs/const instructions.
>> Many shaders use dot4 on constant-provided values (for clipping, for instance)
>> and should benefit from this optimization through a reduced number of COPYs.
>> The second and third ones use the ability of texture/export instructions to
>> swizzle their inputs, avoiding COPYs by removing duplication.
>> I'm working on adding lit test cases; the ones I'm using to test my
>> implementation are generated by Lightmark, and I doubt I can provide them.
>>
>>
>> Vincent
>
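For illustration, here is a minimal, self-contained C++ sketch of the folding idea behind the first patch (a hypothetical model with invented types: SrcOperand, Value and the ALU_CONST placeholder are not the SelectionDAG code quoted below): a dot4 source that is a negation, an absolute value, or a constant-buffer read is absorbed into the per-source NEG/ABS/SEL fields instead of being copied into a plain register first.

#include <iostream>
#include <optional>

constexpr int ALU_CONST = -1; // placeholder for "read the value from the constant cache"

enum class ValueKind { PlainReg, Neg, Abs, ConstBuffer };

struct Value {
  ValueKind Kind;
  int RegOrSel; // register id, or constant-buffer offset for ConstBuffer
};

struct SrcOperand {
  int Reg;          // register actually read by the instruction
  int Sel = 0;      // constant-buffer selector, meaningful when Reg == ALU_CONST
  bool Neg = false; // negate the source
  bool Abs = false; // take the absolute value of the source
};

// Returns the folded operand, or nothing if the value has to stay a plain
// register read (the case that previously forced an extra COPY).
std::optional<SrcOperand> foldOperand(const Value &V) {
  switch (V.Kind) {
  case ValueKind::Neg:
    return SrcOperand{V.RegOrSel, 0, /*Neg=*/true, /*Abs=*/false};
  case ValueKind::Abs:
    return SrcOperand{V.RegOrSel, 0, /*Neg=*/false, /*Abs=*/true};
  case ValueKind::ConstBuffer:
    return SrcOperand{ALU_CONST, /*Sel=*/V.RegOrSel, false, false};
  case ValueKind::PlainReg:
    return std::nullopt;
  }
  return std::nullopt;
}

int main() {
  Value NegOfReg2{ValueKind::Neg, 2};
  if (auto Folded = foldOperand(NegOfReg2))
    std::cout << "src reg " << Folded->Reg << ", neg=" << Folded->Neg << "\n";
}

In the patch itself the same decision is made per source of the DOT_4 node, and constant folds are additionally gated by TII->fitsConstReadLimitations().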
>> From 90832890b21525ac366653c6bb17ea9cdf3d1aba Mon Sep 17 00:00:00 2001
>> From: Vincent Lejeune <vljn at ovi.com>
>> Date: Wed, 22 May 2013 02:19:53 +0200
>> Subject: [PATCH 1/3] R600: Const/Neg/Abs can be folded to dot4
>>
>> ---
>> lib/Target/R600/AMDILISelDAGToDAG.cpp | 186 +++++++++++++++++++++-------
>> lib/Target/R600/R600EmitClauseMarkers.cpp | 6 +-
>> lib/Target/R600/R600ExpandSpecialInstrs.cpp | 4 +-
>> lib/Target/R600/R600InstrInfo.cpp | 35 ++++++
>> lib/Target/R600/R600InstrInfo.h | 2 +
>> 5 files changed, 186 insertions(+), 47 deletions(-)
>>
>
> Can you add a simple test case for this? It shouldn't be too hard.
>
>> diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp
> b/lib/Target/R600/AMDILISelDAGToDAG.cpp
>> index 00d7c8f..6083609 100644
>> --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
>> +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
>> @@ -48,7 +48,10 @@ public:
>>
>> private:
>> inline SDValue getSmallIPtrImm(unsigned Imm);
>> + bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg,
> SDValue &Abs,
>> + const R600InstrInfo *TII, std::vector<unsigned>
> Cst);
>> bool FoldOperands(unsigned, const R600InstrInfo *,
> std::vector<SDValue> &);
>> + bool FoldDotOperands(unsigned, const R600InstrInfo *,
> std::vector<SDValue> &);
>>
>> // Complex pattern selectors
>> bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
>> @@ -317,6 +320,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
>> if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
>> const R600InstrInfo *TII =
>> static_cast<const R600InstrInfo*>(TM.getInstrInfo());
>> + if (Result && Result->isMachineOpcode() &&
> Result->getMachineOpcode() == AMDGPU::DOT_4) {
>> + bool IsModified = false;
>> + do {
>> + std::vector<SDValue> Ops;
>> + for(SDNode::op_iterator I = Result->op_begin(), E =
> Result->op_end();
>> + I != E; ++I)
>> + Ops.push_back(*I);
>> + IsModified = FoldDotOperands(Result->getMachineOpcode(), TII,
> Ops);
>> + if (IsModified) {
>> + Result = CurDAG->UpdateNodeOperands(Result, Ops.data(),
> Ops.size());
>> + }
>> + } while (IsModified);
>> +
>> + }
>> if (Result && Result->isMachineOpcode() &&
>> !(TII->get(Result->getMachineOpcode()).TSFlags &
> R600_InstFlag::VECTOR)
>> && TII->isALUInstr(Result->getMachineOpcode())) {
>> @@ -359,6 +376,43 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
>> return Result;
>> }
>>
>> +bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel,
> SDValue &Neg,
>> + SDValue &Abs, const R600InstrInfo
> *TII,
>> + std::vector<unsigned> Consts) {
>> + switch (Src.getOpcode()) {
>> + case AMDGPUISD::CONST_ADDRESS: {
>> + SDValue CstOffset;
>> + if (Src.getValueType().isVector() ||
>> + !SelectGlobalValueConstantOffset(Src.getOperand(0), CstOffset))
>> + return false;
>> +
>> + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
>> + Consts.push_back(Cst->getZExtValue());
>> + if (!TII->fitsConstReadLimitations(Consts))
>> + return false;
>> +
>> + Src = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
>> + Sel = CstOffset;
>> + return true;
>> + }
>> + case ISD::FNEG:
>> + Src = Src.getOperand(0);
>> + Neg = CurDAG->getTargetConstant(1, MVT::i32);
>> + return true;
>> + case ISD::FABS:
>> + if (!Abs.getNode())
>> + return false;
>> + Src = Src.getOperand(0);
>> + Abs = CurDAG->getTargetConstant(1, MVT::i32);
>> + return true;
>> + case ISD::BITCAST:
>> + Src = Src.getOperand(0);
>> + return true;
>> + default:
>> + return false;
>> + }
>> +}
>> +
>> bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
>> const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
>> int OperandIdx[] = {
>> @@ -382,59 +436,101 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned
> Opcode,
>> -1
>> };
>>
>> + // Gather constant values
>> + std::vector<unsigned> Consts;
>> + for (unsigned j = 0; j < 3; j++) {
>> + int SrcIdx = OperandIdx[j];
>> + if (SrcIdx < 0)
>> + break;
>> + if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx -
> 1])) {
>> + if (Reg->getReg() == AMDGPU::ALU_CONST) {
>> + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j]
> - 1]);
>> + Consts.push_back(Cst->getZExtValue());
>> + }
>> + }
>> + }
>> +
>> for (unsigned i = 0; i < 3; i++) {
>> if (OperandIdx[i] < 0)
>> return false;
>> - SDValue Operand = Ops[OperandIdx[i] - 1];
>> - switch (Operand.getOpcode()) {
>> - case AMDGPUISD::CONST_ADDRESS: {
>> - SDValue CstOffset;
>> - if (Operand.getValueType().isVector() ||
>> - !SelectGlobalValueConstantOffset(Operand.getOperand(0),
> CstOffset))
>> - break;
>> -
>> - // Gather others constants values
>> - std::vector<unsigned> Consts;
>> - for (unsigned j = 0; j < 3; j++) {
>> - int SrcIdx = OperandIdx[j];
>> - if (SrcIdx < 0)
>> - break;
>> - if (RegisterSDNode *Reg =
> dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
>> - if (Reg->getReg() == AMDGPU::ALU_CONST) {
>> - ConstantSDNode *Cst =
> dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
>> - Consts.push_back(Cst->getZExtValue());
>> - }
>> - }
>> - }
>> + SDValue &Src = Ops[OperandIdx[i] - 1];
>> + SDValue &Sel = Ops[SelIdx[i] - 1];
>> + SDValue &Neg = Ops[NegIdx[i] - 1];
>> + SDValue FakeAbs;
>> + SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
>> + if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
>> + return true;
>> + }
>> + return false;
>> +}
>>
>> - ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
>> - Consts.push_back(Cst->getZExtValue());
>> - if (!TII->fitsConstReadLimitations(Consts))
>> - break;
>> +bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode,
>> + const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
>> + int OperandIdx[] = {
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_W),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_W)
>> + };
>> + int SelIdx[] = {
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL_W),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL_W)
>> + };
>> + int NegIdx[] = {
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG_W),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG_W)
>> + };
>> + int AbsIdx[] = {
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS_W),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_X),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Y),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_Z),
>> + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS_W)
>> + };
>>
>> - Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST,
> MVT::f32);
>> - Ops[SelIdx[i] - 1] = CstOffset;
>> - return true;
>> - }
>> - case ISD::FNEG:
>> - if (NegIdx[i] < 0)
>> - break;
>> - Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
>> - Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
>> - return true;
>> - case ISD::FABS:
>> - if (AbsIdx[i] < 0)
>> - break;
>> - Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
>> - Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
>> - return true;
>> - case ISD::BITCAST:
>> - Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
>> - return true;
>> - default:
>> + // Gather constant values
>> + std::vector<unsigned> Consts;
>> + for (unsigned j = 0; j < 8; j++) {
>> + int SrcIdx = OperandIdx[j];
>> + if (SrcIdx < 0)
>> break;
>> + if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx -
> 1])) {
>> + if (Reg->getReg() == AMDGPU::ALU_CONST) {
>> + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j]
> - 1]);
>> + Consts.push_back(Cst->getZExtValue());
>> + }
>> }
>> }
>> +
>> + for (unsigned i = 0; i < 8; i++) {
>> + if (OperandIdx[i] < 0)
>> + return false;
>> + SDValue &Src = Ops[OperandIdx[i] - 1];
>> + SDValue &Sel = Ops[SelIdx[i] - 1];
>> + SDValue &Neg = Ops[NegIdx[i] - 1];
>> + SDValue &Abs = Ops[AbsIdx[i] - 1];
>> + if (FoldOperand(Src, Sel, Neg, Abs, TII, Consts))
>> + return true;
>> + }
>> return false;
>> }
>>
>> diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp
> b/lib/Target/R600/R600EmitClauseMarkers.cpp
>> index ecfcfeb..c9d8ed1 100644
>> --- a/lib/Target/R600/R600EmitClauseMarkers.cpp
>> +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
>> @@ -108,7 +108,8 @@ private:
>> std::vector<std::pair<unsigned, unsigned> > UsedKCache;
>> const SmallVector<std::pair<MachineOperand *, int64_t>, 3>
> &Consts =
>> TII->getSrcs(MI);
>> - assert(TII->isALUInstr(MI->getOpcode()) &&
> "Can't assign Const");
>> + assert((TII->isALUInstr(MI->getOpcode()) ||
>> + MI->getOpcode() == AMDGPU::DOT_4) && "Can't
> assign Const");
>> for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
>> if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
>> continue;
>> @@ -183,6 +184,9 @@ private:
>> if (TII->isALUInstr(I->getOpcode()) &&
>> !SubstituteKCacheBank(I, KCacheBanks))
>> break;
>> + if (I->getOpcode() == AMDGPU::DOT_4 &&
>> + !SubstituteKCacheBank(I, KCacheBanks))
>> + break;
>> AluInstCount += OccupiedDwords(I);
>> }
>> unsigned Opcode = PushBeforeModifier ?
>> diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
> b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
>> index b9d5303..072ae3a 100644
>> --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
>> +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
>> @@ -214,7 +214,9 @@ bool
> R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
>> .getReg();
>> (void) Src0;
>> (void) Src1;
>> - assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
>> + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
>> + (TRI.getEncodingValue(Src1) & 0xff) < 127)
>> + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
>> }
>> MI.eraseFromParent();
>> continue;
>> diff --git a/lib/Target/R600/R600InstrInfo.cpp
> b/lib/Target/R600/R600InstrInfo.cpp
>> index 5f8486d..2a4a245 100644
>> --- a/lib/Target/R600/R600InstrInfo.cpp
>> +++ b/lib/Target/R600/R600InstrInfo.cpp
>> @@ -169,6 +169,31 @@ SmallVector<std::pair<MachineOperand *,
> int64_t>, 3>
>> R600InstrInfo::getSrcs(MachineInstr *MI) const {
>> SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
>>
>> + if (MI->getOpcode() == AMDGPU::DOT_4) {
>> + static const R600Operands::VecOps OpTable[8][2] = {
>> + {R600Operands::SRC0_X, R600Operands::SRC0_SEL_X},
>> + {R600Operands::SRC0_Y, R600Operands::SRC0_SEL_Y},
>> + {R600Operands::SRC0_Z, R600Operands::SRC0_SEL_Z},
>> + {R600Operands::SRC0_W, R600Operands::SRC0_SEL_W},
>> + {R600Operands::SRC1_X, R600Operands::SRC1_SEL_X},
>> + {R600Operands::SRC1_Y, R600Operands::SRC1_SEL_Y},
>> + {R600Operands::SRC1_Z, R600Operands::SRC1_SEL_Z},
>> + {R600Operands::SRC1_W, R600Operands::SRC1_SEL_W},
>> + };
>> +
>> + for (unsigned j = 0; j < 8; j++) {
>> + MachineOperand &MO = MI->getOperand(OpTable[j][0] + 1);
>> + unsigned Reg = MO.getReg();
>> + if (Reg == AMDGPU::ALU_CONST) {
>> + unsigned Sel = MI->getOperand(OpTable[j][1] + 1).getImm();
>> + Result.push_back(std::pair<MachineOperand *,
> int64_t>(&MO, Sel));
>> + continue;
>> + }
>> +
>> + }
>> + return Result;
>> + }
>> +
>> static const R600Operands::Ops OpTable[3][2] = {
>> {R600Operands::SRC0, R600Operands::SRC0_SEL},
>> {R600Operands::SRC1, R600Operands::SRC1_SEL},
>> @@ -967,6 +992,11 @@ int R600InstrInfo::getOperandIdx(const MachineInstr
> &MI,
>> return getOperandIdx(MI.getOpcode(), Op);
>> }
>>
>> +int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
>> + R600Operands::VecOps Op) const {
>> + return getOperandIdx(MI.getOpcode(), Op);
>> +}
>> +
>> int R600InstrInfo::getOperandIdx(unsigned Opcode,
>> R600Operands::Ops Op) const {
>> unsigned TargetFlags = get(Opcode).TSFlags;
>> @@ -997,6 +1027,11 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode,
>> return R600Operands::ALUOpTable[OpTableIdx][Op];
>> }
>>
>> +int R600InstrInfo::getOperandIdx(unsigned Opcode,
>> + R600Operands::VecOps Op) const {
>> + return Op + 1;
>> +}
>> +
>> void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
>> int64_t Imm) const {
>> int Idx = getOperandIdx(*MI, Op);
>> diff --git a/lib/Target/R600/R600InstrInfo.h
> b/lib/Target/R600/R600InstrInfo.h
>> index f9ccf4f..afc24e2 100644
>> --- a/lib/Target/R600/R600InstrInfo.h
>> +++ b/lib/Target/R600/R600InstrInfo.h
>> @@ -212,11 +212,13 @@ namespace llvm {
>> ///
>> /// \returns -1 if the Instruction does not contain the specified
> \p Op.
>> int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op)
> const;
>> + int getOperandIdx(const MachineInstr &MI, R600Operands::VecOps Op)
> const;
>>
>> /// \brief Get the index of \p Op for the given Opcode.
>> ///
>> /// \returns -1 if the Instruction does not contain the specified
> \p Op.
>> int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
>> + int getOperandIdx(unsigned Opcode, R600Operands::VecOps Op) const;
>>
>> /// \brief Helper function for setting instruction flag values.
>> void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm)
> const;
>> --
>> 1.8.2.1
>>
>
>> From f832ad1a86472206640cbf1f79c06804cbcf3177 Mon Sep 17 00:00:00 2001
>> From: Vincent Lejeune <vljn at ovi.com>
>> Date: Sun, 26 May 2013 18:51:20 +0200
>> Subject: [PATCH 3/3] R600: Add a pass that merge Vector Register
>>
>
> Could you write a small test case for this as well?
>
>> ---
>> lib/Target/R600/AMDGPU.h | 1 +
>> lib/Target/R600/AMDGPUTargetMachine.cpp | 5 +
>> lib/Target/R600/CMakeLists.txt | 1 +
>> lib/Target/R600/R600OptimizeVectorRegisters.cpp | 336 ++++++++++++++++++++++++
>> 4 files changed, 343 insertions(+)
>> create mode 100644 lib/Target/R600/R600OptimizeVectorRegisters.cpp
>>
>> diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
>> index f9d70c9..d3ef426 100644
>> --- a/lib/Target/R600/AMDGPU.h
>> +++ b/lib/Target/R600/AMDGPU.h
>> @@ -23,6 +23,7 @@ class AMDGPUTargetMachine;
>> // R600 Passes
>> FunctionPass* createR600TextureIntrinsicsReplacer();
>> FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
>> +FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
>> FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
>> FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
>> FunctionPass *createR600Packetizer(TargetMachine &tm);
>> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp
> b/lib/Target/R600/AMDGPUTargetMachine.cpp
>> index 88dc583..c52af56 100644
>> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
>> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
>> @@ -130,6 +130,11 @@ bool AMDGPUPassConfig::addInstSelector() {
>>
>> bool AMDGPUPassConfig::addPreRegAlloc() {
>> addPass(createAMDGPUConvertToISAPass(*TM));
>> + const AMDGPUSubtarget &ST =
> TM->getSubtarget<AMDGPUSubtarget>();
>> +
>> + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
>> + addPass(createR600VectorRegMerger(*TM));
>> + }
>> return false;
>> }
>>
>> diff --git a/lib/Target/R600/CMakeLists.txt
> b/lib/Target/R600/CMakeLists.txt
>> index c5ce9dc..558d001 100644
>> --- a/lib/Target/R600/CMakeLists.txt
>> +++ b/lib/Target/R600/CMakeLists.txt
>> @@ -41,6 +41,7 @@ add_llvm_target(R600CodeGen
>> R600ISelLowering.cpp
>> R600MachineFunctionInfo.cpp
>> R600MachineScheduler.cpp
>> + R600OptimizeVectorRegisters.cpp
>> R600Packetizer.cpp
>> R600RegisterInfo.cpp
>> R600TextureIntrinsicsReplacer.cpp
>> diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
> b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
>> new file mode 100644
>> index 0000000..387799f
>> --- /dev/null
>> +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
>> @@ -0,0 +1,336 @@
>> +//===--------------------- R600OptimizeVectorRegisters.cpp ---------------===//
>> +//
>> +// The LLVM Compiler Infrastructure
>> +//
>> +// This file is distributed under the University of Illinois Open Source
>> +// License. See LICENSE.TXT for details.
>> +//
>> +//===----------------------------------------------------------------------===//
>> +//
>> +/// \file
>> +/// This pass merges the inputs of swizzleable instructions into vectors that
>> +/// share common data and/or have enough undef subregs, using the
>> +/// instructions' swizzle abilities.
>> +///
>> +/// For instance, consider the following pseudo code:
>> +/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
>> +/// ...
>> +/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
>> +/// (swizzleable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
>> +///
>> +/// which is turned into:
>> +/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
>> +/// ...
>> +/// vreg7<def> = INSERT_SUBREG vreg4, sub3
>> +/// (swizzleable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
>> +///
>> +/// This allows regalloc to reduce register pressure for vector registers and
>> +/// to reduce the MOV count.
>> +//===----------------------------------------------------------------------===//
>> +
>> +#define DEBUG_TYPE "vec-merger"
>> +#include "llvm/Support/Debug.h"
>> +#include "AMDGPU.h"
>> +#include "R600InstrInfo.h"
>> +#include "llvm/CodeGen/DFAPacketizer.h"
>> +#include "llvm/CodeGen/MachineDominators.h"
>> +#include "llvm/CodeGen/MachineFunctionPass.h"
>> +#include "llvm/CodeGen/MachineLoopInfo.h"
>> +#include "llvm/CodeGen/Passes.h"
>> +#include "llvm/CodeGen/MachineInstrBuilder.h"
>> +#include "llvm/Support/raw_ostream.h"
>> +#include "llvm/CodeGen/MachineRegisterInfo.h"
>> +
>> +using namespace llvm;
>> +
>> +namespace {
>> +
>> +static bool
>> +isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
>> + for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
>> + E = MRI.def_end(); It != E; ++It) {
>> + return (*It).isImplicitDef();
>> + }
>> + llvm_unreachable("Reg without a def");
>> + return false;
>> +}
>> +
>> +class RegSeqInfo {
>> +public:
>> + MachineInstr *Instr;
>> + DenseMap<unsigned, unsigned> RegToChan;
>> + std::vector<unsigned> UndefReg;
>> + RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
>> + assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
>> + for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
>> + MachineOperand &MO = Instr->getOperand(i);
>> + unsigned Chan = Instr->getOperand(i + 1).getImm();
>> + if (isImplicitlyDef(MRI, MO.getReg()))
>> + UndefReg.push_back(Chan);
>> + else
>> + RegToChan[MO.getReg()] = Chan;
>> + }
>> + }
>> + RegSeqInfo() {}
>> +
>> + bool operator==(const RegSeqInfo &RSI) const {
>> + return RSI.Instr == Instr;
>> + }
>> +};
>> +
>> +class R600VectorRegMerger : public MachineFunctionPass {
>> +private:
>> + const R600InstrInfo *TII;
>> + bool canSwizzle(const MachineInstr &) const;
>> + void SwizzleInput(MachineInstr &,
>> + const std::vector<std::pair<unsigned, unsigned> > &)
> const;
>> + bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
>> + std::vector<std::pair<unsigned, unsigned> > &Remap)
> const;
>> + MachineInstr *RebuildVector(MachineRegisterInfo &MRI, RegSeqInfo
> *MI,
>> + const RegSeqInfo *BaseVec,
>> + const std::vector<std::pair<unsigned, unsigned> >
> &RemapChan) const;
>> + void RemoveMI(MachineInstr *);
>> +
>> + typedef DenseMap<unsigned, std::vector<MachineInstr *> >
> InstructionSetMap;
>> + DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
>> + InstructionSetMap PreviousRegSeqByReg;
>> + InstructionSetMap PreviousRegSeqByUndefCount;
>> +public:
>> + static char ID;
>> + R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
>> + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
>> +
>> + void getAnalysisUsage(AnalysisUsage &AU) const {
>> + AU.setPreservesCFG();
>> + AU.addRequired<MachineDominatorTree>();
>> + AU.addPreserved<MachineDominatorTree>();
>> + AU.addRequired<MachineLoopInfo>();
>> + AU.addPreserved<MachineLoopInfo>();
>> + MachineFunctionPass::getAnalysisUsage(AU);
>> + }
>> +
>> + const char *getPassName() const {
>> + return "R600 Vector Registers Merge Pass";
>> + }
>> +
>> + bool runOnMachineFunction(MachineFunction &Fn);
>> +};
>> +
>> +char R600VectorRegMerger::ID = 0;
>> +
>> +bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
>> + const {
>> + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
>> + return true;
>> + switch (MI.getOpcode()) {
>> + case AMDGPU::R600_ExportSwz:
>> + case AMDGPU::EG_ExportSwz:
>> + return true;
>> + default:
>> + return false;
>> + }
>> +}
>> +
>> +bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
>> + RegSeqInfo *ToMerge, std::vector< std::pair<unsigned,
> unsigned> > &Remap)
>> + const {
>> + unsigned CurrentUndexIdx = 0;
>> + for (DenseMap<unsigned, unsigned>::iterator It =
> ToMerge->RegToChan.begin(),
>> + E = ToMerge->RegToChan.end(); It != E; ++It) {
>> + DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
>> + Untouched->RegToChan.find((*It).first);
>> + if (PosInUntouched != Untouched->RegToChan.end()) {
>> + Remap.push_back(std::pair<unsigned, unsigned>
>> + ((*It).second, (*PosInUntouched).second));
>> + continue;
>> + }
>> + if (CurrentUndexIdx >= Untouched->UndefReg.size())
>> + return false;
>> + Remap.push_back(std::pair<unsigned, unsigned>
>> + ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
>> + }
>> +
>> + return true;
>> +}
>> +
>> +MachineInstr *R600VectorRegMerger::RebuildVector(MachineRegisterInfo
> &MRI,
>> +RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
>> +const std::vector<std::pair<unsigned, unsigned> >
> &RemapChan) const {
>
> Indentation looks strange here
>
>> + MachineBasicBlock::iterator Pos = RSI->Instr;
>> + MachineBasicBlock &MBB = *Pos->getParent();
>> + DebugLoc DL = Pos->getDebugLoc();
>> +
>> + unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
>> + DenseMap<unsigned, unsigned> UpdatedRegToChan =
> BaseRSI->RegToChan;
>> + std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
>> + DEBUG(
>> + dbgs() << "Turning ";
>> + RSI->Instr->dump();
>> + dbgs() << " into ";
>> + );
>> + for (DenseMap<unsigned, unsigned>::iterator It =
> RSI->RegToChan.begin(),
>> + E = RSI->RegToChan.end(); It != E; ++It) {
>> + if (BaseRSI->RegToChan.find((*It).first) !=
> BaseRSI->RegToChan.end()) {
>> + UpdatedRegToChan[(*It).first] = (*It).second;
>> + continue;
>> + }
>> + unsigned DstReg =
> MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
>> + unsigned SubReg = (*It).first;
>> + unsigned Swizzle = (*It).second;
>> + unsigned Chan;
>> + for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
>> + if (RemapChan[j].first == Swizzle) {
>> + Chan = RemapChan[j].second;
>> + break;
>> + }
>> + }
>> + MachineInstr *Tmp = BuildMI(MBB, Pos, DL,
> TII->get(AMDGPU::INSERT_SUBREG),
>> + DstReg)
>> + .addReg(SrcVec)
>> + .addReg(SubReg)
>> + .addImm(Chan);
>> + UpdatedRegToChan[SubReg] = Chan;
>> + UpdatedUndef.erase(
>> + std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan));
>> + DEBUG(
>> + Tmp->dump();
>> + );
>> + SrcVec = DstReg;
>> + }
>> + Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY),
>> + RSI->Instr->getOperand(0).getReg())
>> + .addReg(SrcVec);
>> + DEBUG(
>> + Pos->dump();
>> + dbgs() << "-----------\n";
>> + );
>> + RSI->Instr = Pos;
>> + RSI->RegToChan = UpdatedRegToChan;
>> + RSI->UndefReg = UpdatedUndef;
>> +
>> + return Pos;
>> +}
>> +
>> +void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
>> + for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
>> + E = PreviousRegSeqByReg.end(); It != E; ++It) {
>> + std::vector<MachineInstr *> &MIs = (*It).second;
>> + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
>> + }
>> + for (InstructionSetMap::iterator It =
> PreviousRegSeqByUndefCount.begin(),
>> + E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
>> + std::vector<MachineInstr *> &MIs = (*It).second;
>> + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
>> + }
>> +}
>> +
>> +void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
>> + const std::vector<std::pair<unsigned, unsigned> >
> &RemapChan) const {
>> + unsigned Offset;
>> + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
>> + Offset = 2;
>> + else
>> + Offset = 3;
>> + for (unsigned i = 0; i < 4; i++) {
>> + unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
>> + for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
>> + if (RemapChan[j].first == Swizzle) {
>> + MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
>> + break;
>> + }
>> + }
>> + }
>> +}
>> +
>> +bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
>> + MachineRegisterInfo &MRI = Fn.getRegInfo();
>> + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
>> + MBB != MBBe; ++MBB) {
>> + MachineBasicBlock *MB = MBB;
>> + PreviousRegSeq.clear();
>> + PreviousRegSeqByReg.clear();
>> + PreviousRegSeqByUndefCount.clear();
>> +
>> + for (MachineBasicBlock::iterator MII = MB->begin(), MIIE =
> MB->end();
>> + MII != MIIE; ++MII) {
>> + MachineInstr *MI = MII;
>> + if (MI->getOpcode() != AMDGPU::REG_SEQUENCE)
>> + continue;
>> +
>> + RegSeqInfo &RSI = PreviousRegSeq[MI] = RegSeqInfo(MRI, MI);
>> +
>> + // Are all uses of MI swizzleable?
>> + unsigned Reg = MI->getOperand(0).getReg();
>> + bool AllUseAreSwizzeable = true;
>> + for (MachineRegisterInfo::use_iterator It = MRI.use_begin(Reg),
>> + E = MRI.use_end(); It != E; ++It) {
>> + if (!canSwizzle(*It)) {
>> + AllUseAreSwizzeable = false;
>> + break;
>> + }
>> + }
>> + if (!AllUseAreSwizzeable)
>> + continue;
>> +
>> + bool Merged = false;
>> + for (MachineInstr::mop_iterator MOp = MI->operands_begin(),
>> + MOE = MI->operands_end(); MOp != MOE; ++MOp) {
>> + if (!MOp->isReg())
>> + continue;
>> + if (PreviousRegSeqByReg[MOp->getReg()].empty())
>> + continue;
>> + std::vector<MachineInstr *> MIs =
> PreviousRegSeqByReg[MOp->getReg()];
>> + for (unsigned i = 0, e = MIs.size(); i < e; i++) {
>> + RegSeqInfo &CandidateRSI = PreviousRegSeq[MIs[i]];
>> + if (RSI == CandidateRSI)
>> + continue;
>> + std::vector<std::pair<unsigned, unsigned> >
> RemapChan;
>> + if (!tryMergeVector(&CandidateRSI, &RSI, RemapChan))
>> + continue;
>> + MII = RebuildVector(MRI, &RSI, &CandidateRSI,
> RemapChan);
>> + RemoveMI(CandidateRSI.Instr);
>> + MI->eraseFromParent();
>> + for (MachineRegisterInfo::use_iterator It = MRI.use_begin(Reg),
>> + E = MRI.use_end(); It != E; ++It) {
>> + SwizzleInput(*It, RemapChan);
>> + }
>> + Merged = true;
>> + break;
>> + }
>> + if (Merged)
>> + break;
>> + }
>> + if (Merged)
>> + continue;
>> + unsigned NeededUndefs = 4 - RSI.UndefReg.size();
>> + if (!PreviousRegSeqByUndefCount[NeededUndefs].empty()) {
>> + std::vector<MachineInstr *> &MIs =
>> + PreviousRegSeqByUndefCount[NeededUndefs];
>> + RegSeqInfo &PriorRSI = PreviousRegSeq[MIs.back()];
>> + std::vector<std::pair<unsigned, unsigned> > RemapChan;
>> + tryMergeVector(&PriorRSI, &RSI, RemapChan);
>> + MII = RebuildVector(MRI, &RSI, &PriorRSI, RemapChan);
>> + RemoveMI(PriorRSI.Instr);
>> + MI->eraseFromParent();
>> + for (MachineRegisterInfo::use_iterator It = MRI.use_begin(Reg),
>> + E = MRI.use_end(); It != E; ++It) {
>> + SwizzleInput(*It, RemapChan);
>> + }
>> + continue;
>> + }
>> + // Failed to merge
>> + for (DenseMap<unsigned, unsigned>::const_iterator
>> + It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It)
> {
>> + PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
>> + }
>> +
> PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
>> + }
>> + }
>> + return false;
>> +}
>> +
>> +}
>> +
>> +llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm)
> {
>> + return new R600VectorRegMerger(tm);
>> +}
>> +
>> --
>> 1.8.2.1
>>
>
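As a side note on the vector-merger pass above, here is a minimal standalone C++ model of its channel-remapping step (a simplified sketch with invented data structures, not the pass itself): registers already present in the base REG_SEQUENCE keep their channel, the remaining registers are assigned to the base vector's undef channels, and the merge fails when there are not enough free slots.

#include <iostream>
#include <map>
#include <utility>
#include <vector>

struct RegSeq {
  std::map<int, int> RegToChan; // virtual register -> channel (0..3)
  std::vector<int> UndefChans;  // channels with no defined value
};

// On success, fills Remap with (channel in ToMerge) -> (channel in Base) pairs.
bool tryMergeVector(const RegSeq &Base, const RegSeq &ToMerge,
                    std::vector<std::pair<int, int>> &Remap) {
  unsigned NextUndef = 0;
  for (const auto &[Reg, Chan] : ToMerge.RegToChan) {
    auto It = Base.RegToChan.find(Reg);
    if (It != Base.RegToChan.end())
      Remap.push_back({Chan, It->second});                   // shared value: reuse its channel
    else if (NextUndef < Base.UndefChans.size())
      Remap.push_back({Chan, Base.UndefChans[NextUndef++]}); // fill an undef slot
    else
      return false;                                          // not enough free channels
  }
  return true;
}

int main() {
  // Base vector: vreg1 in X, vreg2 in Y, vreg3 in Z, W undef.
  RegSeq Base{{{1, 0}, {2, 1}, {3, 2}}, {3}};
  // Vector to merge: vreg1 in X, vreg3 in Y, vreg4 in W, Z undef.
  RegSeq ToMerge{{{1, 0}, {3, 1}, {4, 3}}, {2}};
  std::vector<std::pair<int, int>> Remap;
  if (tryMergeVector(Base, ToMerge, Remap))
    for (auto [From, To] : Remap)
      std::cout << "channel " << From << " -> " << To << "\n";
}

In the pass, a successful remap then drives the INSERT_SUBREG rebuild and the swizzle rewrite of every user of the merged vector.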
>> From 8ab5368eb96cacdd86d02692ef2637157aff85b3 Mon Sep 17 00:00:00 2001
>> From: Vincent Lejeune <vljn at ovi.com>
>> Date: Thu, 23 May 2013 00:44:09 +0200
>> Subject: [PATCH 2/3] R600: Swizzle texture/export instructions
>>
>
> Reviewed-by: Tom Stellard <tom at stellard.net>
>
>> ---
>> lib/Target/R600/R600ISelLowering.cpp | 145 ++++++++++++++++++++++++++++++-----
>> lib/Target/R600/R600ISelLowering.h | 1 +
>> test/CodeGen/R600/llvm.AMDGPU.tex.ll | 10 +--
>> 3 files changed, 131 insertions(+), 25 deletions(-)
>>
>> diff --git a/lib/Target/R600/R600ISelLowering.cpp
> b/lib/Target/R600/R600ISelLowering.cpp
>> index 00adca3..267f367 100644
>> --- a/lib/Target/R600/R600ISelLowering.cpp
>> +++ b/lib/Target/R600/R600ISelLowering.cpp
>> @@ -1210,6 +1210,99 @@ EVT
> R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
>> return VT.changeVectorElementTypeToInteger();
>> }
>>
>> +SDValue CompactSwizzlableVector(SelectionDAG &DAG, SDValue
> VectorEntry,
>> + DenseMap<unsigned, unsigned>
> &RemapSwizzle) {
>> + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
>> + assert(RemapSwizzle.empty());
>> + SDValue NewBldVec[4] = {
>> + VectorEntry.getOperand(0),
>> + VectorEntry.getOperand(1),
>> + VectorEntry.getOperand(2),
>> + VectorEntry.getOperand(3)
>> + };
>> +
>> + for (unsigned i = 0; i < 4; i++) {
>> + if (ConstantFPSDNode *C =
> dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
>> + if (C->isZero()) {
>> + RemapSwizzle[i] = 4; // SEL_0
>> + NewBldVec[i] = DAG.getUNDEF(MVT::f32);
>> + } else if (C->isExactlyValue(1.0)) {
>> + RemapSwizzle[i] = 5; // SEL_1
>> + NewBldVec[i] = DAG.getUNDEF(MVT::f32);
>> + }
>> + }
>> +
>> + if (NewBldVec[i].getOpcode() == ISD::UNDEF)
>> + continue;
>> + for (unsigned j = 0; j < i; j++) {
>> + if (NewBldVec[i] == NewBldVec[j]) {
>> + NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
>> + RemapSwizzle[i] = j;
>> + break;
>> + }
>> + }
>> + }
>> +
>> + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
>> + VectorEntry.getValueType(), NewBldVec, 4);
>> +}
>> +
>> +SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
>> + DenseMap<unsigned, unsigned>
> &RemapSwizzle) {
>> + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
>> + assert(RemapSwizzle.empty());
>> + SDValue NewBldVec[4] = {
>> + VectorEntry.getOperand(0),
>> + VectorEntry.getOperand(1),
>> + VectorEntry.getOperand(2),
>> + VectorEntry.getOperand(3)
>> + };
>> + bool isUnmovable[4] = { false, false, false, false };
>> +
>> + for (unsigned i = 0; i < 4; i++) {
>> + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
>> + unsigned Idx =
> dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
>> + ->getZExtValue();
>> + if (!isUnmovable[Idx]) {
>> + // Swap i and Idx
>> + std::swap(NewBldVec[Idx], NewBldVec[i]);
>> + RemapSwizzle[Idx] = i;
>> + RemapSwizzle[i] = Idx;
>> + }
>> + isUnmovable[Idx] = true;
>> + }
>> + }
>> +
>> + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
>> + VectorEntry.getValueType(), NewBldVec, 4);
>> +}
>> +
>> +
>> +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
>> +SDValue Swz[4], SelectionDAG &DAG) const {
>> + assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
>> + // Old -> New swizzle values
>> + DenseMap<unsigned, unsigned> SwizzleRemap;
>> +
>> + BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
>> + for (unsigned i = 0; i < 4; i++) {
>> + unsigned Idx =
> dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
>> + if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
>> + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
>> + }
>> +
>> + SwizzleRemap.clear();
>> + BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
>> + for (unsigned i = 0; i < 4; i++) {
>> + unsigned Idx =
> dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
>> + if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
>> + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
>> + }
>> +
>> + return BuildVector;
>> +}
>> +
>> +
>>
> //===----------------------------------------------------------------------===//
>> // Custom DAG Optimizations
>>
> //===----------------------------------------------------------------------===//
>> @@ -1319,12 +1412,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode
> *N,
>> SDValue Arg = N->getOperand(1);
>> if (Arg.getOpcode() != ISD::BUILD_VECTOR)
>> break;
>> - SDValue NewBldVec[4] = {
>> - DAG.getUNDEF(MVT::f32),
>> - DAG.getUNDEF(MVT::f32),
>> - DAG.getUNDEF(MVT::f32),
>> - DAG.getUNDEF(MVT::f32)
>> - };
>> +
>> SDValue NewArgs[8] = {
>> N->getOperand(0), // Chain
>> SDValue(),
>> @@ -1335,23 +1423,40 @@ SDValue
> R600TargetLowering::PerformDAGCombine(SDNode *N,
>> N->getOperand(6), // SWZ_Z
>> N->getOperand(7) // SWZ_W
>> };
>> - for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
>> - if (ConstantFPSDNode *C =
> dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
>> - if (C->isZero()) {
>> - NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
>> - } else if (C->isExactlyValue(1.0)) {
>> - NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
>> - } else {
>> - NewBldVec[i] = Arg.getOperand(i);
>> - }
>> - } else {
>> - NewBldVec[i] = Arg.getOperand(i);
>> - }
>> - }
>> SDLoc DL(N);
>> - NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec,
> 4);
>> + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4],
> DAG);
>> return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs,
> 8);
>> }
>> + case AMDGPUISD::TEXTURE_FETCH: {
>> + SDValue Arg = N->getOperand(1);
>> + if (Arg.getOpcode() != ISD::BUILD_VECTOR)
>> + break;
>> +
>> + SDValue NewArgs[19] = {
>> + N->getOperand(0),
>> + N->getOperand(1),
>> + N->getOperand(2),
>> + N->getOperand(3),
>> + N->getOperand(4),
>> + N->getOperand(5),
>> + N->getOperand(6),
>> + N->getOperand(7),
>> + N->getOperand(8),
>> + N->getOperand(9),
>> + N->getOperand(10),
>> + N->getOperand(11),
>> + N->getOperand(12),
>> + N->getOperand(13),
>> + N->getOperand(14),
>> + N->getOperand(15),
>> + N->getOperand(16),
>> + N->getOperand(17),
>> + N->getOperand(18),
>> + };
>> + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2],
> DAG);
>> + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N),
> N->getVTList(),
>> + NewArgs, 19);
>> + }
>> }
>> return SDValue();
>> }
>> diff --git a/lib/Target/R600/R600ISelLowering.h
> b/lib/Target/R600/R600ISelLowering.h
>> index 663aab4..2904396 100644
>> --- a/lib/Target/R600/R600ISelLowering.h
>> +++ b/lib/Target/R600/R600ISelLowering.h
>> @@ -51,6 +51,7 @@ private:
>>
>> void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
>> MachineRegisterInfo & MRI, unsigned dword_offset) const;
>> + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG
> &DAG) const;
>>
>> /// \brief Lower ROTL opcode to BITALIGN
>> SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
>> diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll
> b/test/CodeGen/R600/llvm.AMDGPU.tex.ll
>> index 4ea82bb..aac014b 100644
>> --- a/test/CodeGen/R600/llvm.AMDGPU.tex.ll
>> +++ b/test/CodeGen/R600/llvm.AMDGPU.tex.ll
>> @@ -5,12 +5,12 @@
>> ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNNN
>> ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNNN
>> ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:UUNN
>> -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNNN
>> -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNNN
>> -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:UUNN
>> +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0
> CT:NNNN
>> +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0
> CT:NNNN
>> +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0
> CT:UUNN
>> +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0
> CT:NNUN
>> ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNUN
>> -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNUN
>> -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNUN
>> +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0
> CT:NNUN
>> ;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNUN
>> ;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNNN
>> ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0
> CT:NNNN
>> --
>> 1.8.2.1
>>
>
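As a rough illustration of the swizzle compaction performed by the patch above (a hypothetical standalone model, not the actual DAG combine; the Channel/ValueId representation is invented for the sketch): channels holding the constants 0.0 or 1.0 are encoded as the SEL_0/SEL_1 selects (4 and 5, as in the patch), and a channel that duplicates an earlier one is redirected to the first occurrence, which is what produces the .XYZZ / .XYYW style operands in the updated test checks.

#include <array>
#include <iostream>
#include <optional>

constexpr int SEL_0 = 4; // hardware select for constant 0.0, as in the patch
constexpr int SEL_1 = 5; // hardware select for constant 1.0, as in the patch

// Each input channel is either a known literal constant or an arbitrary value,
// identified here by a made-up integer id.
struct Channel {
  std::optional<float> Constant;
  int ValueId;
};

// Returns, for each of the four output channels, the swizzle select to use.
std::array<int, 4> compactSwizzle(const std::array<Channel, 4> &In) {
  std::array<int, 4> Swz{};
  for (int i = 0; i < 4; ++i) {
    if (In[i].Constant == 0.0f) { Swz[i] = SEL_0; continue; }
    if (In[i].Constant == 1.0f) { Swz[i] = SEL_1; continue; }
    Swz[i] = i; // default: read this channel directly
    for (int j = 0; j < i; ++j) {
      if (!In[j].Constant && In[j].ValueId == In[i].ValueId) {
        Swz[i] = j; // duplicate of an earlier channel: point at it instead
        break;
      }
    }
  }
  return Swz;
}

int main() {
  // Input (x, y, x, 1.0): reads channel 0 twice and encodes the trailing 1.0 as SEL_1.
  std::array<Channel, 4> In{{{std::nullopt, 10},
                             {std::nullopt, 11},
                             {std::nullopt, 10},
                             {1.0f, 0}}};
  for (int S : compactSwizzle(In))
    std::cout << S << " "; // prints: 0 1 0 5
  std::cout << "\n";
}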
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-R600-Add-a-pass-that-merge-Vector-Register.patch
Type: application/octet-stream
Size: 16933 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130604/2169d433/attachment.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-Const-Neg-Abs-can-be-folded-to-dot4.patch
Type: application/octet-stream
Size: 15367 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130604/2169d433/attachment-0001.obj>