[llvm] r213551 - R600/SI: Use scratch memory for large private arrays
Tom Stellard
thomas.stellard at amd.com
Mon Jul 21 08:45:02 PDT 2014
Author: tstellar
Date: Mon Jul 21 10:45:01 2014
New Revision: 213551
URL: http://llvm.org/viewvc/llvm-project?rev=213551&view=rev
Log:
R600/SI: Use scratch memory for large private arrays
Modified:
llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.cpp
llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.h
llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h
llvm/trunk/lib/Target/R600/AMDGPUInstructions.td
llvm/trunk/lib/Target/R600/AMDGPURegisterInfo.h
llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/R600/SIDefines.h
llvm/trunk/lib/Target/R600/SIISelLowering.cpp
llvm/trunk/lib/Target/R600/SIISelLowering.h
llvm/trunk/lib/Target/R600/SIInstrInfo.cpp
llvm/trunk/lib/Target/R600/SIInstrInfo.h
llvm/trunk/lib/Target/R600/SIInstrInfo.td
llvm/trunk/lib/Target/R600/SIInstructions.td
llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.cpp
llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.h
llvm/trunk/lib/Target/R600/SIRegisterInfo.cpp
llvm/trunk/lib/Target/R600/SIRegisterInfo.h
llvm/trunk/test/CodeGen/R600/array-ptr-calc-i32.ll
llvm/trunk/test/CodeGen/R600/gv-const-addrspace.ll
llvm/trunk/test/CodeGen/R600/indirect-private-64.ll
llvm/trunk/test/CodeGen/R600/private-memory.ll
llvm/trunk/test/CodeGen/R600/work-item-intrinsics.ll
Modified: llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.cpp Mon Jul 21 10:45:01 2014
@@ -25,6 +25,7 @@
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
@@ -141,6 +142,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunct
false);
OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
false);
+ OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+ false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer.emitRawComment(
@@ -332,6 +335,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(
// Do not clamp NAN to 0.
ProgInfo.DX10Clamp = 0;
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+ ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+
ProgInfo.CodeLen = CodeSize;
}
@@ -361,6 +367,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI
unsigned LDSBlocks =
RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ // Scratch is allocated in 256 dword blocks.
+ unsigned ScratchAlignShift = 10;
+ // We need to program the hardware with the amount of scratch memory that
+ // is used by the entire wave. KernelInfo.ScratchSize is the amount of
+ // scratch memory used per thread.
+ unsigned ScratchBlocks =
+ RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
+ 1 << ScratchAlignShift) >> ScratchAlignShift;
+
if (MFI->getShaderType() == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
@@ -377,7 +392,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI
OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+ const uint32_t ComputePGMRSrc2 =
+ S_00B84C_LDS_SIZE(LDSBlocks) |
+ S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
+
+ OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
+
+ OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
+ OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
} else {
OutStreamer.EmitIntValue(RsrcReg, 4);
OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
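As a rough, standalone illustration of the scratch sizing above (the inputs here are hypothetical; the real per-thread size comes from MachineFrameInfo::estimateStackSize() and the wave width from the subtarget), the per-wave block count reduces to:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t ScratchSize       = 2048; // hypothetical: bytes of scratch per thread
  const uint32_t WavefrontSize     = 64;   // threads per wave on SI
  const uint32_t ScratchAlignShift = 10;   // 256 dwords = 1 KiB per block

  uint32_t PerWaveBytes = ScratchSize * WavefrontSize;   // 131072 bytes
  // Same result as RoundUpToAlignment(PerWaveBytes, 1 << ScratchAlignShift)
  // followed by the shift done in EmitProgramInfoSI.
  uint32_t ScratchBlocks =
      (PerWaveBytes + (1u << ScratchAlignShift) - 1) >> ScratchAlignShift;

  printf("scratch blocks per wave = %u\n", ScratchBlocks); // prints 128
  return 0;
}

A non-zero block count is what turns on SCRATCH_EN in COMPUTE_PGM_RSRC2 and is programmed into the WAVESIZE field of COMPUTE_TMPRING_SIZE.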
Modified: llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.h (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUAsmPrinter.h Mon Jul 21 10:45:01 2014
@@ -32,6 +32,7 @@ private:
DX10Clamp(0),
DebugMode(0),
IEEEMode(0),
+ ScratchSize(0),
CodeLen(0) {}
// Fields set in PGM_RSRC1 pm4 packet.
@@ -43,6 +44,7 @@ private:
uint32_t DX10Clamp;
uint32_t DebugMode;
uint32_t IEEEMode;
+ uint32_t ScratchSize;
// Bonus information for debugging.
uint64_t CodeLen;
Modified: llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp Mon Jul 21 10:45:01 2014
@@ -16,9 +16,13 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "SIDefines.h"
#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h"
@@ -85,7 +89,13 @@ private:
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
- SDValue &ImmOffset) const;
+ SDValue &ImmOffset) const;
+ bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &ImmOffset) const;
+ bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const;
SDNode *SelectADD_SUB_I64(SDNode *N);
SDNode *SelectDIV_SCALE(SDNode *N);
@@ -730,6 +740,10 @@ static SDValue wrapAddr64Rsrc(SelectionD
Ptr), 0);
}
+static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
+ return isUInt<12>(Imm->getZExtValue());
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
SDValue &Offset,
SDValue &ImmOffset) const {
@@ -740,7 +754,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr
SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (isUInt<12>(C1->getZExtValue())) {
+ if (isLegalMUBUFImmOffset(C1)) {
if (N0.getOpcode() == ISD::ADD) {
// (add (add N2, N3), C1)
@@ -776,6 +790,95 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr
return true;
}
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled.
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to the
+/// resource pointer.
+static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+
+ uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+ 0xffffffff;
+
+ SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+ SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+ SDValue DataLo = DAG->getTargetConstant(
+ Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
+ SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
+
+ const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
+ return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
+ MVT::v4i32, Ops), 0);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &ImmOffset) const {
+
+ SDLoc DL(Addr);
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+
+ unsigned ScratchPtrReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+ unsigned ScratchOffsetReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
+ Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64));
+ SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+
+ // (add n0, c1)
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+ if (isLegalMUBUFImmOffset(C1)) {
+ VAddr = Addr.getOperand(0);
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return true;
+ }
+ }
+
+ // (add FI, n0)
+ if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+ isa<FrameIndexSDNode>(Addr.getOperand(0))) {
+ VAddr = Addr.getOperand(1);
+ ImmOffset = Addr.getOperand(0);
+ return true;
+ }
+
+ // (FI)
+ if (isa<FrameIndexSDNode>(Addr)) {
+ VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+ CurDAG->getConstant(0, MVT::i32)), 0);
+ ImmOffset = Addr;
+ return true;
+ }
+
+ // (node)
+ VAddr = Addr;
+ ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &GLC,
+ SDValue &SLC, SDValue &TFE) const {
+
+ GLC = CurDAG->getTargetConstant(0, MVT::i1);
+ SLC = CurDAG->getTargetConstant(0, MVT::i1);
+ TFE = CurDAG->getTargetConstant(0, MVT::i1);
+
+ Idxen = CurDAG->getTargetConstant(0, MVT::i1);
+ Offen = CurDAG->getTargetConstant(1, MVT::i1);
+
+ return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
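The constant half of the scratch resource descriptor that buildScratchRSRC() builds (sub2/sub3 of the SI_BUFFER_RSRC result) can be evaluated in isolation; the pointer half (sub0/sub1) comes from the preloaded scratch pointer in SGPR2_SGPR3. A minimal sketch using the two constants from SIInstrInfo.h:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t RSRC_DATA_FORMAT = 0xf00000000000ULL; // pre-existing constant
  const uint64_t RSRC_TID_ENABLE  = 1ULL << 55;        // added by this patch

  uint64_t Rsrc = RSRC_DATA_FORMAT | RSRC_TID_ENABLE | 0xffffffffULL;

  uint32_t DataLo = uint32_t(Rsrc);        // third dword (sub2):  0xffffffff
  uint32_t DataHi = uint32_t(Rsrc >> 32);  // fourth dword (sub3): 0x0080f000

  printf("DataLo=0x%08x DataHi=0x%08x\n", DataLo, DataHi);
  return 0;
}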
Modified: llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h Mon Jul 21 10:45:01 2014
@@ -71,13 +71,6 @@ protected:
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
- /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
- /// MachineFunction.
- ///
- /// \returns a RegisterSDNode representing Reg.
- virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
/// \brief Split a vector load into multiple scalar loads.
@@ -160,6 +153,14 @@ public:
SDValue Op,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+
+ /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+ /// MachineFunction.
+ ///
+ /// \returns a RegisterSDNode representing Reg.
+ virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const;
};
namespace AMDGPUISD {
Modified: llvm/trunk/lib/Target/R600/AMDGPUInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUInstructions.td?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUInstructions.td (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUInstructions.td Mon Jul 21 10:45:01 2014
@@ -41,6 +41,8 @@ def UnsafeFPMath : Predicate<"TM.Options
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+let OperandType = "OPERAND_IMMEDIATE" in {
+
def u32imm : Operand<i32> {
let PrintMethod = "printU32ImmOperand";
}
@@ -53,6 +55,8 @@ def u8imm : Operand<i8> {
let PrintMethod = "printU8ImmOperand";
}
+} // End OperandType = "OPERAND_IMMEDIATE"
+
//===--------------------------------------------------------------------===//
// Custom Operands
//===--------------------------------------------------------------------===//
@@ -136,6 +140,28 @@ def COND_NULL : PatLeaf <
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+}]>;
+
+class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
+ (ops node:$ptr), (op node:$ptr)
+>;
+
+class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+ (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def extloadi8_private : PrivateLoad <extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+def extloadi16_private : PrivateLoad <extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+def load_private : PrivateLoad <load>;
+
+def truncstorei8_private : PrivateStore <truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_private : PrivateStore <store>;
+
def global_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isGlobalStore(dyn_cast<StoreSDNode>(N));
Modified: llvm/trunk/lib/Target/R600/AMDGPURegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPURegisterInfo.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPURegisterInfo.h (original)
+++ llvm/trunk/lib/Target/R600/AMDGPURegisterInfo.h Mon Jul 21 10:45:01 2014
@@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGP
unsigned getSubRegFromChannel(unsigned Channel) const;
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
Modified: llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUTargetMachine.cpp Mon Jul 21 10:45:01 2014
@@ -52,7 +52,7 @@ static std::string computeDataLayout(con
std::string Ret = "e-p:32:32";
if (ST.is64bit()) {
- // 32-bit private, local, and region pointers. 64-bit global and constant.
+ // 32-bit local and region pointers. 64-bit private, global, and constant.
Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
}
Modified: llvm/trunk/lib/Target/R600/SIDefines.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIDefines.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIDefines.h (original)
+++ llvm/trunk/lib/Target/R600/SIDefines.h Mon Jul 21 10:45:01 2014
@@ -32,6 +32,7 @@ enum {
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
+#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
@@ -85,4 +86,7 @@ enum {
#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
+#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
#endif // SIDEFINES_H_
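For a concrete picture of the new register fields, here is a small sketch that packs the values EmitProgramInfoSI writes, using the macros above and a hypothetical count of 128 scratch blocks (128 KiB per wave):

#include <cstdint>
#include <cstdio>

#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
#define S_00B84C_LDS_SIZE(x)   (((x) & 0x1FF) << 15)
#define S_00B860_WAVESIZE(x)   (((x) & 0x1FFF) << 12)

int main() {
  unsigned ScratchBlocks = 128; // hypothetical
  unsigned LDSBlocks     = 0;   // hypothetical

  uint32_t Rsrc2   = S_00B84C_LDS_SIZE(LDSBlocks) |
                     S_00B02C_SCRATCH_EN(ScratchBlocks > 0); // 0x00000001
  uint32_t TmpRing = S_00B860_WAVESIZE(ScratchBlocks);       // 0x00080000

  printf("COMPUTE_PGM_RSRC2    (0xB84C) = 0x%08x\n", Rsrc2);
  printf("COMPUTE_TMPRING_SIZE (0xB860) = 0x%08x\n", TmpRing);
  return 0;
}

The 13-bit WAVESIZE field tops out at 0x1fff blocks, i.e. roughly 8 MiB of scratch per wave with 1 KiB blocks.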
Modified: llvm/trunk/lib/Target/R600/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIISelLowering.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/R600/SIISelLowering.cpp Mon Jul 21 10:45:01 2014
@@ -391,10 +391,15 @@ SDValue SITargetLowering::LowerFormalArg
}
// The pointer to the list of arguments is stored in SGPR0, SGPR1
+ // The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
+ Info->NumUserSGPRs = 4;
CCInfo.AllocateReg(AMDGPU::SGPR0);
CCInfo.AllocateReg(AMDGPU::SGPR1);
+ CCInfo.AllocateReg(AMDGPU::SGPR2);
+ CCInfo.AllocateReg(AMDGPU::SGPR3);
MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -509,6 +514,36 @@ MachineBasicBlock * SITargetLowering::Em
MI->eraseFromParent();
break;
}
+ case AMDGPU::SI_BUFFER_RSRC: {
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ unsigned Args[4];
+ for (unsigned i = 0, e = 4; i < e; ++i) {
+ MachineOperand &Arg = MI->getOperand(i + 1);
+
+ if (Arg.isReg()) {
+ Args[i] = Arg.getReg();
+ continue;
+ }
+
+ assert(Arg.isImm());
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
+ .addImm(Arg.getImm());
+ Args[i] = Reg;
+ }
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
+ SuperReg)
+ .addReg(Args[0])
+ .addImm(AMDGPU::sub0)
+ .addReg(Args[1])
+ .addImm(AMDGPU::sub1)
+ .addReg(Args[2])
+ .addImm(AMDGPU::sub2)
+ .addReg(Args[3])
+ .addImm(AMDGPU::sub3);
+ MI->eraseFromParent();
+ break;
+ }
case AMDGPU::V_SUB_F64: {
unsigned DestReg = MI->getOperand(0).getReg();
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@@ -620,6 +655,7 @@ SDValue SITargetLowering::LowerOperation
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
@@ -658,8 +694,6 @@ SDValue SITargetLowering::LowerOperation
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
- //XXX: Hardcoded we only use two to store the pointer to the parameters.
- unsigned NumUserSGPRs = 2;
switch (IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case Intrinsic::r600_read_ngroups_x:
@@ -682,13 +716,13 @@ SDValue SITargetLowering::LowerOperation
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+ AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+ AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+ AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
AMDGPU::VGPR0, VT);
@@ -782,6 +816,21 @@ static SDNode *findUser(SDValue Value, u
return nullptr;
}
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+ unsigned FrameIndex = FINode->getIndex();
+
+ CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
+
+ return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+}
+
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -891,6 +940,11 @@ SDValue SITargetLowering::LowerGlobalAdd
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
+ // Vector private memory loads have already been split, and
+ // all the rest of private memory loads are legal.
+ if (Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ return SDValue();
+ }
SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
if (Lowered.getNode())
return Lowered;
@@ -1081,6 +1135,12 @@ SDValue SITargetLowering::LowerSTORE(SDV
VT.getVectorElementType() == MVT::i32)
return SDValue();
+ if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (VT.isVector() && VT.getVectorNumElements() > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ }
+
SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
if (Ret.getNode())
return Ret;
@@ -1495,9 +1555,19 @@ void SITargetLowering::ensureSRegLimit(S
// This is a conservative approach. It is possible that we can't determine the
// correct register class and copy too often, but better safe than sorry.
- SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
- SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
- Operand.getValueType(), Operand, RC);
+
+ SDNode *Node;
+ // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
+ if (isa<FrameIndexSDNode>(Operand)) {
+ unsigned Opcode = Operand.getValueType() == MVT::i32 ?
+ AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
+ Operand);
+ } else {
+ SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+ Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+ Operand.getValueType(), Operand, RC);
+ }
Operand = SDValue(Node, 0);
}
@@ -1591,6 +1661,14 @@ SDNode *SITargetLowering::foldOperands(M
ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
}
continue;
+ } else {
+ // If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
+ // These will be lowered to immediates, so we will need to insert a MOV.
+ if (isa<GlobalAddressSDNode>(Ops[i])) {
+ SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
+ Operand.getValueType(), Operand);
+ Ops[i] = SDValue(Node, 0);
+ }
}
if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
Modified: llvm/trunk/lib/Target/R600/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIISelLowering.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/R600/SIISelLowering.h Mon Jul 21 10:45:01 2014
@@ -27,6 +27,7 @@ class SITargetLowering : public AMDGPUTa
SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
+ SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
Modified: llvm/trunk/lib/Target/R600/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstrInfo.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/R600/SIInstrInfo.cpp Mon Jul 21 10:45:01 2014
@@ -561,6 +561,21 @@ static bool compareMachineOp(const Machi
}
}
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+ const MachineOperand &MO) const {
+ const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
+
+ assert(MO.isImm() || MO.isFPImm());
+
+ if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
+ return true;
+
+ if (OpInfo.RegClass < 0)
+ return false;
+
+ return RI.regClassCanUseImmediate(OpInfo.RegClass);
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
@@ -589,7 +604,11 @@ bool SIInstrInfo::verifyInstruction(cons
}
break;
case MCOI::OPERAND_IMMEDIATE:
- if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
+ // Check if this operand is an immediate.
+ // FrameIndex operands will be replaced by immediates, so they are
+ // allowed.
+ if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
+ !MI->getOperand(i).isFI()) {
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
Modified: llvm/trunk/lib/Target/R600/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstrInfo.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/R600/SIInstrInfo.h Mon Jul 21 10:45:01 2014
@@ -106,6 +106,9 @@ public:
bool isInlineConstant(const MachineOperand &MO) const;
bool isLiteralConstant(const MachineOperand &MO) const;
+ bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+ const MachineOperand &MO) const;
+
bool verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const override;
@@ -181,7 +184,7 @@ namespace AMDGPU {
int getMCOpcode(uint16_t Opcode, unsigned Gen);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-
+ const uint64_t RSRC_TID_ENABLE = 1LL << 55;
} // End namespace AMDGPU
Modified: llvm/trunk/lib/Target/R600/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstrInfo.td?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/R600/SIInstrInfo.td Mon Jul 21 10:45:01 2014
@@ -163,7 +163,9 @@ def sopp_brtarget : Operand<OtherVT> {
// Complex patterns
//===----------------------------------------------------------------------===//
+def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
//===----------------------------------------------------------------------===//
// SI assembler operands
@@ -605,12 +607,12 @@ multiclass MUBUF_Load_Helper <bits<7> op
asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
}
- let offen = 1, idxen = 0, offset = 0 in {
+ let offen = 1, idxen = 0 in {
def _OFFEN : MUBUF <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_32:$vaddr,
- SSrc_32:$soffset, i1imm:$glc, i1imm:$slc,
+ SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc,
i1imm:$tfe),
- asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>;
}
let offen = 0, idxen = 1 in {
@@ -640,25 +642,40 @@ multiclass MUBUF_Load_Helper <bits<7> op
}
}
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
- ValueType store_vt, SDPatternOperator st> :
- MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
- u16imm:$offset),
- name#" $vdata, $srsrc + $vaddr + $offset",
- [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
+multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+ ValueType store_vt, SDPatternOperator st> {
- let mayLoad = 0;
- let mayStore = 1;
+ def "" : MUBUF <
+ op, (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
+ u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc,
+ i1imm:$tfe),
+ name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe",
+ []
+ > {
+ let addr64 = 0;
+ }
- // Encoding
- let offen = 0;
- let idxen = 0;
- let glc = 0;
- let addr64 = 1;
- let lds = 0;
- let slc = 0;
- let tfe = 0;
- let soffset = 128; // ZERO
+ def _ADDR64 : MUBUF <
+ op, (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
+ name#" $vdata, $srsrc + $vaddr + $offset",
+ [(st store_vt:$vdata,
+ (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
+
+ let mayLoad = 0;
+ let mayStore = 1;
+
+ // Encoding
+ let offen = 0;
+ let idxen = 0;
+ let glc = 0;
+ let addr64 = 1;
+ let lds = 0;
+ let slc = 0;
+ let tfe = 0;
+ let soffset = 128; // ZERO
+ }
}
class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
Modified: llvm/trunk/lib/Target/R600/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstructions.td?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/R600/SIInstructions.td Mon Jul 21 10:45:01 2014
@@ -872,23 +872,23 @@ defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_He
0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
>;
-def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
+defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
>;
-def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
+defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
>;
-def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
+defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
>;
-def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
+defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
>;
-def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
+defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
@@ -1667,6 +1667,12 @@ def SI_ADDR64_RSRC : InstSI <
"", []
>;
+def SI_BUFFER_RSRC : InstSI <
+ (outs SReg_128:$srsrc),
+ (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi),
+ "", []
+>;
+
def V_SUB_F64 : InstSI <
(outs VReg_64:$dst),
(ins VReg_64:$src0, VReg_64:$src1),
@@ -2410,7 +2416,7 @@ def : Ext32Pat <anyext>;
// Offset in an 32Bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
- (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
+ (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
>;
// The multiplication scales from [0,1] to the unsigned integer range
@@ -2599,22 +2605,30 @@ multiclass MUBUFLoad_Pattern <MUBUF Inst
(vt (constant_ld (add i64:$ptr, i64:$offset))),
(Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
>;
+
}
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
- sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
- az_extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
- sextloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
- az_extloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
- constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
- constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
- constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
+
+class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
+ (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))),
+ (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@@ -2630,9 +2644,9 @@ multiclass MUBUF_Load_Dword <ValueType v
def : Pat <
(vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
- imm, 1, 0, imm:$glc, imm:$slc,
+ imm:$offset, 1, 0, imm:$glc, imm:$slc,
imm:$tfe)),
- (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+ (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
(as_i1imm $tfe))
>;
@@ -2660,6 +2674,34 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_L
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
+ (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+ u16imm:$offset, i1imm:$offen, i1imm:$idxen,
+ i1imm:$glc, i1imm:$slc, i1imm:$tfe)),
+ (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen,
+ $glc, $slc, $tfe)
+>;
+
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>;
+
+/*
+class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
+ (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
+ (Instr $value, $srsrc, $vaddr, $offset)
+>;
+
+def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
+
+*/
+
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
Modified: llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.cpp (original)
+++ llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.cpp Mon Jul 21 10:45:01 2014
@@ -27,7 +27,8 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
PSInputAddr(0),
- SpillTracker() { }
+ SpillTracker(),
+ NumUserSGPRs(0) { }
static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
Modified: llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/R600/SIMachineFunctionInfo.h Mon Jul 21 10:45:01 2014
@@ -59,6 +59,7 @@ public:
SIMachineFunctionInfo(const MachineFunction &MF);
unsigned PSInputAddr;
struct RegSpillTracker SpillTracker;
+ unsigned NumUserSGPRs;
};
} // End namespace llvm
Modified: llvm/trunk/lib/Target/R600/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIRegisterInfo.cpp?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/R600/SIRegisterInfo.cpp Mon Jul 21 10:45:01 2014
@@ -16,6 +16,10 @@
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
@@ -27,8 +31,6 @@ BitVector SIRegisterInfo::getReservedReg
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
- const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
- TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@@ -37,6 +39,30 @@ unsigned SIRegisterInfo::getRegPressureL
return RC->getNumRegs();
}
+bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+ return Fn.getFrameInfo()->hasStackObjects();
+}
+
+void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+ MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+ int Index = MI->getOperand(FIOperandNum).getIndex();
+ int64_t Offset = FrameInfo->getObjectOffset(Index);
+
+ FIOp.ChangeToImmediate(Offset);
+ if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
+ unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addImm(Offset);
+ FIOp.ChangeToRegister(TmpReg, false);
+ }
+}
+
const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
@@ -141,3 +167,21 @@ bool SIRegisterInfo::regClassCanUseImmed
const TargetRegisterClass *RC) const {
return regClassCanUseImmediate(RC->getID());
}
+
+unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
+ enum PreloadedValue Value) const {
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ switch (Value) {
+ case SIRegisterInfo::TGID_X:
+ return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
+ case SIRegisterInfo::TGID_Y:
+ return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
+ case SIRegisterInfo::TGID_Z:
+ return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
+ case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+ return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
+ case SIRegisterInfo::SCRATCH_PTR:
+ return AMDGPU::SGPR2_SGPR3;
+ }
+}
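To make the SGPR renumbering in the test updates below easier to follow, here is a standalone sketch of the mapping getPreloadedValue() implements, evaluated for the compute-shader case where NumUserSGPRs is now 4 (kernarg pointer in SGPR0_SGPR1, scratch pointer in SGPR2_SGPR3). It assumes SReg_32 register N corresponds to sN, which is what the updated work-item-intrinsics.ll checks rely on:

#include <cstdio>

enum PreloadedValue { TGID_X, TGID_Y, TGID_Z, SCRATCH_WAVE_OFFSET };

// Mirrors SIRegisterInfo::getPreloadedValue() for the SReg_32-indexed values.
static unsigned sgprFor(PreloadedValue V, unsigned NumUserSGPRs) {
  switch (V) {
  case TGID_X:              return NumUserSGPRs + 0;
  case TGID_Y:              return NumUserSGPRs + 1;
  case TGID_Z:              return NumUserSGPRs + 2;
  case SCRATCH_WAVE_OFFSET: return NumUserSGPRs + 4;
  }
  return ~0u;
}

int main() {
  const unsigned NumUserSGPRs = 4; // compute shaders after this patch
  printf("tgid.x              -> s%u\n", sgprFor(TGID_X, NumUserSGPRs)); // s4
  printf("tgid.y              -> s%u\n", sgprFor(TGID_Y, NumUserSGPRs)); // s5
  printf("tgid.z              -> s%u\n", sgprFor(TGID_Z, NumUserSGPRs)); // s6
  printf("scratch wave offset -> s%u\n",
         sgprFor(SCRATCH_WAVE_OFFSET, NumUserSGPRs));                    // s8
  return 0;
}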
Modified: llvm/trunk/lib/Target/R600/SIRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIRegisterInfo.h?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIRegisterInfo.h (original)
+++ llvm/trunk/lib/Target/R600/SIRegisterInfo.h Mon Jul 21 10:45:01 2014
@@ -29,6 +29,12 @@ struct SIRegisterInfo : public AMDGPUReg
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
+ bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS) const override;
+
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
@@ -68,6 +74,19 @@ struct SIRegisterInfo : public AMDGPUReg
/// \returns True if operands defined with this register class can accept
/// inline immediates.
bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
+
+ enum PreloadedValue {
+ TGID_X,
+ TGID_Y,
+ TGID_Z,
+ SCRATCH_WAVE_OFFSET,
+ SCRATCH_PTR
+ };
+
+ /// \brief Returns the physical register that \p Value is stored in.
+ unsigned getPreloadedValue(const MachineFunction &MF,
+ enum PreloadedValue Value) const;
+
};
} // End namespace llvm
Modified: llvm/trunk/test/CodeGen/R600/array-ptr-calc-i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/array-ptr-calc-i32.ll?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/array-ptr-calc-i32.ll (original)
+++ llvm/trunk/test/CodeGen/R600/array-ptr-calc-i32.ll Mon Jul 21 10:45:01 2014
@@ -11,15 +11,18 @@ declare void @llvm.AMDGPU.barrier.local(
; SI-LABEL: @test_private_array_ptr_calc:
-; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
-
-; SI-ALLOCA: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
+; FIXME: We end up with zero argument for ADD, because
+; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
+; with the appropriate offset. We should fold this into the store.
+; SI-ALLOCA: V_ADD_I32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
+; SI-ALLOCA: BUFFER_STORE_DWORD {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], [[PTRREG]]
;
; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
; alloca to a vector. It currently fails because it does not know how
; to interpret:
; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
+; SI-PROMOTE: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
; SI-PROMOTE: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%alloca = alloca [4 x i32], i32 4, align 16
Modified: llvm/trunk/test/CodeGen/R600/gv-const-addrspace.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/gv-const-addrspace.ll?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/gv-const-addrspace.ll (original)
+++ llvm/trunk/test/CodeGen/R600/gv-const-addrspace.ll Mon Jul 21 10:45:01 2014
@@ -76,3 +76,22 @@ define void @array_v1_gv_load(<1 x i32>
store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
ret void
}
+
+define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+entry:
+ %0 = icmp eq i32 0, %a
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+ %2 = load float addrspace(2)* %1
+ store float %2, float addrspace(1)* %out
+ br label %endif
+
+else:
+ store float 1.0, float addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/R600/indirect-private-64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/indirect-private-64.ll?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/indirect-private-64.ll (original)
+++ llvm/trunk/test/CodeGen/R600/indirect-private-64.ll Mon Jul 21 10:45:01 2014
@@ -6,10 +6,10 @@ declare void @llvm.AMDGPU.barrier.local(
; SI-LABEL: @private_access_f64_alloca:
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
+; SI-ALLOCA: BUFFER_STORE_DWORDX2
+; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
@@ -26,10 +26,12 @@ define void @private_access_f64_alloca(d
; SI-LABEL: @private_access_v2f64_alloca:
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
+; SI-ALLOCA: BUFFER_STORE_DWORDX4
+; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
@@ -52,10 +54,10 @@ define void @private_access_v2f64_alloca
; SI-LABEL: @private_access_i64_alloca:
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
+; SI-ALLOCA: BUFFER_STORE_DWORDX2
+; FIXME: We should be able to use BUFFER_LOAD_DWORDX2
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
@@ -72,14 +74,12 @@ define void @private_access_i64_alloca(i
; SI-LABEL: @private_access_v2i64_alloca:
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELD_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
-; SI-ALLOCA: V_MOVRELS_B32_e32
+; SI-ALLOCA: BUFFER_STORE_DWORDX4
+; FIXME: We should be able to use BUFFER_LOAD_DWORDX4
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
+; SI-ALLOCA: BUFFER_LOAD_DWORD
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
Modified: llvm/trunk/test/CodeGen/R600/private-memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/private-memory.ll?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/private-memory.ll (original)
+++ llvm/trunk/test/CodeGen/R600/private-memory.ll Mon Jul 21 10:45:01 2014
@@ -16,12 +16,8 @@ declare i32 @llvm.r600.read.tidig.x() no
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
-; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
-; SI-ALLOCA: V_MOVRELD
-; SI-ALLOCA: S_CBRANCH
-; SI-ALLOCA: V_READFIRSTLANE_B32 vcc_lo
-; SI-ALLOCA: V_MOVRELD
-; SI-ALLOCA: S_CBRANCH
+; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
@@ -120,7 +116,9 @@ for.end:
; R600: MOVA_INT
-; SI-PROMOTE: V_MOVRELS_B32_e32
+; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}
+; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v{{[0-9]+}}, s{{[0-9]+}}
define void @short_array(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = alloca [2 x i16]
@@ -139,8 +137,8 @@ entry:
; R600: MOVA_INT
-; SI: V_OR_B32_e32 v{{[0-9]}}, 0x100
-; SI: V_MOVRELS_B32_e32
+; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x0
+; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{[0-9]+}}, 0x1
define void @char_array(i32 addrspace(1)* %out, i32 %index) {
entry:
%0 = alloca [2 x i8]
Modified: llvm/trunk/test/CodeGen/R600/work-item-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/work-item-intrinsics.ll?rev=213551&r1=213550&r2=213551&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/work-item-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/R600/work-item-intrinsics.ll Mon Jul 21 10:45:01 2014
@@ -127,12 +127,12 @@ entry:
ret void
}
-; The tgid values are stored in ss offset by the number of user ss.
-; Currently we always use exactly 2 user ss for the pointer to the
+; The tgid values are stored in sgprs offset by the number of user sgprs.
+; Currently we always use exactly 2 user sgprs for the pointer to the
; kernel arguments, but this may change in the future.
; SI-CHECK: @tgid_x
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_x (i32 addrspace(1)* %out) {
entry:
@@ -142,7 +142,7 @@ entry:
}
; SI-CHECK: @tgid_y
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s5
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_y (i32 addrspace(1)* %out) {
entry:
@@ -152,7 +152,7 @@ entry:
}
; SI-CHECK: @tgid_z
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s6
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_z (i32 addrspace(1)* %out) {
entry: