[PATCH] R600/SI: Add preliminary support for flat address space
Matt Arsenault
Matthew.Arsenault at amd.com
Wed Feb 5 15:02:09 PST 2014
On 02/05/2014 02:48 PM, Alex Deucher wrote:
> On Wed, Feb 5, 2014 at 5:41 PM, Tom Stellard <tom at stellard.net> wrote:
>> Hi Matt,
>>
>> I can look into what it would take to get FLAT address space working on
>> real hardware. I really don't know much about it, so I'm not sure
>> how hard it will be to implement.
> It's been a while since I looked at flat ops, but I'm pretty sure it
> would require changes to the kernel driver to set up the GPU memory
> controller correctly to support them.
I don't know anything about the runtime components of supporting this,
but it will eventually be useful for OpenCL 2.0 support. Right now I'm
mostly interested in having an in-tree target that I can use to play with
addrspacecast and related features, so I don't think it's very important
to get the other parts working right away.
> Additionally, I think they are only supported on CI parts.
Correct
>
> Alex
>
>> -Tom
>> On Wed, Feb 05, 2014 at 10:55:05AM -0800, Matt Arsenault wrote:
>>> This currently still breaks some tests, and I don't know if it will actually run, but it should at least work as an example for lowering addrspacecast.
>>>
>>> http://llvm-reviews.chandlerc.com/D2707
>>>
>>> Files:
>>> lib/Target/R600/AMDGPU.h
>>> lib/Target/R600/AMDGPU.td
>>> lib/Target/R600/AMDGPUAsmPrinter.cpp
>>> lib/Target/R600/AMDGPUAsmPrinter.h
>>> lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>> lib/Target/R600/AMDGPUInstrInfo.h
>>> lib/Target/R600/AMDGPUInstructions.td
>>> lib/Target/R600/AMDGPUMachineFunction.cpp
>>> lib/Target/R600/AMDGPUMachineFunction.h
>>> lib/Target/R600/AMDGPUSubtarget.cpp
>>> lib/Target/R600/AMDGPUSubtarget.h
>>> lib/Target/R600/AMDGPUTargetMachine.cpp
>>> lib/Target/R600/AMDILInstrInfo.td
>>> lib/Target/R600/SIInstrFormats.td
>>> lib/Target/R600/SIInstrInfo.cpp
>>> lib/Target/R600/SIInstrInfo.td
>>> lib/Target/R600/SIInstructions.td
>>> lib/Target/R600/SILowerControlFlow.cpp
>>> lib/Target/R600/SIRegisterInfo.td
>>> test/CodeGen/R600/flat-address-space.ll
>>> Index: lib/Target/R600/AMDGPU.h
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPU.h
>>> +++ lib/Target/R600/AMDGPU.h
>>> @@ -76,33 +76,34 @@
>>> GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
>>> CONSTANT_ADDRESS = 2, ///< Address space for constant memory
>>> LOCAL_ADDRESS = 3, ///< Address space for local memory.
>>> - REGION_ADDRESS = 4, ///< Address space for region memory.
>>> - ADDRESS_NONE = 5, ///< Address space for unknown memory.
>>> - PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
>>> - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
>>> + FLAT_ADDRESS = 4, ///< Address space for flat accesses to local, private or global.
>>> + REGION_ADDRESS = 5, ///< Address space for region memory.
>>> + ADDRESS_NONE = 6, ///< Address space for unknown memory.
>>> + PARAM_D_ADDRESS = 7, ///< Address space for direct addressible parameter memory (CONST0)
>>> + PARAM_I_ADDRESS = 8, ///< Address space for indirect addressible parameter memory (VTX1)
>>>
>>> // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
>>> // order to be able to dynamically index a constant buffer, for example:
>>> //
>>> // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
>>>
>>> - CONSTANT_BUFFER_0 = 8,
>>> - CONSTANT_BUFFER_1 = 9,
>>> - CONSTANT_BUFFER_2 = 10,
>>> - CONSTANT_BUFFER_3 = 11,
>>> - CONSTANT_BUFFER_4 = 12,
>>> - CONSTANT_BUFFER_5 = 13,
>>> - CONSTANT_BUFFER_6 = 14,
>>> - CONSTANT_BUFFER_7 = 15,
>>> - CONSTANT_BUFFER_8 = 16,
>>> - CONSTANT_BUFFER_9 = 17,
>>> - CONSTANT_BUFFER_10 = 18,
>>> - CONSTANT_BUFFER_11 = 19,
>>> - CONSTANT_BUFFER_12 = 20,
>>> - CONSTANT_BUFFER_13 = 21,
>>> - CONSTANT_BUFFER_14 = 22,
>>> - CONSTANT_BUFFER_15 = 23,
>>> - LAST_ADDRESS = 24
>>> + CONSTANT_BUFFER_0 = 9,
>>> + CONSTANT_BUFFER_1 = 10,
>>> + CONSTANT_BUFFER_2 = 11,
>>> + CONSTANT_BUFFER_3 = 12,
>>> + CONSTANT_BUFFER_4 = 13,
>>> + CONSTANT_BUFFER_5 = 14,
>>> + CONSTANT_BUFFER_6 = 15,
>>> + CONSTANT_BUFFER_7 = 16,
>>> + CONSTANT_BUFFER_8 = 17,
>>> + CONSTANT_BUFFER_9 = 18,
>>> + CONSTANT_BUFFER_10 = 19,
>>> + CONSTANT_BUFFER_11 = 20,
>>> + CONSTANT_BUFFER_12 = 21,
>>> + CONSTANT_BUFFER_13 = 22,
>>> + CONSTANT_BUFFER_14 = 23,
>>> + CONSTANT_BUFFER_15 = 24,
>>> + LAST_ADDRESS = 25
>>> };
>>>
>>> } // namespace AMDGPUAS
>>> Index: lib/Target/R600/AMDGPU.td
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPU.td
>>> +++ lib/Target/R600/AMDGPU.td
>>> @@ -68,6 +68,11 @@
>>> "true",
>>> "GPU has CF_ALU bug">;
>>>
>>> +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
>>> + "FlatAddressSpace",
>>> + "true",
>>> + "Support flat address space">;
>>> +
>>> class SubtargetFeatureFetchLimit <string Value> :
>>> SubtargetFeature <"fetch"#Value,
>>> "TexVTXClauseSize",
>>> @@ -108,7 +113,7 @@
>>> [Feature64BitPtr, FeatureFP64]>;
>>>
>>> def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
>>> - [Feature64BitPtr, FeatureFP64]>;
>>> + [Feature64BitPtr, FeatureFP64, FeatureFlatAddressSpace]>;
>>> //===----------------------------------------------------------------------===//
>>>
>>> def AMDGPUInstrInfo : InstrInfo {
>>> Index: lib/Target/R600/AMDGPUAsmPrinter.cpp
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUAsmPrinter.cpp
>>> +++ lib/Target/R600/AMDGPUAsmPrinter.cpp
>>> @@ -66,7 +66,7 @@
>>> const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
>>> SIProgramInfo KernelInfo;
>>> if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
>>> - findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR);
>>> + findUsedRegistersSI(MF, KernelInfo);
>>> EmitProgramInfoSI(MF, KernelInfo);
>>> } else {
>>> EmitProgramInfoR600(MF);
>>> @@ -186,14 +186,14 @@
>>> }
>>> }
>>>
>>> -void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
>>> - unsigned &NumSGPR,
>>> - unsigned &NumVGPR) const {
>>> +void AMDGPUAsmPrinter::findUsedRegistersSI(MachineFunction &MF,
>>> + SIProgramInfo &Out) const {
>>> unsigned MaxSGPR = 0;
>>> unsigned MaxVGPR = 0;
>>> bool VCCUsed = false;
>>> - const SIRegisterInfo * RI =
>>> - static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
>>> + bool FlatUsed = false;
>>> + const SIRegisterInfo *RI
>>> + = static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
>>>
>>> for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
>>> BB != BB_E; ++BB) {
>>> @@ -215,6 +215,10 @@
>>> if (reg == AMDGPU::VCC) {
>>> VCCUsed = true;
>>> continue;
>>> + } else if (reg == AMDGPU::FLAT_SCRATCH_SIZE ||
>>> + reg == AMDGPU::FLAT_SCRATCH_OFFSET) {
>>> + FlatUsed = true;
>>> + continue;
>>> }
>>>
>>> switch (reg) {
>>> @@ -275,13 +279,18 @@
>>> if (VCCUsed)
>>> MaxSGPR += 2;
>>>
>>> - NumSGPR = MaxSGPR;
>>> - NumVGPR = MaxVGPR;
>>> + if (FlatUsed)
>>> + MaxSGPR += 2;
>>> +
>>> + Out.NumSGPR = MaxSGPR;
>>> + Out.NumVGPR = MaxVGPR;
>>> + Out.VCCUsed = VCCUsed;
>>> + Out.FlatUsed = FlatUsed;
>>> }
>>>
>>> void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out,
>>> MachineFunction &MF) const {
>>> - findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR);
>>> + findUsedRegistersSI(MF, Out);
>>> }
>>>
>>> void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
>>> @@ -316,6 +325,7 @@
>>> if (MFI->ShaderType == ShaderType::COMPUTE) {
>>> OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
>>> OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
>>> + // TODO: Should probably note flat usage somewhere
>>> }
>>> if (MFI->ShaderType == ShaderType::PIXEL) {
>>> OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
>>> Index: lib/Target/R600/AMDGPUAsmPrinter.h
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUAsmPrinter.h
>>> +++ lib/Target/R600/AMDGPUAsmPrinter.h
>>> @@ -24,15 +24,19 @@
>>> class AMDGPUAsmPrinter : public AsmPrinter {
>>> private:
>>> struct SIProgramInfo {
>>> - SIProgramInfo() : NumSGPR(0), NumVGPR(0) {}
>>> + SIProgramInfo() : NumSGPR(0),
>>> + NumVGPR(0),
>>> + VCCUsed(false),
>>> + FlatUsed(false) {}
>>> unsigned NumSGPR;
>>> unsigned NumVGPR;
>>> + bool VCCUsed;
>>> + bool FlatUsed;
>>> };
>>>
>>> void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const;
>>> - void findNumUsedRegistersSI(MachineFunction &MF,
>>> - unsigned &NumSGPR,
>>> - unsigned &NumVGPR) const;
>>> + void findUsedRegistersSI(MachineFunction &MF,
>>> + SIProgramInfo &Out) const;
>>>
>>> /// \brief Emit register usage information so that the GPU driver
>>> /// can correctly setup the GPU state.
>>> Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>> +++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>> @@ -61,10 +61,12 @@
>>> SDValue SimplifyI24(SDValue &Op);
>>> bool SelectI24(SDValue Addr, SDValue &Op);
>>> bool SelectU24(SDValue Addr, SDValue &Op);
>>> + SDNode *SelectAddrSpaceCast(SDNode *N);
>>>
>>> static bool checkType(const Value *ptr, unsigned int addrspace);
>>>
>>> static bool isGlobalStore(const StoreSDNode *N);
>>> + static bool isFlatStore(const StoreSDNode *N);
>>> static bool isPrivateStore(const StoreSDNode *N);
>>> static bool isLocalStore(const StoreSDNode *N);
>>> static bool isRegionStore(const StoreSDNode *N);
>>> @@ -72,6 +74,7 @@
>>> bool isCPLoad(const LoadSDNode *N) const;
>>> bool isConstantLoad(const LoadSDNode *N, int cbID) const;
>>> bool isGlobalLoad(const LoadSDNode *N) const;
>>> + bool isFlatLoad(const LoadSDNode *N) const;
>>> bool isParamLoad(const LoadSDNode *N) const;
>>> bool isPrivateLoad(const LoadSDNode *N) const;
>>> bool isLocalLoad(const LoadSDNode *N) const;
>>> @@ -343,6 +346,9 @@
>>> CurDAG->getVTList(MVT::Other),
>>> Ops);
>>> }
>>> +
>>> + case ISD::ADDRSPACECAST:
>>> + return SelectAddrSpaceCast(N);
>>> }
>>> return SelectCode(N);
>>> }
>>> @@ -370,6 +376,10 @@
>>> return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
>>> }
>>>
>>> +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
>>> + return checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS);
>>> +}
>>> +
>>> bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
>>> return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
>>> }
>>> @@ -400,6 +410,10 @@
>>> return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
>>> }
>>>
>>> +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
>>> + return checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS);
>>> +}
>>> +
>>> bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
>>> return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
>>> }
>>> @@ -428,6 +442,7 @@
>>> }
>>> if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
>>> && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
>>> + && !checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS)
>>> && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
>>> && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
>>> && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
>>> @@ -558,6 +573,58 @@
>>> return false;
>>> }
>>>
>>> +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
>>> + AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
>>> + SDLoc DL(N);
>>> +
>>> + assert(Subtarget.hasFlatAddressSpace() &&
>>> + "addrspacecast only supported with flat address space!");
>>> +
>>> + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
>>> + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
>>> + "Cannot cast address space to / from constant address!");
>>> +
>>> + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
>>> + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
>>> + "Can only cast to / from flat address space!");
>>> +
>>> + // The flat instructions read the address as the index of the VGPR holding the
>>> + // address, so casting should just be reinterpreting the base VGPR, so just
>>> + // insert trunc / bitcast / zext.
>>> +
>>> + SDValue Src = ASC->getOperand(0);
>>> + EVT DestVT = ASC->getValueType(0);
>>> + EVT SrcVT = Src.getValueType();
>>> +
>>> + unsigned SrcSize = SrcVT.getSizeInBits();
>>> + unsigned DestSize = DestVT.getSizeInBits();
>>> +
>>> + if (SrcSize > DestSize) {
>>> + assert(SrcSize == 64 && DestSize == 32);
>>> + return CurDAG->getMachineNode(
>>> + TargetOpcode::EXTRACT_SUBREG,
>>> + DL,
>>> + DestVT,
>>> + Src,
>>> + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32));
>>> + }
>>> +
>>> +
>>> + if (DestSize > SrcSize) {
>>> + assert(SrcSize == 32 && DestSize == 64);
>>> + return CurDAG->getMachineNode(
>>> + TargetOpcode::SUBREG_TO_REG,
>>> + DL,
>>> + DestVT,
>>> + CurDAG->getTargetConstant(0, MVT::i32),
>>> + Src,
>>> + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32));
>>> + }
>>> +
>>> + assert(SrcSize == 64 && DestSize == 64);
>>> + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
>>> +}
>>> +
>>> void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
>>> const AMDGPUTargetLowering& Lowering =
>>> (*(const AMDGPUTargetLowering*)getTargetLowering());
>>> Index: lib/Target/R600/AMDGPUInstrInfo.h
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUInstrInfo.h
>>> +++ lib/Target/R600/AMDGPUInstrInfo.h
>>> @@ -100,6 +100,7 @@
>>> MachineInstr *MI,
>>> const SmallVectorImpl<unsigned> &Ops,
>>> MachineInstr *LoadMI) const;
>>> +public:
>>> /// \returns the smallest register index that will be accessed by an indirect
>>> /// read or write or -1 if indirect addressing is not used by this program.
>>> virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
>>> @@ -108,7 +109,6 @@
>>> /// read or write or -1 if indirect addressing is not used by this program.
>>> virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
>>>
>>> -public:
>>> bool canFoldMemoryOperand(const MachineInstr *MI,
>>> const SmallVectorImpl<unsigned> &Ops) const;
>>> bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
>>> Index: lib/Target/R600/AMDGPUInstructions.td
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUInstructions.td
>>> +++ lib/Target/R600/AMDGPUInstructions.td
>>> @@ -133,6 +133,14 @@
>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>> }]>;
>>>
>>> +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>> +}]>;
>>> +
>>> +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>> +}]>;
>>> +
>>> def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>> }]>;
>>> @@ -161,6 +169,14 @@
>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>> }]>;
>>>
>>> +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>> +}]>;
>>> +
>>> +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>> +}]>;
>>> +
>>> def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>> }]>;
>>> @@ -186,6 +202,11 @@
>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>> }]>;
>>>
>>> +def az_extloadi32_flat : PatFrag<(ops node:$ptr),
>>> + (az_extloadi32 node:$ptr), [{
>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>> +}]>;
>>> +
>>> def az_extloadi32_constant : PatFrag<(ops node:$ptr),
>>> (az_extloadi32 node:$ptr), [{
>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>> @@ -201,6 +222,16 @@
>>> return isGlobalStore(dyn_cast<StoreSDNode>(N));
>>> }]>;
>>>
>>> +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
>>> + (truncstorei8 node:$val, node:$ptr), [{
>>> + return isFlatStore(dyn_cast<StoreSDNode>(N));
>>> +}]>;
>>> +
>>> +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
>>> + (truncstorei16 node:$val, node:$ptr), [{
>>> + return isFlatStore(dyn_cast<StoreSDNode>(N));
>>> +}]>;
>>> +
>>> def local_store : PatFrag<(ops node:$val, node:$ptr),
>>> (store node:$val, node:$ptr), [{
>>> return isLocalStore(dyn_cast<StoreSDNode>(N));
>>> @@ -235,6 +266,11 @@
>>> return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
>>> }]>;
>>>
>>> +def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
>>> + (AMDGPUstore_mskor node:$val, node:$ptr), [{
>>> + return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
>>> +}]>;
>>> +
>>> class Constants {
>>> int TWO_PI = 0x40c90fdb;
>>> int PI = 0x40490fdb;
>>> Index: lib/Target/R600/AMDGPUMachineFunction.cpp
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUMachineFunction.cpp
>>> +++ lib/Target/R600/AMDGPUMachineFunction.cpp
>>> @@ -10,9 +10,11 @@
>>> void AMDGPUMachineFunction::anchor() {}
>>>
>>> AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
>>> - MachineFunctionInfo() {
>>> - ShaderType = ShaderType::COMPUTE;
>>> - LDSSize = 0;
>>> + MachineFunctionInfo(),
>>> + ShaderType(ShaderType::COMPUTE),
>>> + LDSSize(0),
>>> + ScratchSize(0),
>>> + IsKernel(true) {
>>> AttributeSet Set = MF.getFunction()->getAttributes();
>>> Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
>>> ShaderTypeAttribute);
>>> Index: lib/Target/R600/AMDGPUMachineFunction.h
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUMachineFunction.h
>>> +++ lib/Target/R600/AMDGPUMachineFunction.h
>>> @@ -28,6 +28,8 @@
>>> std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
>>> /// Number of bytes in the LDS that are being used.
>>> unsigned LDSSize;
>>> + unsigned ScratchSize;
>>> + bool IsKernel;
>>> };
>>>
>>> }
>>> Index: lib/Target/R600/AMDGPUSubtarget.cpp
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUSubtarget.cpp
>>> +++ lib/Target/R600/AMDGPUSubtarget.cpp
>>> @@ -36,6 +36,7 @@
>>> Gen = AMDGPUSubtarget::R600;
>>> FP64 = false;
>>> CaymanISA = false;
>>> + FlatAddressSpace = false;
>>> EnableIRStructurizer = true;
>>> EnableIfCvt = true;
>>> WavefrontSize = 0;
>>> Index: lib/Target/R600/AMDGPUSubtarget.h
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUSubtarget.h
>>> +++ lib/Target/R600/AMDGPUSubtarget.h
>>> @@ -49,6 +49,7 @@
>>> enum Generation Gen;
>>> bool FP64;
>>> bool CaymanISA;
>>> + bool FlatAddressSpace;
>>> bool EnableIRStructurizer;
>>> bool EnableIfCvt;
>>> unsigned WavefrontSize;
>>> @@ -68,6 +69,9 @@
>>> enum Generation getGeneration() const;
>>> bool hasHWFP64() const;
>>> bool hasCaymanISA() const;
>>> + bool hasFlatAddressSpace() const {
>>> + return FlatAddressSpace;
>>> + }
>>> bool IsIRStructurizerEnabled() const;
>>> bool isIfCvtEnabled() const;
>>> unsigned getWavefrontSize() const;
>>> Index: lib/Target/R600/AMDGPUTargetMachine.cpp
>>> ===================================================================
>>> --- lib/Target/R600/AMDGPUTargetMachine.cpp
>>> +++ lib/Target/R600/AMDGPUTargetMachine.cpp
>>> @@ -53,8 +53,9 @@
>>> std::string Ret = "e-p:32:32";
>>>
>>> if (ST.is64bit()) {
>>> - // 32-bit private, local, and region pointers. 64-bit global and constant.
>>> - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64";
>>> + // 32-bit private, local, and region pointers. 64-bit global, flat and
>>> + // constant.
>>> + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p6:64:64";
>>> }
>>>
>>> Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
>>> Index: lib/Target/R600/AMDILInstrInfo.td
>>> ===================================================================
>>> --- lib/Target/R600/AMDILInstrInfo.td
>>> +++ lib/Target/R600/AMDILInstrInfo.td
>>> @@ -74,6 +74,12 @@
>>> return isGlobalStore(dyn_cast<StoreSDNode>(N));
>>> }]>;
>>>
>>> +def flat_store : PatFrag<(ops node:$val, node:$ptr),
>>> + (store node:$val, node:$ptr), [{
>>> + return isFlatStore(dyn_cast<StoreSDNode>(N));
>>> +}]>;
>>> +
>>> +
>>> //===----------------------------------------------------------------------===//
>>> // Load pattern fragments
>>> //===----------------------------------------------------------------------===//
>>> @@ -81,6 +87,10 @@
>>> def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>> }]>;
>>> +// Flat address space loads
>>> +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>> +}]>;
>>> // Constant address space loads
>>> def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>> Index: lib/Target/R600/SIInstrFormats.td
>>> ===================================================================
>>> --- lib/Target/R600/SIInstrFormats.td
>>> +++ lib/Target/R600/SIInstrFormats.td
>>> @@ -446,6 +446,36 @@
>>> let MIMG = 1;
>>> }
>>>
>>> +class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
>>> + Enc64<outs, ins, asm, pattern> {
>>> + bits<8> addr;
>>> + bits<8> data;
>>> + bits<8> vdst;
>>> + bits<1> slc;
>>> + bits<1> glc;
>>> + bits<1> tfe;
>>> +
>>> + // 15-0 is reserved.
>>> + let Inst{16} = glc;
>>> + let Inst{17} = slc;
>>> + let Inst{24-18} = op;
>>> + let Inst{31-26} = 0x37; // Encoding.
>>> + let Inst{39-32} = addr;
>>> + let Inst{47-40} = data;
>>> + // 54-48 is reserved.
>>> + let Inst{55} = tfe;
>>> + let Inst{63-56} = vdst;
>>> +
>>> + // Internally, FLAT instructions are executed as both an LDS and a
>>> + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
>>> + // and are not considered done until both have been decremented.
>>> + let VM_CNT = 1;
>>> + let EXP_CNT = 1; // XXX - Need this?
>>> + let LGKM_CNT = 1;
>>> +
>>> + let neverHasSideEffects = 1;
>>> +}
>>> +
>>> def EXP : Enc64<
>>> (outs),
>>> (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
>>> Index: lib/Target/R600/SIInstrInfo.cpp
>>> ===================================================================
>>> --- lib/Target/R600/SIInstrInfo.cpp
>>> +++ lib/Target/R600/SIInstrInfo.cpp
>>> @@ -383,6 +383,11 @@
>>> if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
>>> ++ConstantBusCount;
>>>
>>> + // XXX - I'm sort of guessing about this.
>>> + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCRATCH_SIZE ||
>>> + MO.getReg() == AMDGPU::FLAT_SCRATCH_OFFSET))
>>> + ++ConstantBusCount;
>>> +
>>> // SGPRs use the constant bus
>>> if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
>>> (!MO.isImplicit() &&
>>> Index: lib/Target/R600/SIInstrInfo.td
>>> ===================================================================
>>> --- lib/Target/R600/SIInstrInfo.td
>>> +++ lib/Target/R600/SIInstrInfo.td
>>> @@ -132,6 +132,8 @@
>>> def SIOperand {
>>> int ZERO = 0x80;
>>> int VCC = 0x6A;
>>> + int FLAT_SCRATCH_OFFSET = 0x68;
>>> + int FLAT_SCRATCH_SIZE = 0x69;
>>> }
>>>
>>> include "SIInstrFormats.td"
>>> @@ -490,6 +492,31 @@
>>> let soffset = 128; // ZERO
>>> }
>>>
>>> +class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
>>> + FLAT <op, (outs regClass:$data),
>>> + (ins VReg_64:$addr),
>>> + asm#" $data, $addr, [M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE]", []> {
>>> + let glc = 0;
>>> + let slc = 0;
>>> + let tfe = 0;
>>> + let mayLoad = 1;
>>> + let Uses = [EXEC, M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE];
>>> +}
>>> +
>>> +class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
>>> + FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr),
>>> + name#" $data, $addr, [M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE]",
>>> + []> {
>>> +
>>> + let mayLoad = 0;
>>> + let mayStore = 1;
>>> +
>>> + // Encoding
>>> + let glc = 0;
>>> + let slc = 0;
>>> + let tfe = 0;
>>> +}
>>> +
>>> class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
>>> op,
>>> (outs regClass:$dst),
>>> Index: lib/Target/R600/SIInstructions.td
>>> ===================================================================
>>> --- lib/Target/R600/SIInstructions.td
>>> +++ lib/Target/R600/SIInstructions.td
>>> @@ -29,6 +29,11 @@
>>> def isSI : Predicate<"Subtarget.getGeneration() "
>>> ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
>>>
>>> +def isCI : Predicate<"Subtarget.getGeneration() "
>>> + ">= AMDGPUSubtarget::SEA_ISLANDS">;
>>> +def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
>>> +
>>> +
>>> def WAIT_FLAG : InstFlag<"printWaitFlag">;
>>>
>>> let Predicates = [isSI] in {
>>> @@ -491,6 +496,78 @@
>>> def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
>>> def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
>>>
>>> +let Predicates = [HasFlatAddressSpace] in {
>>> +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "FLAT_LOAD_UBYTE", VReg_32>;
>>> +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "FLAT_LOAD_SBYTE", VReg_32>;
>>> +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "FLAT_LOAD_USHORT", VReg_32>;
>>> +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "FLAT_LOAD_SSHORT", VReg_32>;
>>> +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "FLAT_LOAD_DWORD", VReg_32>;
>>> +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "FLAT_LOAD_DWORDX2", VReg_64>;
>>> +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "FLAT_LOAD_DWORDX4", VReg_128>;
>>> +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "FLAT_LOAD_DWORDX3", VReg_96>;
>>> +
>>> +def FLAT_STORE_BYTE : FLAT_Store_Helper <
>>> + 0x00000018, "FLAT_STORE_BYTE", VReg_32
>>> +>;
>>> +
>>> +def FLAT_STORE_SHORT : FLAT_Store_Helper <
>>> + 0x0000001a, "FLAT_STORE_SHORT", VReg_32
>>> +>;
>>> +
>>> +def FLAT_STORE_DWORD : FLAT_Store_Helper <
>>> + 0x0000001c, "FLAT_STORE_DWORD", VReg_32
>>> +>;
>>> +
>>> +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
>>> + 0x0000001d, "FLAT_STORE_DWORDX2", VReg_64
>>> +>;
>>> +
>>> +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
>>> + 0x0000001e, "FLAT_STORE_DWORDX4", VReg_128
>>> +>;
>>> +
>>> +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
>>> + 0x0000001e, "FLAT_STORE_DWORDX3", VReg_96
>>> +>;
>>> +
>>> +//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "FLAT_ATOMIC_SWAP", []>;
>>> +//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "FLAT_ATOMIC_CMPSWAP", []>;
>>> +//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "FLAT_ATOMIC_ADD", []>;
>>> +//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "FLAT_ATOMIC_SUB", []>;
>>> +//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "FLAT_ATOMIC_RSUB", []>;
>>> +//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "FLAT_ATOMIC_SMIN", []>;
>>> +//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "FLAT_ATOMIC_UMIN", []>;
>>> +//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "FLAT_ATOMIC_SMAX", []>;
>>> +//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "FLAT_ATOMIC_UMAX", []>;
>>> +//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "FLAT_ATOMIC_AND", []>;
>>> +//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "FLAT_ATOMIC_OR", []>;
>>> +//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "FLAT_ATOMIC_XOR", []>;
>>> +//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "FLAT_ATOMIC_INC", []>;
>>> +//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "FLAT_ATOMIC_DEC", []>;
>>> +//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "FLAT_ATOMIC_FCMPSWAP", []>;
>>> +//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "FLAT_ATOMIC_FMIN", []>;
>>> +//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "FLAT_ATOMIC_FMAX", []>;
>>> +//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "FLAT_ATOMIC_SWAP_X2", []>;
>>> +//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "FLAT_ATOMIC_CMPSWAP_X2", []>;
>>> +//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "FLAT_ATOMIC_ADD_X2", []>;
>>> +//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "FLAT_ATOMIC_SUB_X2", []>;
>>> +//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "FLAT_ATOMIC_RSUB_X2", []>;
>>> +//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "FLAT_ATOMIC_SMIN_X2", []>;
>>> +//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "FLAT_ATOMIC_UMIN_X2", []>;
>>> +//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "FLAT_ATOMIC_SMAX_X2", []>;
>>> +//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "FLAT_ATOMIC_UMAX_X2", []>;
>>> +//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "FLAT_ATOMIC_AND_X2", []>;
>>> +//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "FLAT_ATOMIC_OR_X2", []>;
>>> +//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "FLAT_ATOMIC_XOR_X2", []>;
>>> +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "FLAT_ATOMIC_INC_X2", []>;
>>> +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "FLAT_ATOMIC_DEC_X2", []>;
>>> +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "FLAT_ATOMIC_FCMPSWAP_X2", []>;
>>> +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "FLAT_ATOMIC_FMIN_X2", []>;
>>> +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "FLAT_ATOMIC_FMAX_X2", []>;
>>> +
>>> +} // End HasFlatAddressSpace predicate
>>> +
>>> +
>>> let mayLoad = 1 in {
>>>
>>> // We are using the SGPR_32 and not the SReg_32 register class for 32-bit
>>> @@ -2084,6 +2161,39 @@
>>> def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
>>> def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
>>>
>>> +//===----------------------------------------------------------------------===//
>>> +// Flat Patterns
>>> +//===----------------------------------------------------------------------===//
>>> +
>>> +class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
>>> + PatFrag flat_ld> :
>>> + Pat <(vt (flat_ld i64:$ptr)),
>>> + (Instr_ADDR64 $ptr)
>>> +>;
>>> +
>>> +def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
>>> +
>>> +class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
>>> + Pat <(st vt:$value, i64:$ptr),
>>> + (Instr $value, $ptr)
>>> + >;
>>> +
>>> +def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
>>> +def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
>>> +def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
>>> +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
>>> +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
>>> +def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
>>> +
>>> +
>>> /********** ====================== **********/
>>> /********** Indirect adressing **********/
>>> /********** ====================== **********/
>>> Index: lib/Target/R600/SILowerControlFlow.cpp
>>> ===================================================================
>>> --- lib/Target/R600/SILowerControlFlow.cpp
>>> +++ lib/Target/R600/SILowerControlFlow.cpp
>>> @@ -51,6 +51,7 @@
>>> #include "AMDGPU.h"
>>> #include "SIInstrInfo.h"
>>> #include "SIMachineFunctionInfo.h"
>>> +#include "llvm/CodeGen/MachineFrameInfo.h"
>>> #include "llvm/CodeGen/MachineFunction.h"
>>> #include "llvm/CodeGen/MachineFunctionPass.h"
>>> #include "llvm/CodeGen/MachineInstrBuilder.h"
>>> @@ -415,6 +416,7 @@
>>> bool HaveKill = false;
>>> bool NeedM0 = false;
>>> bool NeedWQM = false;
>>> + bool NeedFlat = false;
>>> unsigned Depth = 0;
>>>
>>> for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
>>> @@ -500,6 +502,24 @@
>>> NeedWQM = true;
>>> break;
>>>
>>> + case AMDGPU::FLAT_LOAD_DWORD:
>>> + case AMDGPU::FLAT_LOAD_DWORDX2:
>>> + case AMDGPU::FLAT_LOAD_DWORDX3:
>>> + case AMDGPU::FLAT_LOAD_DWORDX4:
>>> + case AMDGPU::FLAT_LOAD_SBYTE:
>>> + case AMDGPU::FLAT_LOAD_SSHORT:
>>> + case AMDGPU::FLAT_LOAD_UBYTE:
>>> + case AMDGPU::FLAT_LOAD_USHORT:
>>> + case AMDGPU::FLAT_STORE_BYTE:
>>> + case AMDGPU::FLAT_STORE_DWORD:
>>> + case AMDGPU::FLAT_STORE_DWORDX2:
>>> + case AMDGPU::FLAT_STORE_DWORDX3:
>>> + case AMDGPU::FLAT_STORE_DWORDX4:
>>> + case AMDGPU::FLAT_STORE_SHORT:
>>> + // TODO: atomics and other flat instructions
>>> + NeedFlat = true;
>>> + break;
>>> +
>>> }
>>> }
>>> }
>>> @@ -518,5 +538,39 @@
>>> AMDGPU::EXEC).addReg(AMDGPU::EXEC);
>>> }
>>>
>>> + // FIXME: This seems inappropriate to do here.
>>> + if (NeedFlat && MFI->IsKernel) {
>>> + // Insert the prologue initializing the SGPRs pointing to the scratch space
>>> + // for flat accesses.
>>> + const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
>>> +
>>> + // TODO: What to use with function calls?
>>> + unsigned StackSizeBytes = FrameInfo->getStackSize();
>>> +
>>> + int IndirectBegin = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
>>> +
>>> + // Convert register index to 256-byte unit.
>>> + // XXX - Does it mean bits? 256-bytes seems wrong.
>>> + unsigned StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
>>> +
>>> + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
>>> + "Stack limits should be smaller than 16-bits");
>>> +
>>> + // Initialize the flat scratch register pair.
>>> +
>>> + // Offset is in units of 256-bytes.
>>> + MachineBasicBlock &MBB = MF.front();
>>> + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(),
>>> + TII->get(AMDGPU::S_MOVK_I32),
>>> + AMDGPU::FLAT_SCRATCH_OFFSET).addImm(StackOffset);
>>> +
>>> + // XXX - Documentation says size is "per-thread scratch size in bytes", but
>>> + // that's crazy. Maybe it means per wave?
>>> + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(),
>>> + TII->get(AMDGPU::S_MOVK_I32),
>>> + AMDGPU::FLAT_SCRATCH_SIZE).addImm(StackSizeBytes);
>>> +
>>> + }
>>> +
>>> return true;
>>> }
>>> Index: lib/Target/R600/SIRegisterInfo.td
>>> ===================================================================
>>> --- lib/Target/R600/SIRegisterInfo.td
>>> +++ lib/Target/R600/SIRegisterInfo.td
>>> @@ -17,11 +17,20 @@
>>> }
>>>
>>> // Special Registers
>>> +
>>> +// Pair to indicate location of scratch space for flat accesses.
>>> +// Offset is in units of 256-bytes.
>>> +def FLAT_SCRATCH_OFFSET : SIReg <"FLAT_SCRATCH_OFFSET", 104>;
>>> +
>>> +// Size is the per-thread scratch size, in bytes.
>>> +def FLAT_SCRATCH_SIZE : SIReg <"FLAT_SCRATCH_SIZE", 105>;
>>> +
>>> def VCC : SIReg<"VCC", 106>;
>>> def EXEC : SIReg<"EXEC", 126>;
>>> def SCC : SIReg<"SCC", 253>;
>>> def M0 : SIReg <"M0", 124>;
>>>
>>> +
>>> // SGPR registers
>>> foreach Index = 0-101 in {
>>> def SGPR#Index : SIReg <"SGPR"#Index, Index>;
>>> Index: test/CodeGen/R600/flat-address-space.ll
>>> ===================================================================
>>> --- /dev/null
>>> +++ test/CodeGen/R600/flat-address-space.ll
>>> @@ -0,0 +1,182 @@
>>> +; RUN: llc -O0 -march=r600 -mcpu=bonaire < %s | FileCheck %s
>>> +
>>> +; Disable optimizations in case there are optimizations added that
>>> +; specialize away generic pointer accesses.
>>> +
>>> +
>>> +; CHECK-LABEL: @branch_use_flat_i32:
>>> +; CHECK: ; BB#3: ; %global
>>> +
>>> +; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
>>> +; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
>>> +
>>> +; CHECK: ; BB#2: ; %local
>>> +
>>> +; CHECK: V_MOV_B32_e32 v[[LO_VREG]], {{s[0-9]+}}
>>> +; CHECK: V_MOV_B32_e32 v[[HI_VREG]], {{s[0-9]+}}
>>> +
>>> +; CHECK: FLAT_STORE_DWORD {{v[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
>>> +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
>>> +entry:
>>> + %cmp = icmp ne i32 %c, 0
>>> + br i1 %cmp, label %local, label %global
>>> +
>>> +local:
>>> + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
>>> + br label %end
>>> +
>>> +global:
>>> + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
>>> + br label %end
>>> +
>>> +end:
>>> + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
>>> + store i32 %x, i32 addrspace(4)* %fptr, align 4
>>> +; %val = load i32 addrspace(4)* %fptr, align 4
>>> +; store i32 %val, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +
>>> +
>>> +; These testcases might become useless when there are optimizations to
>>> +; remove generic pointers.
>>> +
>>> +; CHECK-LABEL: @store_flat_i32:
>>> +; CHECK: V_MOV_B32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
>>> +; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
>>> +; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
>>> +; CHECK: FLAT_STORE_DWORD v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
>>> +define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
>>> + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
>>> + store i32 %x, i32 addrspace(4)* %fptr, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @store_flat_i64:
>>> +; CHECK: FLAT_STORE_DWORDX2
>>> +define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
>>> + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
>>> + store i64 %x, i64 addrspace(4)* %fptr, align 8
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @store_flat_v4i32:
>>> +; CHECK: FLAT_STORE_DWORDX4
>>> +define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
>>> + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
>>> + store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @store_flat_trunc_i16:
>>> +; CHECK: FLAT_STORE_SHORT
>>> +define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
>>> + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
>>> + %y = trunc i32 %x to i16
>>> + store i16 %y, i16 addrspace(4)* %fptr, align 2
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @store_flat_trunc_i8:
>>> +; CHECK: FLAT_STORE_BYTE
>>> +define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
>>> + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
>>> + %y = trunc i32 %x to i8
>>> + store i8 %y, i8 addrspace(4)* %fptr, align 2
>>> + ret void
>>> +}
>>> +
>>> +
>>> +
>>> +; CHECK-LABEL: @load_flat_i32:
>>> +; CHECK: FLAT_LOAD_DWORD
>>> +define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
>>> + %fload = load i32 addrspace(4)* %fptr, align 4
>>> + store i32 %fload, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @load_flat_i64:
>>> +; CHECK: FLAT_LOAD_DWORDX2
>>> +define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
>>> + %fload = load i64 addrspace(4)* %fptr, align 4
>>> + store i64 %fload, i64 addrspace(1)* %out, align 8
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @load_flat_v4i32:
>>> +; CHECK: FLAT_LOAD_DWORDX4
>>> +define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
>>> + %fload = load <4 x i32> addrspace(4)* %fptr, align 4
>>> + store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @sextload_flat_i8:
>>> +; CHECK: FLAT_LOAD_SBYTE
>>> +define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
>>> + %fload = load i8 addrspace(4)* %fptr, align 4
>>> + %ext = sext i8 %fload to i32
>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @zextload_flat_i8:
>>> +; CHECK: FLAT_LOAD_UBYTE
>>> +define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
>>> + %fload = load i8 addrspace(4)* %fptr, align 4
>>> + %ext = zext i8 %fload to i32
>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @sextload_flat_i16:
>>> +; CHECK: FLAT_LOAD_SSHORT
>>> +define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
>>> + %fload = load i16 addrspace(4)* %fptr, align 4
>>> + %ext = sext i16 %fload to i32
>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @zextload_flat_i16:
>>> +; CHECK: FLAT_LOAD_USHORT
>>> +define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
>>> + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
>>> + %fload = load i16 addrspace(4)* %fptr, align 4
>>> + %ext = zext i16 %fload to i32
>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +declare void @llvm.AMDGPU.barrier.local() #1
>>> +
>>> +
>>> +; Check for prologue initializing special SGPRs pointing to scratch.
>>> +; CHECK-LABEL: @store_flat_scratch:
>>> +; CHECK: S_MOVK_I32 FLAT_SCRATCH_SIZE, 40
>>> +; CHECK: S_MOVK_I32 FLAT_SCRATCH_OFFSET,
>>> +; CHECK: FLAT_STORE_DWORD
>>> +; CHECK: S_BARRIER
>>> +; CHECK: FLAT_LOAD_DWORD
>>> +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32 %x) #0 {
>>> + %alloca = alloca i32, i32 9, align 4
>>> + %pptr = getelementptr i32* %alloca, i32 %x
>>> + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
>>> + store i32 %x, i32 addrspace(4)* %fptr
>>> + ; Dummy call
>>> + call void @llvm.AMDGPU.barrier.local() #1
>>> + %reload = load i32 addrspace(4)* %fptr, align 4
>>> + store i32 %reload, i32 addrspace(1)* %out, align 4
>>> + ret void
>>> +}
>>> +
>>> +attributes #0 = { nounwind }
>>> +attributes #1 = { nounwind noduplicate }
>>> _______________________________________________
>>> llvm-commits mailing list
>>> llvm-commits at cs.uiuc.edu
>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
More information about the llvm-commits
mailing list