[PATCH] R600/SI: Add preliminary support for flat address space
Matt Arsenault
arsenm2 at gmail.com
Mon Sep 15 08:51:59 PDT 2014
On Sep 15, 2014, at 10:01 AM, Tom Stellard <tom at stellard.net> wrote:
> LGTM.
>
> -Tom
>
r217775-r217777
I reverted the register initialization to use 2 separate movs since I’m not sure what s_mov_b64 does with an immediate.
> On Sat, Sep 13, 2014 at 03:29:37PM -0400, Matt Arsenault wrote:
>> Attached are new versions. I’ve fixed addrspacecast selection so that it now doesn’t get verifier errors, but it’s also mostly wrong.
>>
>>
>> On Sep 11, 2014, at 5:09 PM, Tom Stellard <tom at stellard.net> wrote:
>>
>>> On Thu, Sep 11, 2014 at 08:32:45PM +0000, Matt Arsenault wrote:
>>>> Rebase to ToT (top of tree)
>>>>
>>>> http://reviews.llvm.org/D2707
>>>>
>>>> Files:
>>>> lib/Target/R600/AMDGPU.td
>>>> lib/Target/R600/AMDGPUAsmPrinter.cpp
>>>> lib/Target/R600/AMDGPUAsmPrinter.h
>>>> lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>>> lib/Target/R600/AMDGPUInstrInfo.h
>>>> lib/Target/R600/AMDGPUInstructions.td
>>>> lib/Target/R600/AMDGPUMachineFunction.cpp
>>>> lib/Target/R600/AMDGPUMachineFunction.h
>>>> lib/Target/R600/AMDGPUSubtarget.cpp
>>>> lib/Target/R600/AMDGPUSubtarget.h
>>>> lib/Target/R600/SIInstrFormats.td
>>>> lib/Target/R600/SIInstrInfo.cpp
>>>> lib/Target/R600/SIInstrInfo.td
>>>> lib/Target/R600/SIInstructions.td
>>>> lib/Target/R600/SILowerControlFlow.cpp
>>>> lib/Target/R600/SIRegisterInfo.td
>>>> test/CodeGen/R600/flat-address-space.ll
>>>
>>>> Index: lib/Target/R600/AMDGPU.td
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPU.td
>>>> +++ lib/Target/R600/AMDGPU.td
>>>> @@ -81,6 +81,11 @@
>>>> "true",
>>>> "GPU has CF_ALU bug">;
>>>>
>>>> +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
>>>> + "FlatAddressSpace",
>>>> + "true",
>>>> + "Support flat address space">;
>>>> +
>>>> class SubtargetFeatureFetchLimit <string Value> :
>>>> SubtargetFeature <"fetch"#Value,
>>>> "TexVTXClauseSize",
>>>> @@ -135,7 +140,7 @@
>>>>
>>>> def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
>>>> [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
>>>> - FeatureWavefrontSize64]>;
>>>> + FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
>>>> //===----------------------------------------------------------------------===//
>>>>
>>>> def AMDGPUInstrInfo : InstrInfo {
>>>> Index: lib/Target/R600/AMDGPUAsmPrinter.cpp
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUAsmPrinter.cpp
>>>> +++ lib/Target/R600/AMDGPUAsmPrinter.cpp
>>>> @@ -240,6 +240,7 @@
>>>> unsigned MaxSGPR = 0;
>>>> unsigned MaxVGPR = 0;
>>>> bool VCCUsed = false;
>>>> + bool FlatUsed = false;
>>>> const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
>>>> TM.getSubtargetImpl()->getRegisterInfo());
>>>>
>>>> @@ -262,6 +263,10 @@
>>>> reg == AMDGPU::VCC_HI) {
>>>> VCCUsed = true;
>>>> continue;
>>>> + } else if (reg == AMDGPU::FLAT_SCRATCH_SIZE ||
>>>> + reg == AMDGPU::FLAT_SCRATCH_OFFSET) {
>>>> + FlatUsed = true;
>>>> + continue;
>>>> }
>>>>
>>>> switch (reg) {
>>>> @@ -322,6 +327,9 @@
>>>> if (VCCUsed)
>>>> MaxSGPR += 2;
>>>>
>>>> + if (FlatUsed)
>>>> + MaxSGPR += 2;
>>>> +
>>>> ProgInfo.NumVGPR = MaxVGPR;
>>>> ProgInfo.NumSGPR = MaxSGPR;
>>>>
>>>> @@ -339,6 +347,11 @@
>>>> ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
>>>>
>>>> ProgInfo.CodeLen = CodeSize;
>>>> +
>>>> + ProgInfo.NumSGPR = MaxSGPR;
>>>> + ProgInfo.NumVGPR = MaxVGPR;
>>>> + ProgInfo.VCCUsed = VCCUsed;
>>>> + ProgInfo.FlatUsed = FlatUsed;
>>>> }
>>>>
>>>> void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
>>>> @@ -392,6 +405,7 @@
>>>> OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
>>>>
>>>> OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
>>>> +
>>>
>>> Random whitespace change.
>>>
>>>> const uint32_t ComputePGMRSrc2 =
>>>> S_00B84C_LDS_SIZE(LDSBlocks) |
>>>> S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
>>>> @@ -400,6 +414,8 @@
>>>>
>>>> OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
>>>> OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
>>>> +
>>>> + // TODO: Should probably note flat usage somewhere
>>>> } else {
>>>> OutStreamer.EmitIntValue(RsrcReg, 4);
>>>> OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
>>>> Index: lib/Target/R600/AMDGPUAsmPrinter.h
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUAsmPrinter.h
>>>> +++ lib/Target/R600/AMDGPUAsmPrinter.h
>>>> @@ -33,6 +33,8 @@
>>>> DebugMode(0),
>>>> IEEEMode(0),
>>>> ScratchSize(0),
>>>> + FlatUsed(false),
>>>> + VCCUsed(false),
>>>> CodeLen(0) {}
>>>>
>>>> // Fields set in PGM_RSRC1 pm4 packet.
>>>> @@ -46,7 +48,10 @@
>>>> uint32_t IEEEMode;
>>>> uint32_t ScratchSize;
>>>>
>>>> + bool FlatUsed;
>>>> +
>>>> // Bonus information for debugging.
>>>> + bool VCCUsed;
>>>> uint64_t CodeLen;
>>>> };
>>>>
>>>> Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>>> +++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp
>>>> @@ -65,6 +65,7 @@
>>>> static bool checkPrivateAddress(const MachineMemOperand *Op);
>>>>
>>>> static bool isGlobalStore(const StoreSDNode *N);
>>>> + static bool isFlatStore(const StoreSDNode *N);
>>>> static bool isPrivateStore(const StoreSDNode *N);
>>>> static bool isLocalStore(const StoreSDNode *N);
>>>> static bool isRegionStore(const StoreSDNode *N);
>>>> @@ -72,6 +73,7 @@
>>>> bool isCPLoad(const LoadSDNode *N) const;
>>>> bool isConstantLoad(const LoadSDNode *N, int cbID) const;
>>>> bool isGlobalLoad(const LoadSDNode *N) const;
>>>> + bool isFlatLoad(const LoadSDNode *N) const;
>>>> bool isParamLoad(const LoadSDNode *N) const;
>>>> bool isPrivateLoad(const LoadSDNode *N) const;
>>>> bool isLocalLoad(const LoadSDNode *N) const;
>>>> @@ -104,6 +106,7 @@
>>>> bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
>>>> SDValue &Offset, SDValue &GLC, SDValue &SLC,
>>>> SDValue &TFE) const;
>>>> + SDNode *SelectAddrSpaceCast(SDNode *N);
>>>> bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
>>>> bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
>>>> SDValue &Clamp, SDValue &Omod) const;
>>>> @@ -440,7 +443,6 @@
>>>> CurDAG->getVTList(MVT::Other),
>>>> Ops);
>>>> }
>>>> -
>>>
>>> Random whitespace.
>>>
>>>> case AMDGPUISD::BFE_I32:
>>>> case AMDGPUISD::BFE_U32: {
>>>> if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
>>>> @@ -484,6 +486,8 @@
>>>> case AMDGPUISD::DIV_SCALE: {
>>>> return SelectDIV_SCALE(N);
>>>> }
>>>> + case ISD::ADDRSPACECAST:
>>>> + return SelectAddrSpaceCast(N);
>>>> }
>>>> return SelectCode(N);
>>>> }
>>>> @@ -522,6 +526,10 @@
>>>> return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
>>>> }
>>>>
>>>> +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
>>>> + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
>>>> +}
>>>> +
>>>> bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
>>>> return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
>>>> }
>>>> @@ -553,6 +561,10 @@
>>>> return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
>>>> }
>>>>
>>>> +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
>>>> + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
>>>> +}
>>>> +
>>>> bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
>>>> return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
>>>> }
>>>> @@ -582,10 +594,11 @@
>>>> const Value *MemVal = N->getMemOperand()->getValue();
>>>> if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
>>>> !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
>>>> + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
>>>> !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
>>>> !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
>>>> !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
>>>> - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
>>>> + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
>>>> return true;
>>>> }
>>>> return false;
>>>> @@ -1005,6 +1018,58 @@
>>>> return false;
>>>> }
>>>>
>>>> +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
>>>> + AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
>>>> + SDLoc DL(N);
>>>> +
>>>> + assert(Subtarget.hasFlatAddressSpace() &&
>>>> + "addrspacecast only supported with flat address space!");
>>>> +
>>>> + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
>>>> + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
>>>> + "Cannot cast address space to / from constant address!");
>>>> +
>>>> + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
>>>> + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
>>>> + "Can only cast to / from flat address space!");
>>>> +
>>>> + // The flat instructions read the address as the index of the VGPR holding the
>>>> + // address, so casting should just be reinterpreting the base VGPR, so just
>>>> + // insert trunc / bitcast / zext.
>>>> +
>>>> + SDValue Src = ASC->getOperand(0);
>>>> + EVT DestVT = ASC->getValueType(0);
>>>> + EVT SrcVT = Src.getValueType();
>>>> +
>>>> + unsigned SrcSize = SrcVT.getSizeInBits();
>>>> + unsigned DestSize = DestVT.getSizeInBits();
>>>> +
>>>> + if (SrcSize > DestSize) {
>>>> + assert(SrcSize == 64 && DestSize == 32);
>>>> + return CurDAG->getMachineNode(
>>>> + TargetOpcode::EXTRACT_SUBREG,
>>>> + DL,
>>>> + DestVT,
>>>> + Src,
>>>> + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32));
>>>> + }
>>>> +
>>>> +
>>>> + if (DestSize > SrcSize) {
>>>> + assert(SrcSize == 32 && DestSize == 64);
>>>> + return CurDAG->getMachineNode(
>>>> + TargetOpcode::SUBREG_TO_REG,
>>>> + DL,
>>>> + DestVT,
>>>> + CurDAG->getTargetConstant(0, MVT::i32),
>>>> + Src,
>>>> + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32));
>>>> + }
>>>> +
>>>> + assert(SrcSize == 64 && DestSize == 64);
>>>> + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
>>>> +}
>>>> +
>>>> bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
>>>> SDValue &SrcMods) const {
>>>>
>>>> Index: lib/Target/R600/AMDGPUInstrInfo.h
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUInstrInfo.h
>>>> +++ lib/Target/R600/AMDGPUInstrInfo.h
>>>> @@ -95,6 +95,7 @@
>>>> MachineInstr *MI,
>>>> const SmallVectorImpl<unsigned> &Ops,
>>>> MachineInstr *LoadMI) const override;
>>>> +public:
>>>> /// \returns the smallest register index that will be accessed by an indirect
>>>> /// read or write or -1 if indirect addressing is not used by this program.
>>>> int getIndirectIndexBegin(const MachineFunction &MF) const;
>>>> @@ -103,7 +104,6 @@
>>>> /// read or write or -1 if indirect addressing is not used by this program.
>>>> int getIndirectIndexEnd(const MachineFunction &MF) const;
>>>>
>>>> -public:
>>>
>>> Another random change.
>>>
>>
>> This is intentional and makes getIndirectIndexBegin / getIndirectIndexEnd public
>>
>>
>>>> bool canFoldMemoryOperand(const MachineInstr *MI,
>>>> const SmallVectorImpl<unsigned> &Ops) const override;
>>>> bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
>>>> Index: lib/Target/R600/AMDGPUInstructions.td
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUInstructions.td
>>>> +++ lib/Target/R600/AMDGPUInstructions.td
>>>> @@ -195,6 +195,14 @@
>>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>>> }]>;
>>>>
>>>> +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
>>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>>> +}]>;
>>>> +
>>>> +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
>>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>>> +}]>;
>>>> +
>>>> def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
>>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>>> }]>;
>>>> @@ -223,6 +231,14 @@
>>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>>> }]>;
>>>>
>>>> +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
>>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>>> +}]>;
>>>> +
>>>> +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
>>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>>> +}]>;
>>>> +
>>>> def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
>>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>>> }]>;
>>>> @@ -248,6 +264,11 @@
>>>> return isGlobalLoad(dyn_cast<LoadSDNode>(N));
>>>> }]>;
>>>>
>>>> +def az_extloadi32_flat : PatFrag<(ops node:$ptr),
>>>> + (az_extloadi32 node:$ptr), [{
>>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>>> +}]>;
>>>> +
>>>> def az_extloadi32_constant : PatFrag<(ops node:$ptr),
>>>> (az_extloadi32 node:$ptr), [{
>>>> return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
>>>> @@ -263,6 +284,16 @@
>>>> return isGlobalStore(dyn_cast<StoreSDNode>(N));
>>>> }]>;
>>>>
>>>> +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
>>>> + (truncstorei8 node:$val, node:$ptr), [{
>>>> + return isFlatStore(dyn_cast<StoreSDNode>(N));
>>>> +}]>;
>>>> +
>>>> +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
>>>> + (truncstorei16 node:$val, node:$ptr), [{
>>>> + return isFlatStore(dyn_cast<StoreSDNode>(N));
>>>> +}]>;
>>>> +
>>>> def local_store : PatFrag<(ops node:$val, node:$ptr),
>>>> (store node:$val, node:$ptr), [{
>>>> return isLocalStore(dyn_cast<StoreSDNode>(N));
>>>> @@ -318,6 +349,7 @@
>>>> return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
>>>> }]>;
>>>>
>>>> +
>>>> def atomic_cmp_swap_32_local :
>>>> PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
>>>> (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
>>>> @@ -334,6 +366,20 @@
>>>> AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
>>>> }]>;
>>>>
>>>> +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
>>>> + return isFlatLoad(dyn_cast<LoadSDNode>(N));
>>>> +}]>;
>>>> +
>>>> +def flat_store : PatFrag<(ops node:$val, node:$ptr),
>>>> + (store node:$val, node:$ptr), [{
>>>> + return isFlatStore(dyn_cast<StoreSDNode>(N));
>>>> +}]>;
>>>> +
>>>> +def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
>>>> + (AMDGPUstore_mskor node:$val, node:$ptr), [{
>>>> + return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
>>>> +}]>;
>>>> +
>>>> //===----------------------------------------------------------------------===//
>>>> // Misc Pattern Fragments
>>>> //===----------------------------------------------------------------------===//
>>>> Index: lib/Target/R600/AMDGPUMachineFunction.cpp
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUMachineFunction.cpp
>>>> +++ lib/Target/R600/AMDGPUMachineFunction.cpp
>>>> @@ -12,7 +12,9 @@
>>>> AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
>>>> MachineFunctionInfo(),
>>>> ShaderType(ShaderType::COMPUTE),
>>>> - LDSSize(0) {
>>>> + LDSSize(0),
>>>> + ScratchSize(0),
>>>> + IsKernel(true) {
>>>> AttributeSet Set = MF.getFunction()->getAttributes();
>>>> Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
>>>> ShaderTypeAttribute);
>>>> Index: lib/Target/R600/AMDGPUMachineFunction.h
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUMachineFunction.h
>>>> +++ lib/Target/R600/AMDGPUMachineFunction.h
>>>> @@ -33,6 +33,9 @@
>>>> unsigned getShaderType() const {
>>>> return ShaderType;
>>>> }
>>>> +
>>>> + unsigned ScratchSize;
>>>> + bool IsKernel;
>>>> };
>>>>
>>>> }
>>>> Index: lib/Target/R600/AMDGPUSubtarget.cpp
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUSubtarget.cpp
>>>> +++ lib/Target/R600/AMDGPUSubtarget.cpp
>>>> @@ -77,14 +77,14 @@
>>>> DumpCode(false), R600ALUInst(false), HasVertexCache(false),
>>>> TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
>>>> FP64Denormals(false), FP32Denormals(false), CaymanISA(false),
>>>> - EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
>>>> + FlatAddressSpace(false), EnableIRStructurizer(true),
>>>> + EnablePromoteAlloca(false), EnableIfCvt(true),
>>>> WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
>>>> DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
>>>> FrameLowering(TargetFrameLowering::StackGrowsUp,
>>>> 64 * 16, // Maximum stack alignment (long16)
>>>> 0),
>>>> InstrItins(getInstrItineraryForCPU(GPU)) {
>>>> -
>>>> if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
>>>> InstrInfo.reset(new R600InstrInfo(*this));
>>>> TLInfo.reset(new R600TargetLowering(TM));
>>>> Index: lib/Target/R600/AMDGPUSubtarget.h
>>>> ===================================================================
>>>> --- lib/Target/R600/AMDGPUSubtarget.h
>>>> +++ lib/Target/R600/AMDGPUSubtarget.h
>>>> @@ -56,6 +56,7 @@
>>>> bool FP64Denormals;
>>>> bool FP32Denormals;
>>>> bool CaymanISA;
>>>> + bool FlatAddressSpace;
>>>> bool EnableIRStructurizer;
>>>> bool EnablePromoteAlloca;
>>>> bool EnableIfCvt;
>>>> @@ -124,6 +125,10 @@
>>>> return FP64Denormals;
>>>> }
>>>>
>>>> + bool hasFlatAddressSpace() const {
>>>> + return FlatAddressSpace;
>>>> + }
>>>> +
>>>> bool hasBFE() const {
>>>> return (getGeneration() >= EVERGREEN);
>>>> }
>>>> Index: lib/Target/R600/SIInstrFormats.td
>>>> ===================================================================
>>>> --- lib/Target/R600/SIInstrFormats.td
>>>> +++ lib/Target/R600/SIInstrFormats.td
>>>> @@ -425,8 +425,27 @@
>>>> let Inst{57-53} = SSAMP{6-2};
>>>> }
>>>>
>>>> -class EXPe : Enc64 {
>>>> +class FLATe<bits<7> op> : Enc64 {
>>>> + bits<8> addr;
>>>> + bits<8> data;
>>>> + bits<8> vdst;
>>>> + bits<1> slc;
>>>> + bits<1> glc;
>>>> + bits<1> tfe;
>>>>
>>>> + // 15-0 is reserved.
>>>> + let Inst{16} = glc;
>>>> + let Inst{17} = slc;
>>>> + let Inst{24-18} = op;
>>>> + let Inst{31-26} = 0x37; // Encoding.
>>>> + let Inst{39-32} = addr;
>>>> + let Inst{47-40} = data;
>>>> + // 54-48 is reserved.
>>>> + let Inst{55} = tfe;
>>>> + let Inst{63-56} = vdst;
>>>> +}
>>>> +
>>>> +class EXPe : Enc64 {
>>>> bits<4> EN;
>>>> bits<6> TGT;
>>>> bits<1> COMPR;
>>>> @@ -532,6 +551,16 @@
>>>> let neverHasSideEffects = 1;
>>>> }
>>>>
>>>> +class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
>>>> + InstSI<outs, ins, asm, pattern>, FLATe <op> {
>>>> + // Internally, FLAT instruction are executed as both an LDS and a
>>>> + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
>>>> + // and are not considered done until both have been decremented.
>>>> + let VM_CNT = 1;
>>>> + let EXP_CNT = 1; // XXX - Need this?
>>>
>>> I don't think we need EXP_CNT here.
>>>
>>>> + let LGKM_CNT = 1;
>>>> +}
>>>> +
>>>> iclass MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
>>>> InstSI <outs, ins, asm, pattern>, MIMGe <op> {
>>>>
>>>> Index: lib/Target/R600/SIInstrInfo.cpp
>>>> ===================================================================
>>>> --- lib/Target/R600/SIInstrInfo.cpp
>>>> +++ lib/Target/R600/SIInstrInfo.cpp
>>>> @@ -843,6 +843,11 @@
>>>> if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
>>>> ++ConstantBusCount;
>>>>
>>>> + // XXX - I'm sort of guessing about this.
>>>> + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCRATCH_SIZE ||
>>>> + MO.getReg() == AMDGPU::FLAT_SCRATCH_OFFSET))
>>>> + ++ConstantBusCount;
>>>> +
>>>
>>> This is correct. These registers are SGPRs, so they use the constant bus.
>>>
>>>> // SGPRs use the constant bus
>>>> if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
>>>> (!MO.isImplicit() &&
>>>> Index: lib/Target/R600/SIInstrInfo.td
>>>> ===================================================================
>>>> --- lib/Target/R600/SIInstrInfo.td
>>>> +++ lib/Target/R600/SIInstrInfo.td
>>>> @@ -209,6 +209,8 @@
>>>> def SIOperand {
>>>> int ZERO = 0x80;
>>>> int VCC = 0x6A;
>>>> + int FLAT_SCRATCH_OFFSET = 0x68;
>>>> + int FLAT_SCRATCH_SIZE = 0x69;
>>>> }
>>>>
>>>> def SRCMODS {
>>>> @@ -1063,6 +1065,31 @@
>>>> }
>>>> }
>>>>
>>>> +class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
>>>> + FLAT <op, (outs regClass:$data),
>>>> + (ins VReg_64:$addr),
>>>> + asm#" $data, $addr, [M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE]", []> {
>>>> + let glc = 0;
>>>> + let slc = 0;
>>>> + let tfe = 0;
>>>> + let mayLoad = 1;
>>>> + let Uses = [EXEC, M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE];
>>>> +}
>>>> +
>>>> +class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
>>>> + FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr),
>>>> + name#" $data, $addr, [M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE]",
>>>> + []> {
>>>> +
>>>> + let mayLoad = 0;
>>>> + let mayStore = 1;
>>>> +
>>>> + // Encoding
>>>> + let glc = 0;
>>>> + let slc = 0;
>>>> + let tfe = 0;
>>>> +}
>>>> +
>>>> class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
>>>> op,
>>>> (outs regClass:$dst),
>>>> Index: lib/Target/R600/SIInstructions.td
>>>> ===================================================================
>>>> --- lib/Target/R600/SIInstructions.td
>>>> +++ lib/Target/R600/SIInstructions.td
>>>> @@ -31,6 +31,7 @@
>>>>
>>>> def isCI : Predicate<"Subtarget.getGeneration() "
>>>> ">= AMDGPUSubtarget::SEA_ISLANDS">;
>>>> +def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
>>>>
>>>> def isCFDepth0 : Predicate<"isCFDepth0()">;
>>>>
>>>> @@ -1044,6 +1045,80 @@
>>>> //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
>>>>
>>>> //===----------------------------------------------------------------------===//
>>>> +// Flat Instructions
>>>> +//===----------------------------------------------------------------------===//
>>>> +
>>>> +let Predicates = [HasFlatAddressSpace] in {
>>>> +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "FLAT_LOAD_UBYTE", VReg_32>;
>>>> +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "FLAT_LOAD_SBYTE", VReg_32>;
>>>> +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "FLAT_LOAD_USHORT", VReg_32>;
>>>> +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "FLAT_LOAD_SSHORT", VReg_32>;
>>>> +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "FLAT_LOAD_DWORD", VReg_32>;
>>>> +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "FLAT_LOAD_DWORDX2", VReg_64>;
>>>> +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "FLAT_LOAD_DWORDX4", VReg_128>;
>>>> +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "FLAT_LOAD_DWORDX3", VReg_96>;
>>>> +
>>>> +def FLAT_STORE_BYTE : FLAT_Store_Helper <
>>>> + 0x00000018, "FLAT_STORE_BYTE", VReg_32
>>>> +>;
>>>> +
>>>> +def FLAT_STORE_SHORT : FLAT_Store_Helper <
>>>> + 0x0000001a, "FLAT_STORE_SHORT", VReg_32
>>>> +>;
>>>> +
>>>> +def FLAT_STORE_DWORD : FLAT_Store_Helper <
>>>> + 0x0000001c, "FLAT_STORE_DWORD", VReg_32
>>>> +>;
>>>> +
>>>> +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
>>>> + 0x0000001d, "FLAT_STORE_DWORDX2", VReg_64
>>>> +>;
>>>> +
>>>> +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
>>>> + 0x0000001e, "FLAT_STORE_DWORDX4", VReg_128
>>>> +>;
>>>> +
>>>> +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
>>>> + 0x0000001e, "FLAT_STORE_DWORDX3", VReg_96
>>>> +>;
>>>> +
>>>> +//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "FLAT_ATOMIC_SWAP", []>;
>>>> +//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "FLAT_ATOMIC_CMPSWAP", []>;
>>>> +//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "FLAT_ATOMIC_ADD", []>;
>>>> +//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "FLAT_ATOMIC_SUB", []>;
>>>> +//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "FLAT_ATOMIC_RSUB", []>;
>>>> +//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "FLAT_ATOMIC_SMIN", []>;
>>>> +//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "FLAT_ATOMIC_UMIN", []>;
>>>> +//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "FLAT_ATOMIC_SMAX", []>;
>>>> +//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "FLAT_ATOMIC_UMAX", []>;
>>>> +//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "FLAT_ATOMIC_AND", []>;
>>>> +//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "FLAT_ATOMIC_OR", []>;
>>>> +//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "FLAT_ATOMIC_XOR", []>;
>>>> +//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "FLAT_ATOMIC_INC", []>;
>>>> +//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "FLAT_ATOMIC_DEC", []>;
>>>> +//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "FLAT_ATOMIC_FCMPSWAP", []>;
>>>> +//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "FLAT_ATOMIC_FMIN", []>;
>>>> +//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "FLAT_ATOMIC_FMAX", []>;
>>>> +//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "FLAT_ATOMIC_SWAP_X2", []>;
>>>> +//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "FLAT_ATOMIC_CMPSWAP_X2", []>;
>>>> +//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "FLAT_ATOMIC_ADD_X2", []>;
>>>> +//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "FLAT_ATOMIC_SUB_X2", []>;
>>>> +//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "FLAT_ATOMIC_RSUB_X2", []>;
>>>> +//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "FLAT_ATOMIC_SMIN_X2", []>;
>>>> +//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "FLAT_ATOMIC_UMIN_X2", []>;
>>>> +//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "FLAT_ATOMIC_SMAX_X2", []>;
>>>> +//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "FLAT_ATOMIC_UMAX_X2", []>;
>>>> +//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "FLAT_ATOMIC_AND_X2", []>;
>>>> +//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "FLAT_ATOMIC_OR_X2", []>;
>>>> +//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "FLAT_ATOMIC_XOR_X2", []>;
>>>> +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "FLAT_ATOMIC_INC_X2", []>;
>>>> +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "FLAT_ATOMIC_DEC_X2", []>;
>>>> +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "FLAT_ATOMIC_FCMPSWAP_X2", []>;
>>>> +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "FLAT_ATOMIC_FMIN_X2", []>;
>>>> +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "FLAT_ATOMIC_FMAX_X2", []>;
>>>> +
>>>> +} // End HasFlatAddressSpace predicate
>>>> +//===----------------------------------------------------------------------===//
>>>> // VOP1 Instructions
>>>> //===----------------------------------------------------------------------===//
>>>>
>>>> @@ -2822,6 +2897,37 @@
>>>>
>>>> } // End iSCI
>>>>
>>>> +//===----------------------------------------------------------------------===//
>>>> +// Flat Patterns
>>>> +//===----------------------------------------------------------------------===//
>>>> +
>>>> +class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
>>>> + PatFrag flat_ld> :
>>>> + Pat <(vt (flat_ld i64:$ptr)),
>>>> + (Instr_ADDR64 $ptr)
>>>> +>;
>>>> +
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
>>>> +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
>>>> +
>>>> +class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
>>>> + Pat <(st vt:$value, i64:$ptr),
>>>> + (Instr $value, $ptr)
>>>> + >;
>>>> +
>>>> +def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
>>>> +def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
>>>> +def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
>>>> +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
>>>> +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
>>>> +def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
>>>>
>>>> /********** ====================== **********/
>>>> /********** Indirect adressing **********/
>>>> Index: lib/Target/R600/SILowerControlFlow.cpp
>>>> ===================================================================
>>>> --- lib/Target/R600/SILowerControlFlow.cpp
>>>> +++ lib/Target/R600/SILowerControlFlow.cpp
>>>> @@ -52,6 +52,7 @@
>>>> #include "AMDGPUSubtarget.h"
>>>> #include "SIInstrInfo.h"
>>>> #include "SIMachineFunctionInfo.h"
>>>> +#include "llvm/CodeGen/MachineFrameInfo.h"
>>>> #include "llvm/CodeGen/MachineFunction.h"
>>>> #include "llvm/CodeGen/MachineFunctionPass.h"
>>>> #include "llvm/CodeGen/MachineInstrBuilder.h"
>>>> @@ -451,6 +452,7 @@
>>>> bool HaveKill = false;
>>>> bool NeedM0 = false;
>>>> bool NeedWQM = false;
>>>> + bool NeedFlat = false;
>>>> unsigned Depth = 0;
>>>>
>>>> for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
>>>> @@ -533,6 +535,24 @@
>>>> NeedWQM = true;
>>>> break;
>>>>
>>>> + case AMDGPU::FLAT_LOAD_DWORD:
>>>> + case AMDGPU::FLAT_LOAD_DWORDX2:
>>>> + case AMDGPU::FLAT_LOAD_DWORDX3:
>>>> + case AMDGPU::FLAT_LOAD_DWORDX4:
>>>> + case AMDGPU::FLAT_LOAD_SBYTE:
>>>> + case AMDGPU::FLAT_LOAD_SSHORT:
>>>> + case AMDGPU::FLAT_LOAD_UBYTE:
>>>> + case AMDGPU::FLAT_LOAD_USHORT:
>>>> + case AMDGPU::FLAT_STORE_BYTE:
>>>> + case AMDGPU::FLAT_STORE_DWORD:
>>>> + case AMDGPU::FLAT_STORE_DWORDX2:
>>>> + case AMDGPU::FLAT_STORE_DWORDX3:
>>>> + case AMDGPU::FLAT_STORE_DWORDX4:
>>>> + case AMDGPU::FLAT_STORE_SHORT:
>>>
>>> We should use the TSFlags to add an isFlat() helper for this.
>>>
>>>> + // TODO: atomics and other flat instructions
>>>> + NeedFlat = true;
>>>> + break;
>>>> +
>>>> }
>>>> }
>>>> }
>>>> @@ -550,5 +570,39 @@
>>>> AMDGPU::EXEC).addReg(AMDGPU::EXEC);
>>>> }
>>>>
>>>> + // FIXME: This seems inappropriate to do here.
>>>> + if (NeedFlat && MFI->IsKernel) {
>>>> + // Insert the prologue initializing the SGPRs pointing to the scratch space
>>>> + // for flat accesses.
>>>> + const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
>>>> +
>>>> + // TODO: What to use with function calls?
>>>> + unsigned StackSizeBytes = FrameInfo->getStackSize();
>>>> +
>>>> + int IndirectBegin = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
>>>> +
>>>> + // Convert register index to 256-byte unit.
>>>> + // XXX - Does it mean bits? 256-bytes seems wrong.
>>>> + unsigned StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
>>>
>>> We don't use indirect registers anymore, so you don't need to calculate the offset.
>>
>> I would rather keep the offset calculation in place for when indirect registers are used again in the future.
>>
>>
>>>
>>>> +
>>>> + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
>>>> + "Stack limits should be smaller than 16-bits");
>>>> +
>>>> + // Initialize the flat scratch register pair.
>>>> +
>>>> + // Offset is in units of 256-bytes.
>>>> + MachineBasicBlock &MBB = MF.front();
>>>> + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(),
>>>> + TII->get(AMDGPU::S_MOVK_I32),
>>>> + AMDGPU::FLAT_SCRATCH_OFFSET).addImm(StackOffset);
>>>> +
>>>> + // XXX - Documentation says size is "per-thread scratch size in bytes", but
>>>> + // that's crazy. Maybe it means per wave?
>>>
>>> I'm pretty sure per-thread scratch size is correct.
>>
>> I’ve changed this to use a single 64-bit mov to initialize both at the same time
>>
>>
>>
>>>
>>>> + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(),
>>>> + TII->get(AMDGPU::S_MOVK_I32),
>>>> + AMDGPU::FLAT_SCRATCH_SIZE).addImm(StackSizeBytes);
>>>> +
>>>> + }
>>>> +
>>>> return true;
>>>> }
>>>> Index: lib/Target/R600/SIRegisterInfo.td
>>>> ===================================================================
>>>> --- lib/Target/R600/SIRegisterInfo.td
>>>> +++ lib/Target/R600/SIRegisterInfo.td
>>>> @@ -39,6 +39,13 @@
>>>> def SCC : SIReg<"SCC", 253>;
>>>> def M0 : SIReg <"M0", 124>;
>>>>
>>>> +// Pair to indicate location of scratch space for flat accesses.
>>>> +// Offset is in units of 256-bytes,
>>>> +def FLAT_SCRATCH_OFFSET : SIReg <"FLAT_SCRATCH_OFFSET", 104>;
>>>> +
>>>> +// Size is the per-thread scratch size, in bytes.
>>>> +def FLAT_SCRATCH_SIZE : SIReg <"FLAT_SCRATCH_SIZE", 105>;
>>>> +
>>>
>>> In the docs, these are called flat_scratch_lo and flat_scratch_hi,
>>> and I think we should use these names to be consistent. Plus, the
>>> assembly names should be lower case.
>>
>> I’ve changed the variable names to FLAT_SCR / FLAT_SCR_LO / FLAT_SCR_HI (which seems to be what’s used in the ISA documentation). They are printed as flat_scratch, flat_scratch_lo, and flat_scratch_hi to match what SC seems to do.
>>
>>
>>>
>>>
>>>> // SGPR registers
>>>> foreach Index = 0-101 in {
>>>> def SGPR#Index : SIReg <"SGPR"#Index, Index>;
>>>> Index: test/CodeGen/R600/flat-address-space.ll
>>>> ===================================================================
>>>> --- /dev/null
>>>> +++ test/CodeGen/R600/flat-address-space.ll
>>>> @@ -0,0 +1,182 @@
>>>> +; RUN: llc -O0 -march=r600 -mcpu=bonaire < %s | FileCheck %s
>>>> +
>>>> +; Disable optimizations in case there are optimizations added that
>>>> +; specialize away generic pointer accesses.
>>>> +
>>>> +
>>>> +; CHECK-LABEL: @branch_use_flat_i32:
>>>> +; CHECK: ; BB#3: ; %global
>>>> +
>>>> +; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-1]+]], {{s[0-9]+}}
>>>> +; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-1]+]], {{s[0-9]+}}
>>>> +
>>>> +; CHECK: ; BB#2: ; %local
>>>> +
>>>> +; CHECK: V_MOV_B32_e32 v[[LO_VREG]], {{s[0-9]+}}
>>>> +; CHECK: V_MOV_B32_e32 v[[HI_VREG]], {{s[0-9]+}}
>>>> +
>>>> +; CHECK: FLAT_STORE_DWORD {{v[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
>>>> +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
>>>> +entry:
>>>> + %cmp = icmp ne i32 %c, 0
>>>> + br i1 %cmp, label %local, label %global
>>>> +
>>>> +local:
>>>> + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
>>>> + br label %end
>>>> +
>>>> +global:
>>>> + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
>>>> + br label %end
>>>> +
>>>> +end:
>>>> + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
>>>> + store i32 %x, i32 addrspace(4)* %fptr, align 4
>>>> +; %val = load i32 addrspace(4)* %fptr, align 4
>>>> +; store i32 %val, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +; These testcases might become useless when there are optimizations to
>>>> +; remove generic pointers.
>>>> +
>>>> +; CHECK-LABEL: @store_flat_i32:
>>>> +; CHECK: V_MOV_B32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
>>>> +; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
>>>> +; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
>>>> +; CHECK: FLAT_STORE_DWORD v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
>>>> +define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
>>>> + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
>>>> + store i32 %x, i32 addrspace(4)* %fptr, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL: @store_flat_i64:
>>>> +; CHECK: FLAT_STORE_DWORDX2
>>>> +define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
>>>> + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
>>>> + store i64 %x, i64 addrspace(4)* %fptr, align 8
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL: @store_flat_v4i32:
>>>> +; CHECK: FLAT_STORE_DWORDX4
>>>> +define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
>>>> + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
>>>> + store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL: @store_flat_trunc_i16:
>>>> +; CHECK: FLAT_STORE_SHORT
>>>> +define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
>>>> + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
>>>> + %y = trunc i32 %x to i16
>>>> + store i16 %y, i16 addrspace(4)* %fptr, align 2
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL: @store_flat_trunc_i8:
>>>> +; CHECK: FLAT_STORE_BYTE
>>>> +define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
>>>> + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
>>>> + %y = trunc i32 %x to i8
>>>> + store i8 %y, i8 addrspace(4)* %fptr, align 2
>>>> + ret void
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +; CHECK-LABEL @load_flat_i32:
>>>> +; CHECK: FLAT_LOAD_DWORD
>>>> +define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
>>>> + %fload = load i32 addrspace(4)* %fptr, align 4
>>>> + store i32 %fload, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL @load_flat_i64:
>>>> +; CHECK: FLAT_LOAD_DWORDX2
>>>> +define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
>>>> + %fload = load i64 addrspace(4)* %fptr, align 4
>>>> + store i64 %fload, i64 addrspace(1)* %out, align 8
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL @load_flat_v4i32:
>>>> +; CHECK: FLAT_LOAD_DWORDX4
>>>> +define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
>>>> + %fload = load <4 x i32> addrspace(4)* %fptr, align 4
>>>> + store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL @sextload_flat_i8:
>>>> +; CHECK: FLAT_LOAD_SBYTE
>>>> +define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
>>>> + %fload = load i8 addrspace(4)* %fptr, align 4
>>>> + %ext = sext i8 %fload to i32
>>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL @zextload_flat_i8:
>>>> +; CHECK: FLAT_LOAD_UBYTE
>>>> +define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
>>>> + %fload = load i8 addrspace(4)* %fptr, align 4
>>>> + %ext = zext i8 %fload to i32
>>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL @sextload_flat_i16:
>>>> +; CHECK: FLAT_LOAD_SSHORT
>>>> +define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
>>>> + %fload = load i16 addrspace(4)* %fptr, align 4
>>>> + %ext = sext i16 %fload to i32
>>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +; CHECK-LABEL @zextload_flat_i16:
>>>> +; CHECK: FLAT_LOAD_USHORT
>>>> +define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
>>>> + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
>>>> + %fload = load i16 addrspace(4)* %fptr, align 4
>>>> + %ext = zext i16 %fload to i32
>>>> + store i32 %ext, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +declare void @llvm.AMDGPU.barrier.local() #1
>>>> +
>>>> +
>>>> +; Check for prologue initializing special SGPRs pointing to scratch.
>>>> +; CHECK-LABEL: @store_flat_scratch:
>>>> +; CHECK: S_MOVK_I32 FLAT_SCRATCH_SIZE, 40
>>>> +; CHECK: S_MOVK_I32 FLAT_SCRATCH_OFFSET,
>>>> +; CHECK: FLAT_STORE_DWORD
>>>> +; CHECK: S_BARRIER
>>>> +; CHECK: FLAT_LOAD_DWORD
>>>> +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32 %x) #0 {
>>>> + %alloca = alloca i32, i32 9, align 4
>>>> + %pptr = getelementptr i32* %alloca, i32 %x
>>>> + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
>>>> + store i32 %x, i32 addrspace(4)* %fptr
>>>> + ; Dummy call
>>>> + call void @llvm.AMDGPU.barrier.local() #1
>>>> + %reload = load i32 addrspace(4)* %fptr, align 4
>>>> + store i32 %reload, i32 addrspace(1)* %out, align 4
>>>> + ret void
>>>> +}
>>>> +
>>>> +attributes #0 = { nounwind }
>>>> +attributes #1 = { nounwind noduplicate }
>>>
>>>> _______________________________________________
>>>> llvm-commits mailing list
>>>> llvm-commits at cs.uiuc.edu
>>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>>
>>> _______________________________________________
>>> llvm-commits mailing list
>>> llvm-commits at cs.uiuc.edu
>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>
>
>
>
>
>>
>
More information about the llvm-commits mailing list