[llvm] r267452 - AMDGPU: Implement addrspacecast
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 25 12:27:24 PDT 2016
Author: arsenm
Date: Mon Apr 25 14:27:24 2016
New Revision: 267452
URL: http://llvm.org/viewvc/llvm-project?rev=267452&view=rev
Log:
AMDGPU: Implement addrspacecast
Added:
llvm/trunk/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll
llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
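
For reference, the kind of cast this change teaches the backend to lower looks like the following IR. This is a minimal sketch mirroring the tests added below; the function and value names are illustrative:

define void @group_to_flat_example(i32 addrspace(3)* %ptr) {
  ; Cast an LDS (group) pointer to a flat pointer, then store through it.
  %flat = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %flat
  ret void
}

On amdhsa targets this now selects a flat_store_dword, with the high half of the 64-bit flat address built from the aperture value loaded via the queue pointer (see the updated addrspacecast.ll checks below).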
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp Mon Apr 25 14:27:24 2016
@@ -24,6 +24,8 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
+ static bool hasAddrSpaceCast(const Function &F);
+
void addAttrToCallers(Function *Intrin, StringRef AttrName);
bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -48,12 +50,29 @@ char AMDGPUAnnotateKernelFeatures::ID =
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
-INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
-INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
+INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+ unsigned SrcAS = ASC->getSrcAddressSpace();
+
+ // The queue ptr is only needed when casting to flat, not from it.
+ return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+// Return true if an addrspacecast is used that requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+ for (const BasicBlock &BB : F) {
+ for (const Instruction &I : BB) {
+ if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (castRequiresQueuePtr(ASC))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
StringRef AttrName) {
@@ -117,9 +136,18 @@ bool AMDGPUAnnotateKernelFeatures::runOn
// always initialized.
bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
- if (TT.getOS() == Triple::AMDHSA)
+ if (TT.getOS() == Triple::AMDHSA) {
Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+ for (Function &F : M) {
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
+ continue;
+
+ if (hasAddrSpaceCast(F))
+ F.addFnAttr("amdgpu-queue-ptr");
+ }
+ }
+
return Changed;
}
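
In effect, after this pass runs on an amdhsa module, any function containing a group- or private-to-flat cast picks up the new attribute. A rough sketch of the resulting IR (names illustrative; only the "amdgpu-queue-ptr" string comes from the patch above):

define void @needs_queue_ptr(i32 addrspace(3)* %p) #0 {
  %flat = addrspacecast i32 addrspace(3)* %p to i32 addrspace(4)*
  store volatile i32 0, i32 addrspace(4)* %flat
  ret void
}

attributes #0 = { "amdgpu-queue-ptr" }

The attribute is what causes the queue pointer to be passed in user SGPRs (see the enable_sgpr_queue_ptr = 1 checks in the updated addrspacecast.ll test), which getSegmentAperture in SIISelLowering.cpp below then reads from.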
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Mon Apr 25 14:27:24 2016
@@ -147,7 +147,6 @@ private:
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
- SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -526,8 +525,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNod
Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
- case ISD::ADDRSPACECAST:
- return SelectAddrSpaceCast(N);
case ISD::AND:
case ISD::SRL:
case ISD::SRA:
@@ -1332,69 +1329,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBuffe
!isa<ConstantSDNode>(Offset);
}
-// FIXME: This is incorrect and only enough to be able to compile.
-SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
- AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
- SDLoc DL(N);
-
- const MachineFunction &MF = CurDAG->getMachineFunction();
- DiagnosticInfoUnsupported NotImplemented(
- *MF.getFunction(), "addrspacecast not implemented", DL.getDebugLoc());
- CurDAG->getContext()->diagnose(NotImplemented);
-
- assert(Subtarget->hasFlatAddressSpace() &&
- "addrspacecast only supported with flat address space!");
-
- assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
- ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
- "Can only cast to / from flat address space!");
-
- // The flat instructions read the address as the index of the VGPR holding the
- // address, so casting should just be reinterpreting the base VGPR, so just
- // insert trunc / bitcast / zext.
-
- SDValue Src = ASC->getOperand(0);
- EVT DestVT = ASC->getValueType(0);
- EVT SrcVT = Src.getValueType();
-
- unsigned SrcSize = SrcVT.getSizeInBits();
- unsigned DestSize = DestVT.getSizeInBits();
-
- if (SrcSize > DestSize) {
- assert(SrcSize == 64 && DestSize == 32);
- return CurDAG->getMachineNode(
- TargetOpcode::EXTRACT_SUBREG,
- DL,
- DestVT,
- Src,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
- }
-
- if (DestSize > SrcSize) {
- assert(SrcSize == 32 && DestSize == 64);
-
- // FIXME: This is probably wrong, we should never be defining
- // a register class with both VGPRs and SGPRs
- SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
- MVT::i32);
-
- const SDValue Ops[] = {
- RC,
- Src,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(0, DL, MVT::i32)), 0),
- CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
- DL, N->getValueType(0), Ops);
- }
-
- assert(SrcSize == 64 && DestSize == 64);
- return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
-}
-
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
uint32_t Offset, uint32_t Width) {
// Transformation function, pack the offset and width of a BFE into
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Mon Apr 25 14:27:24 2016
@@ -278,6 +278,11 @@ SITargetLowering::SITargetLowering(Targe
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+ if (Subtarget->hasFlatAddressSpace()) {
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+ }
+
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -1232,6 +1237,7 @@ SDValue SITargetLowering::LowerOperation
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
}
return SDValue();
}
@@ -1390,6 +1396,84 @@ SDValue SITargetLowering::LowerBRCOND(SD
return Chain;
}
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+ SelectionDAG &DAG) const {
+ SDLoc SL;
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, Info->getQueuePtrUserSGPR(), MVT::i64);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, SL, MVT::i64));
+
+ // TODO: Use custom target PseudoSourceValue.
+ // TODO: We should use the value from the IR intrinsic call, but it might not
+ // be available and how do we get it?
+ Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
+ PtrInfo, false,
+ false, true,
+ MinAlign(64, StructOffset));
+}
+
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+ SDValue Src = ASC->getOperand(0);
+
+ // FIXME: Really support non-0 null pointers.
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+ SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+ // flat -> local/private
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+ SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ NonNull, Ptr, SegmentNullPtr);
+ }
+ }
+
+ // local/private -> flat
+ if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull
+ = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+ SDValue CvtPtr
+ = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+ FlatNullPtr);
+ }
+ }
+
+ // global <-> flat are no-ops and never emitted.
+
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+ *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+ DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+ return DAG.getUNDEF(ASC->getValueType(0));
+}
+
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Mon Apr 25 14:27:24 2016
@@ -45,6 +45,9 @@ class SITargetLowering final : public AM
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+ SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
SDValue performUCharToFloatCombine(SDNode *N,
Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h Mon Apr 25 14:27:24 2016
@@ -270,6 +270,10 @@ public:
ScratchWaveOffsetReg = Reg;
}
+ unsigned getQueuePtrUserSGPR() const {
+ return QueuePtrUserSGPR;
+ }
+
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll Mon Apr 25 14:27:24 2016
@@ -1,18 +1,208 @@
-; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
-; ERROR: addrspacecast not implemented
+; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
+
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 1
+
+; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+
+; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
+; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
+; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+ %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+ store volatile i32 7, i32 addrspace(4)* %stof
+ ret void
+}
+
+; no-op
+; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+ %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+ %ld = load volatile i32, i32 addrspace(4)* %stof
+ ret void
+}
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
-; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+ store volatile i32 0, i32 addrspace(3)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
+; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
+; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+ store volatile i32 0, i32* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
+; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
+; HSA: enable_sgpr_queue_ptr = 0
+
+; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
+; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+ load volatile i32, i32 addrspace(2)* %ftos
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_group_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_0_flat_to_group_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
+ store i32 7, i32 addrspace(3)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: ds_write_b32 [[PTR]], [[K]]
+define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
+ store i32 7, i32 addrspace(3)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
+; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
+define void @cast_0_private_to_flat_addrspacecast() #0 {
+ %cast = addrspacecast i32* null to i32 addrspace(4)*
+ store i32 7, i32 addrspace(4)* %cast
+ ret void
+}
+
+; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
+; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+define void @cast_0_flat_to_private_addrspacecast() #0 {
+ %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
+ store i32 7, i32* %cast
+ ret void
+}
; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.
-; CHECK-LABEL: {{^}}branch_use_flat_i32:
-; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: s_endpgm
+; HSA-LABEL: {{^}}branch_use_flat_i32:
+; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
+; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
%cmp = icmp ne i32 %c, 0
@@ -34,20 +224,17 @@ end:
ret void
}
-; TODO: This should not be zero when registers are used for small
-; scratch allocations again.
-
; Check for prologue initializing special SGPRs pointing to scratch.
-; CHECK-LABEL: {{^}}store_flat_scratch:
-; CHECK: s_movk_i32 flat_scratch_lo, 0
-; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
-; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
-; CHECK: flat_store_dword
-; CHECK: s_barrier
-; CHECK: flat_load_dword
+; HSA-LABEL: {{^}}store_flat_scratch:
+; HSA: s_mov_b32 flat_scratch_lo, s9
+; HSA: s_add_u32 [[ADD:s[0-9]+]], s8, s11
+; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; HSA: flat_store_dword
+; HSA: s_barrier
+; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4
- %x = call i32 @llvm.amdgcn.workitem.id.x() #3
+ %x = call i32 @llvm.amdgcn.workitem.id.x() #2
%pptr = getelementptr i32, i32* %alloca, i32 %x
%fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
store i32 %x, i32 addrspace(4)* %fptr
@@ -59,8 +246,8 @@ define void @store_flat_scratch(i32 addr
}
declare void @llvm.amdgcn.s.barrier() #1
-declare i32 @llvm.amdgcn.workitem.id.x() #3
+declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
-attributes #3 = { nounwind readnone }
+attributes #2 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll?rev=267452&r1=267451&r2=267452&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll Mon Apr 25 14:27:24 2016
@@ -164,6 +164,63 @@ define void @use_queue_ptr(i32 addrspace
ret void
}
+; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+ %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+ store volatile i32 0, i32 addrspace(3)* %ftos
+ ret void
+}
+
+; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
+ store volatile i32 0, i32* %ftos
+ ret void
+}
+
+; No-op addrspacecast should not use queue ptr
+; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
+ store volatile i32 0, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+ %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
+ %ld = load volatile i32, i32 addrspace(4)* %stof
+ ret void
+}
+
+; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %ftos
+ ret void
+}
+
+; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+ %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+ %ld = load volatile i32, i32 addrspace(2)* %ftos
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
Added: llvm/trunk/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/invalid-addrspacecast.ll?rev=267452&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/invalid-addrspacecast.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/invalid-addrspacecast.ll Mon Apr 25 14:27:24 2016
@@ -0,0 +1,8 @@
+; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
+define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+ %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
+ store volatile i32 0, i32 addrspace(1)* %stof
+ ret void
+}