[PATCH 1/1] R600: Add new intrinsic to read work dimensions
Jan Vesely
jan.vesely at rutgers.edu
Sat Oct 11 15:03:13 PDT 2014
v2: Add SI lowering
Add test
v3: Place work dimensions after the kernel arguments.
v4: Calculate offset while lowering arguments
v5: rebase
Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
---
include/llvm/IR/IntrinsicsR600.td | 2 ++
lib/Target/R600/AMDGPUMachineFunction.h | 3 +++
lib/Target/R600/R600ISelLowering.cpp | 11 ++++++++---
lib/Target/R600/SIISelLowering.cpp | 10 ++++++++--
test/CodeGen/R600/work-item-intrinsics.ll | 16 ++++++++++++++++
5 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td
index 2e711a9..8dafc80 100644
--- a/include/llvm/IR/IntrinsicsR600.td
+++ b/include/llvm/IR/IntrinsicsR600.td
@@ -33,6 +33,8 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
"__builtin_r600_read_tgid">;
defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
"__builtin_r600_read_tidig">;
+def int_r600_read_workdim : R600ReadPreloadRegisterIntrinsic <
+ "__builtin_r600_read_workdim">;
} // End TargetPrefix = "r600"
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 886fb1b..f5e4694 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -30,6 +30,9 @@ public:
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
+ /// Start of implicit kernel args
+ unsigned ABIArgOffset;
+
unsigned getShaderType() const {
return ShaderType;
}
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 87610e9..7c60bbe 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -808,6 +808,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 7);
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
+ case Intrinsic::r600_read_workdim: {
+ return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+ }
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
@@ -1698,7 +1701,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
MachineFunction &MF = DAG.getMachineFunction();
- unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
SmallVector<ISD::InputArg, 8> LocalIns;
@@ -1716,7 +1719,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
MemVT = MemVT.getVectorElementType();
}
- if (ShaderType != ShaderType::COMPUTE) {
+ if (MFI->getShaderType() != ShaderType::COMPUTE) {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
@@ -1748,16 +1751,18 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
+ unsigned Offset = 36 + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
- DAG.getConstant(36 + PartOffset, MVT::i32),
+ DAG.getConstant(Offset, MVT::i32),
DAG.getUNDEF(MVT::i32),
PtrInfo,
MemVT, false, true, true, 4);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
+ MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
}
return Chain;
}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 49ac269..ed1e746 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -519,11 +519,11 @@ SDValue SITargetLowering::LowerFormalArguments(
if (VA.isMemLoc()) {
VT = Ins[i].VT;
EVT MemVT = Splits[i].VT;
+ const unsigned Offset = 36 + VA.getLocMemOffset();
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
- 36 + VA.getLocMemOffset(),
- Ins[i].Flags.isSExt());
+ Offset, Ins[i].Flags.isSExt());
const PointerType *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
@@ -537,6 +537,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
InVals.push_back(Arg);
+ Info->ABIArgOffset = Offset + MemVT.getStoreSize();
continue;
}
assert(VA.isRegLoc() && "Parameter must be in a register!");
@@ -927,6 +928,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::r600_read_local_size_z:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
+ case Intrinsic::r600_read_workdim: {
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
+ false);
+ }
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index a1337ae..d0ca92b 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -128,6 +128,20 @@ entry:
ret void
}
+; FUNC-LABEL: @get_work_dim
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0xb
+; SI: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: BUFFER_STORE_DWORD [[VVAL]]
+define void @get_work_dim (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
; The tgid values are stored in sgprs offset by the number of user sgprs.
; Currently we always use exactly 2 user sgprs for the pointer to the
; kernel arguments, but this may change in the future.
@@ -209,4 +223,6 @@ declare i32 @llvm.r600.read.tidig.x() #0
declare i32 @llvm.r600.read.tidig.y() #0
declare i32 @llvm.r600.read.tidig.z() #0
+declare i32 @llvm.r600.read.workdim() #0
+
attributes #0 = { readnone }
--
1.9.3
More information about the llvm-commits
mailing list