[PATCH 1/1] Reapply "R600: Add new intrinsic to read work dimensions"

Tue Oct 14 13:33:04 PDT 2014

You normally don't need a new review when it is only a test fix.

On 14 October 2014 15:55, Tom Stellard <tom at stellard.net> wrote:
> On Tue, Oct 14, 2014 at 03:53:28PM -0400, Jan Vesely wrote:
>> This effectively reverts revert 219707, after fixing the test to work with
>> the new function name format and the renamed intrinsic.
>>
>
> LGTM.
>> Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
>> ---
>>
>> sorry about that
>>
>>  include/llvm/IR/IntrinsicsR600.td         |  9 ++++++++-
>>  lib/Target/R600/AMDGPUMachineFunction.h   |  3 +++
>>  lib/Target/R600/R600ISelLowering.cpp      | 11 ++++++++---
>>  lib/Target/R600/SIISelLowering.cpp        | 11 +++++++++--
>>  test/CodeGen/R600/work-item-intrinsics.ll | 16 ++++++++++++++++
>>  5 files changed, 44 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td
>> index 2e711a9..098ad53 100644
>> --- a/include/llvm/IR/IntrinsicsR600.td
>> +++ b/include/llvm/IR/IntrinsicsR600.td
>> @@ -33,10 +33,14 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
>>                                         "__builtin_r600_read_tgid">;
>>  defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
>>                                         "__builtin_r600_read_tidig">;
>> -
>>  } // End TargetPrefix = "r600"
>>
>>  let TargetPrefix = "AMDGPU" in {
>> +
>> +class AMDGPUReadPreloadRegisterIntrinsic<string name>
>> +  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
>> +    GCCBuiltin<name>;
>> +
>>  def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">,
>>    // 1st parameter: Numerator
>>    // 2nd parameter: Denominator
>> @@ -72,4 +76,7 @@ def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">,
>>  def int_AMDGPU_ldexp : GCCBuiltin<"__builtin_amdgpu_ldexp">,
>>    Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
>>
>> +def int_AMDGPU_read_workdim : AMDGPUReadPreloadRegisterIntrinsic <
>> +                                       "__builtin_amdgpu_read_workdim">;
>> +
>>  } // End TargetPrefix = "AMDGPU"
>> diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
>> index 886fb1b..f5e4694 100644
>> --- a/lib/Target/R600/AMDGPUMachineFunction.h
>> +++ b/lib/Target/R600/AMDGPUMachineFunction.h
>> @@ -30,6 +30,9 @@ public:
>>    /// Number of bytes in the LDS that are being used.
>>    unsigned LDSSize;
>>
>> +  /// Start of implicit kernel args
>> +  unsigned ABIArgOffset;
>> +
>>    unsigned getShaderType() const {
>>      return ShaderType;
>>    }
>> diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
>> index e0ac13f..2e91270 100644
>> --- a/lib/Target/R600/R600ISelLowering.cpp
>> +++ b/lib/Target/R600/R600ISelLowering.cpp
>> @@ -818,6 +818,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
>>      case Intrinsic::r600_read_local_size_z:
>>        return LowerImplicitParameter(DAG, VT, DL, 8);
>>
>> +    case Intrinsic::AMDGPU_read_workdim:
>> +      return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
>> +
>>      case Intrinsic::r600_read_tgid_x:
>>        return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
>>                                    AMDGPU::T1_X, VT);
>> @@ -1725,7 +1728,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
>>    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
>>                   *DAG.getContext());
>>    MachineFunction &MF = DAG.getMachineFunction();
>> -  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
>> +  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
>>
>>    SmallVector<ISD::InputArg, 8> LocalIns;
>>
>> @@ -1743,7 +1746,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
>>        MemVT = MemVT.getVectorElementType();
>>      }
>>
>> -    if (ShaderType != ShaderType::COMPUTE) {
>> +    if (MFI->getShaderType() != ShaderType::COMPUTE) {
>>        unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
>>        SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
>>        InVals.push_back(Register);
>> @@ -1775,16 +1778,18 @@ SDValue R600TargetLowering::LowerFormalArguments(
>>
>>      unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
>>      unsigned PartOffset = VA.getLocMemOffset();
>> +    unsigned Offset = 36 + VA.getLocMemOffset();
>>
>>      MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
>>      SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
>> -                              DAG.getConstant(36 + PartOffset, MVT::i32),
>> +                              DAG.getConstant(Offset, MVT::i32),
>>                                DAG.getUNDEF(MVT::i32),
>>                                PtrInfo,
>>                                MemVT, false, true, true, 4);
>>
>>      // 4 is the preferred alignment for the CONSTANT memory space.
>>      InVals.push_back(Arg);
>> +    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
>>    }
>>    return Chain;
>>  }
>> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
>> index 49ac269..2888195 100644
>> --- a/lib/Target/R600/SIISelLowering.cpp
>> +++ b/lib/Target/R600/SIISelLowering.cpp
>> @@ -519,11 +519,11 @@ SDValue SITargetLowering::LowerFormalArguments(
>>      if (VA.isMemLoc()) {
>>        VT = Ins[i].VT;
>>        EVT MemVT = Splits[i].VT;
>> +      const unsigned Offset = 36 + VA.getLocMemOffset();
>>        // The first 36 bytes of the input buffer contains information about
>>        // thread group and global sizes.
>>        SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
>> -                                   36 + VA.getLocMemOffset(),
>> -                                   Ins[i].Flags.isSExt());
>> +                                   Offset, Ins[i].Flags.isSExt());
>>
>>        const PointerType *ParamTy =
>>            dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
>> @@ -537,6 +537,7 @@ SDValue SITargetLowering::LowerFormalArguments(
>>        }
>>
>>        InVals.push_back(Arg);
>> +      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
>>        continue;
>>      }
>>      assert(VA.isRegLoc() && "Parameter must be in a register!");
>> @@ -927,6 +928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
>>    case Intrinsic::r600_read_local_size_z:
>>      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
>>                            SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
>> +
>> +  case Intrinsic::AMDGPU_read_workdim:
>> +    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
>> +                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
>> +                          false);
>> +
>>    case Intrinsic::r600_read_tgid_x:
>>      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
>>        TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
>> diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
>> index a1337ae..cbefe25 100644
>> --- a/test/CodeGen/R600/work-item-intrinsics.ll
>> +++ b/test/CodeGen/R600/work-item-intrinsics.ll
>> @@ -128,6 +128,20 @@ entry:
>>    ret void
>>  }
>>
>> +; FUNC-LABEL: {{^}}get_work_dim:
>> +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
>> +; EG: MOV [[VAL]], KC0[2].Z
>> +
>> +; SI: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0xb
>> +; SI: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
>> +; SI: BUFFER_STORE_DWORD [[VVAL]]
>> +define void @get_work_dim (i32 addrspace(1)* %out) {
>> +entry:
>> +  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
>> +  store i32 %0, i32 addrspace(1)* %out
>> +  ret void
>> +}
>> +
>>  ; The tgid values are stored in sgprs offset by the number of user sgprs.
>>  ; Currently we always use exactly 2 user sgprs for the pointer to the
>>  ; kernel arguments, but this may change in the future.
>> @@ -209,4 +223,6 @@ declare i32 @llvm.r600.read.tidig.x() #0
>>  declare i32 @llvm.r600.read.tidig.y() #0
>>  declare i32 @llvm.r600.read.tidig.z() #0
>>
>> +declare i32 @llvm.AMDGPU.read.workdim() #0
>> +
>>  attributes #0 = { readnone }
>> --
>> 1.9.3
>>