PATCH: R600/SI: Fix handling of constant initializers

Mon Jul 14 14:43:27 PDT 2014

On 07/14/2014 01:57 PM, Tom Stellard wrote:
>
>  From 6deb5706328bef08e419970e9a4505de172d63c9 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 14 Jul 2014 16:09:50 -0400
> Subject: [PATCH 3/3] R600/SI: Store constant initializer data in constant
>   memory
>
> This implements a solution for constant initializers suggested
> by Vadim Girlin, where we store the data after the shader code
> and then use the S_GETPC instruction to compute its address.
>
> This saves use the trouble of creating a new buffer for constant data
> and then having to pass the pointer to the kernel via user SGPRs or the
> input buffer.
This strategy sounds weird. I'm not exactly sure how the caches work, 
but will this end up putting the constant initializer in the instruction 
cache? That would be really, really bad.

> +SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
> +                                             SDValue Op,
> +                                             SelectionDAG &DAG) const {
> +  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
> +
> +  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
> +    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
> +
> +  SDLoc DL(GSD);
> +  const GlobalValue *GV = GSD->getGlobal();
> +  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
> +
> +  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
> +  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
> +
> +  SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
> +                              DAG.getConstant(0, MVT::i32));
> +  SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
> +                              DAG.getConstant(1, MVT::i32));
> +
> +  SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
> +                           PtrLo, GA);
> +  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
> +                           PtrHi, DAG.getConstant(0, MVT::i32),
> +                           SDValue(Lo.getNode(), 1));
> +  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
Why not just do the add of the pointer type and let the 64-bit add be 
legalized normally? It would also be 1 less obstacle to adding support 
for 32-bit pointers

> +}
> +
>   SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
>     SDLoc DL(Op);
>     LoadSDNode *Load = cast<LoadSDNode>(Op);
> diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
> index e25323a..4e1c949 100644
> --- a/lib/Target/R600/SIISelLowering.h
> +++ b/lib/Target/R600/SIISelLowering.h
> @@ -25,6 +25,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
>                            SDValue Chain, unsigned Offset, bool Signed) const;
>     SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
>                                  SelectionDAG &DAG) const;
> +  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
> +                             SelectionDAG &DAG) const override;
>     SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 455c890..15c9a5e 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -361,6 +361,26 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
>       MI->eraseFromParent();
>       break;
>     }
> +  case AMDGPU::SI_CONSTDATA_PTR: {
> +    unsigned Reg = MI->getOperand(0).getReg();
> +    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
> +    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
> +
> +    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
> +
> +    // Add 32-bit offset from this instruction to the start of the constant data.
> +    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_I32), RegLo)
> +            .addReg(RegLo)
> +            .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
> +            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
> +    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
> +            .addReg(RegHi)
> +            .addImm(0)
> +            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
> +            .addReg(AMDGPU::SCC, RegState::Implicit);
> +    MI->eraseFromParent();
> +    break;
> +  }
>     }
>     return true;
>   }
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 7093db6..c1f22a5 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -57,6 +57,10 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
>   def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
>   def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
>   
> +def SIconstdata_ptr : SDNode<
> +  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
> +>;
> +
>   // Transformation function, extract the lower 32bit of a 64bit immediate
>   def LO32 : SDNodeXForm<imm, [{
>     return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 4f480cd..81f8e15 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -139,7 +139,11 @@ def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16",
>   ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
>   ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
>   ////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
> -def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
> +def S_GETPC_B64 : SOP1 <
> +  0x0000001f, (outs SReg_64:$dst), (ins), "S_GETPC_B64 $dst", []
> +> {
> +  let SSRC0 = 0;
> +}
>   def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
>   def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
>   def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
> @@ -1681,6 +1685,16 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
>   defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
>   defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
>   
> +let Defs = [SCC] in {
> +
> +def SI_CONSTDATA_PTR : InstSI <
> +  (outs SReg_64:$dst),
> +  (ins),
> +  "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))]
> +>;
> +
> +} // End Defs = [SCC]
> +
>   } // end IsCodeGenOnly, isPseudo
>   
>   } // end SubtargetPredicate = SI
> diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
> index db64a6f..074d908 100644
> --- a/test/CodeGen/R600/gv-const-addrspace.ll
> +++ b/test/CodeGen/R600/gv-const-addrspace.ll
> @@ -4,11 +4,11 @@
>   
>   @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
>   
> -; XXX: Test on SI once 64-bit adds are supportes.
> -
>   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
>   
>   ; FUNC-LABEL: @float
> +; FIXME: We should be using S_LOAD_DWORD here.
> +; SI: BUFFER_LOAD_DWORD
>   
>   ; EG-DAG: MOV {{\** *}}T2.X
>   ; EG-DAG: MOV {{\** *}}T3.X
> @@ -29,6 +29,9 @@ entry:
>   
>   ; FUNC-LABEL: @i32
>   
> +; FIXME: We should be using S_LOAD_DWORD here.
> +; SI: BUFFER_LOAD_DWORD
> +
>   ; EG-DAG: MOV {{\** *}}T2.X
>   ; EG-DAG: MOV {{\** *}}T3.X
>   ; EG-DAG: MOV {{\** *}}T4.X
> @@ -50,6 +53,7 @@ entry:
>   @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
>   
>   ; FUNC-LABEL: @struct_foo_gv_load
> +; SI: S_LOAD_DWORD
>   
>   define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
>     %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
> @@ -64,6 +68,8 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
>                                                                   <1 x i32> <i32 4> ]
>   
>   ; FUNC-LABEL: @array_v1_gv_load
> +; FIXME: We should be using S_LOAD_DWORD here.
> +; SI: BUFFER_LOAD_DWORD
>   define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
>     %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
>     %load = load <1 x i32> addrspace(2)* %gep, align 4
> diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll
> index 552cd05..191b5c3 100644
> --- a/test/CodeGen/R600/large-constant-initializer.ll
> +++ b/test/CodeGen/R600/large-constant-initializer.ll
> @@ -1,6 +1,5 @@
> -; XFAIL: *
> -; REQUIRES: asserts
>   ; RUN: llc -march=r600 -mcpu=SI < %s
> +; CHECK: S_ENDPGM
>   
>   @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
>   
> -- 1.8.1.5
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140714/778206fa/attachment.html>