R600/SI Patchset: Initial support for compute shaders
Christian König
deathsimple at vodafone.de
Mon Mar 11 02:03:53 PDT 2013
Am 08.03.2013 22:29, schrieb Tom Stellard:
> Hi,
>
> The attached patches add support for the BUFFER_STORE instruction and
> make it possible to run simple compute shaders on Southern Islands
> hardware with the open source radeonsi GPU driver in
> Mesa (http://www.mesa3d.org).
>
> -Tom
>
> 0001-R600-SI-Avoid-generating-S_MOVs-with-64-bit-immediat.patch
>
>
> >From aa3e075e2f0df66e2fe50018704aee28904155f0 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 6 Mar 2013 17:28:55 +0000
> Subject: [PATCH 1/5] R600/SI: Avoid generating S_MOVs with 64-bit immediates
>
> SITargetLowering::analyzeImmediate() was converting the 64-bit values
> to 32-bit and then checking if they were an inline immediate. Some
> of these conversions caused this check to succeed and produced
> S_MOV instructions with 64-bit immediates, which are illegal.
> ---
> lib/Target/R600/SIISelLowering.cpp | 11 +++++++++--
> test/CodeGen/R600/imm.ll | 34 ++++++++++++++++++++++++++++++++++
> 2 files changed, 43 insertions(+), 2 deletions(-)
> create mode 100644 test/CodeGen/R600/imm.ll
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index fead115..74b1dc5 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -426,9 +426,16 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
> float F;
> } Imm;
>
> - if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N))
> + if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
> + if (Node->getValueType(0) == MVT::i64) {
> + int64_t Imm64 = Node->getZExtValue();
> + if (Imm64 >> 32) {
> + return -1;
> + }
> + Imm.I = Imm64;
> + }
> Imm.I = Node->getSExtValue();
I wouldn't write it like this, setting Imm.I twice is a bit superfluous.
Also I don't think we should look at the ValueType, maybe something like
this should be more appropriate:
if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
int64_t Imm64 = Node->getSExtValue();
if (Imm64 >> 32)
return -1;
Imm.I = Imm64;
} else...
That should also work if anybody ever has the stupid idea to put an i64 immediate onto a *_F64 operation.
> - else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
> + } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
> Imm.F = Node->getValueAPF().convertToFloat();
> else
> return -1; // It isn't an immediate
> diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
> new file mode 100644
> index 0000000..20f9dec
> --- /dev/null
> +++ b/test/CodeGen/R600/imm.ll
> @@ -0,0 +1,34 @@
> +; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
> +
> +; XXX: Enable once SI supports buffer stores
> +; XFAIL: *
> +
> +; CHECK: @i64_imm
> +; CHECK-NOT: S_MOV_B64
> +define void @i64_imm(i64 addrspace(1)* %out) {
> +entry:
> + store i64 1311768465173141112, i64 addrspace(1) *%out ; 0x1234567812345678
> + ret void
> +}
> +
> +; Use a 64-bit value with lo bits that can be represented as an inline constant
> +; CHECK: @i64_imm_inline_lo
> +; CHECK: S_MOV_B32 [[LO:SGPR[0-9]+]], 5
> +; CHECK: V_MOV_B32_e32 [[LO_VGPR:VGPR[0-9]+]], [[LO]]
> +; CHECK: BUFFER_STORE_DWORDX2 [[LO_VGPR]]_
> +define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
> +entry:
> + store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
> + ret void
> +}
> +
> +; Use a 64-bit value with hi bits that can be represented as an inline constant
> +; CHECK: @i64_imm_inline_hi
> +; CHECK: S_MOV_B32 [[HI:SGPR[0-9]+]], 5
> +; CHECK: V_MOV_B32_e32 [[HI_VGPR:VGPR[0-9]+]], [[HI]]
> +; CHECK: BUFFER_STORE_DWORDX2 {{VGPR[0-9]+}}_[[HI_VGPR]]
> +define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
> +entry:
> + store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
> + ret void
> +}
> -- 1.7.3.4
>
> 0002-R600-SI-Add-processor-types-for-each-SI-variant.patch
>
>
> >From 018c05ab8a7e9f6da5a6f7d2547da61671f82c3e Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 8 Mar 2013 13:28:18 -0500
> Subject: [PATCH 2/5] R600/SI: Add processor types for each SI variant
>
> ---
> lib/Target/R600/AMDILDeviceInfo.cpp | 3 ++-
> lib/Target/R600/Processors.td | 6 ++++--
> 2 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp
> index 9605fbe..9cc2083 100644
> --- a/lib/Target/R600/AMDILDeviceInfo.cpp
> +++ b/lib/Target/R600/AMDILDeviceInfo.cpp
> @@ -79,7 +79,8 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
> " on 32bit pointers!");
> #endif
> return new AMDGPUNIDevice(ptr);
> - } else if (deviceName == "SI") {
> + } else if (deviceName == "tahiti" || deviceName == "pitcairn" ||
> + deviceName == "verde" || deviceName == "oland") {
> return new AMDGPUSIDevice(ptr);
> } else {
> #if DEBUG
> diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
> index 868810c..86534f6 100644
> --- a/lib/Target/R600/Processors.td
> +++ b/lib/Target/R600/Processors.td
> @@ -26,5 +26,7 @@ def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
> def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
> def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
> def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
> -def : Proc<"SI", SI_Itin, [Feature64BitPtr]>;
> -
> +def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
> +def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
> +def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
> +def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
> -- 1.7.3.4
>
> 0003-R600-SI-Use-same-names-for-corresponding-MUBUF-opera.patch
>
>
> >From f95ede73f1030b0f62ecfe9d7ffe51e6237fd442 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 24 Oct 2012 16:20:20 -0400
> Subject: [PATCH 3/5] R600/SI: Use same names for corresponding MUBUF operands and encoding fields
>
> The code emitter knows how to encode operands whose name matches one of
> the encoding fields. If there is no match, the code emitter relies on
> the order of the operand and field definitions to determine how operands
> should be encoding. Matching by order makes it easy to accidentally break
> the instruction encodings, so we prefer to match by name.
> ---
> lib/Target/R600/SIInstrFormats.td | 50 ++++++++++++++++++------------------
> lib/Target/R600/SIInstrInfo.td | 4 +-
> 2 files changed, 27 insertions(+), 27 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
> index 3891ddb..f737ddd 100644
> --- a/lib/Target/R600/SIInstrFormats.td
> +++ b/lib/Target/R600/SIInstrFormats.td
> @@ -284,33 +284,33 @@ let Uses = [EXEC] in {
> class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
> Enc64<outs, ins, asm, pattern> {
>
> - bits<8> VDATA;
> - bits<12> OFFSET;
> - bits<1> OFFEN;
> - bits<1> IDXEN;
> - bits<1> GLC;
> - bits<1> ADDR64;
> - bits<1> LDS;
> - bits<8> VADDR;
> - bits<7> SRSRC;
> - bits<1> SLC;
> - bits<1> TFE;
> - bits<8> SOFFSET;
> -
> - let Inst{11-0} = OFFSET;
> - let Inst{12} = OFFEN;
> - let Inst{13} = IDXEN;
> - let Inst{14} = GLC;
> - let Inst{15} = ADDR64;
> - let Inst{16} = LDS;
> + bits<12> offset;
> + bits<1> offen;
> + bits<1> idxen;
> + bits<1> glc;
> + bits<1> addr64;
> + bits<1> lds;
> + bits<8> vaddr;
> + bits<8> vdata;
> + bits<7> srsrc;
> + bits<1> slc;
> + bits<1> tfe;
> + bits<8> soffset;
> +
> + let Inst{11-0} = offset;
> + let Inst{12} = offen;
> + let Inst{13} = idxen;
> + let Inst{14} = glc;
> + let Inst{15} = addr64;
> + let Inst{16} = lds;
> let Inst{24-18} = op;
> let Inst{31-26} = 0x38; //encoding
> - let Inst{39-32} = VADDR;
> - let Inst{47-40} = VDATA;
> - let Inst{52-48} = SRSRC{6-2};
> - let Inst{54} = SLC;
> - let Inst{55} = TFE;
> - let Inst{63-56} = SOFFSET;
> + let Inst{39-32} = vaddr;
> + let Inst{47-40} = vdata;
> + let Inst{52-48} = srsrc{6-2};
> + let Inst{54} = slc;
> + let Inst{55} = tfe;
> + let Inst{63-56} = soffset;
>
> let VM_CNT = 1;
> let EXP_CNT = 1;
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 260c651..1b09ded 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -276,11 +276,11 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
>
> class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
> op,
> - (outs regClass:$dst),
> + (outs regClass:$vdata),
> (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
> i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
> i1imm:$tfe, SSrc_32:$soffset),
> - asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, "
> + asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, "
> #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
> []> {
> let mayLoad = 1;
> -- 1.7.3.4
>
> 0004-R600-SI-Simplify-MUBUF-helper-class-for-loads.patch
>
>
> >From 3a1807a4b8e0a17febbf623d05b833e0cbdda3d2 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 25 Oct 2012 10:36:05 -0400
> Subject: [PATCH 4/5] R600/SI: Simplify MUBUF helper class for loads
>
> ---
> lib/Target/R600/SIInstrInfo.td | 23 ++++++++++++++++-------
> lib/Target/R600/SIInstructions.td | 17 +++++++----------
> 2 files changed, 23 insertions(+), 17 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 1b09ded..d0cd468 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -274,17 +274,26 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
> let mayLoad = 0;
> }
>
> -class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
> +class MUBUF_Load_Helper <bits<7> op, string name, RegisterClass regClass,
> + list<dag> Pat> : MUBUF <
> op,
> (outs regClass:$vdata),
> - (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
> - i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
> - i1imm:$tfe, SSrc_32:$soffset),
> - asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, "
> - #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
> - []> {
> + (ins i16imm:$offset, SReg_128:$srsrc, VReg_32:$vaddr),
> + name#" $vdata, ($srsrc + $offset)[$vaddr]",
> + Pat> {
> +
> let mayLoad = 1;
> let mayStore = 0;
> +
> + // Encoding
> + let offen = 0;
> + let idxen = 1;
> + let glc = 0;
> + let addr64 = 0;
> + let lds = 0;
> + let slc = 0;
> + let tfe = 0;
> + let soffset = 128; // ZERO
> }
>
> class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 0ab9e4e..44f35b9 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -394,7 +394,13 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
> //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
> //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
> //def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
> -def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
> +
> +def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <
> + 0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128,
> + [(set VReg_128:$vdata, (int_SI_vs_load_input SReg_128:$srsrc,
> + IMM12bit:$offset,
> + VReg_32:$vaddr))]>;
> +
> //def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
> //def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
> //def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
> @@ -1145,15 +1151,6 @@ def : Pat <
> (SI_KILL (V_MOV_B32_e32 0xbf800000))
> >;
>
> -/* int_SI_vs_load_input */
> -def : Pat<
> - (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
> - VReg_32:$buf_idx_vgpr),
> - (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
> - VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
> - 0, 0, 0)
> ->;
> -
> /* int_SI_export */
> def : Pat <
> (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
> -- 1.7.3.4
>
> 0005-R600-SI-Add-support-for-buffer-stores.patch
>
>
> >From b5391a90d06ae4e232fd63e440b891586e456045 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 4 Mar 2013 15:12:16 -0500
> Subject: [PATCH 5/5] R600/SI: Add support for buffer stores
>
> ---
> lib/Target/R600/AMDGPUCallingConv.td | 8 ++-
> lib/Target/R600/AMDGPUISelLowering.h | 1 +
> lib/Target/R600/AMDILISelDAGToDAG.cpp | 23 +++++++
> lib/Target/R600/SIDefines.h | 112 +++++++++++++++++++++++++++++++++
> lib/Target/R600/SIISelLowering.cpp | 63 ++++++++++++++++++
> lib/Target/R600/SIISelLowering.h | 1 +
> lib/Target/R600/SIInstrInfo.td | 26 ++++++++
> lib/Target/R600/SIInstructions.td | 10 +++-
> lib/Target/R600/SIRegisterInfo.td | 2 +-
> test/CodeGen/R600/imm.ll | 3 -
> test/CodeGen/R600/store.ll | 11 +++
> 11 files changed, 253 insertions(+), 7 deletions(-)
> create mode 100644 lib/Target/R600/SIDefines.h
> create mode 100644 test/CodeGen/R600/store.ll
>
> diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
> index 45ae37e..9c30515 100644
> --- a/lib/Target/R600/AMDGPUCallingConv.td
> +++ b/lib/Target/R600/AMDGPUCallingConv.td
> @@ -32,8 +32,14 @@ def CC_SI : CallingConv<[
> VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
> VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
> VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
> - ]>>>
> + ]>>>,
>
> + // This is the default for i64 values.
> + // XXX: We should change this once clang understands the CC_AMDGPU.
> + CCIfType<[i64], CCAssignToRegWithShadow<
> + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
> + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
> + >>
> ]>;
>
> def CC_AMDGPU : CallingConv<[
> diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
> index f31b646..c2a79ea 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.h
> +++ b/lib/Target/R600/AMDGPUISelLowering.h
> @@ -116,6 +116,7 @@ enum {
> BRANCH_COND,
> // End AMDIL ISD Opcodes
> BITALIGN,
> + BUFFER_STORE,
> DWORDADDR,
> FRACT,
> FMAX,
> diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
> index e77b9dc..3e58881 100644
> --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
> +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
> @@ -162,6 +162,29 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
> }
> switch (Opc) {
> default: break;
> + case ISD::BUILD_PAIR: {
> + SDValue RC, SubReg0, SubReg1;
> + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
> + break;
> + }
> + if (N->getValueType(0) == MVT::i128) {
> + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
> + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
> + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
> + } else if (N->getValueType(0) == MVT::i64) {
> + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
> + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
> + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
> + } else {
> + llvm_unreachable("Unhandled value type for BUILD_PAIR");
> + }
> + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
> + N->getOperand(1), SubReg1 };
> + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
> + N->getDebugLoc(), N->getValueType(0), Ops, 5);
> + }
> +
> case ISD::ConstantFP:
> case ISD::Constant: {
> const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
> diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
> new file mode 100644
> index 0000000..3e2a229
> --- /dev/null
> +++ b/lib/Target/R600/SIDefines.h
> @@ -0,0 +1,112 @@
> +//===-- SIDefines.h - SI Helper Macros --------------------------*- C++ -*-===//
> +//
> +// The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +/// \file
> +//===----------------------------------------------------------------------===//
> +
> +#ifndef SIDEFINES_H_
> +#define SIDEFINES_H_
> +
> +#define R_008F0C_SQ_BUF_RSRC_WORD3 0x008F0C
> +#define S_008F0C_DST_SEL_X(x) (((x) & 0x07) << 0)
> +#define G_008F0C_DST_SEL_X(x) (((x) >> 0) & 0x07)
> +#define C_008F0C_DST_SEL_X 0xFFFFFFF8
> +#define V_008F0C_SQ_SEL_0 0x00
> +#define V_008F0C_SQ_SEL_1 0x01
> +#define V_008F0C_SQ_SEL_RESERVED_0 0x02
> +#define V_008F0C_SQ_SEL_RESERVED_1 0x03
> +#define V_008F0C_SQ_SEL_X 0x04
> +#define V_008F0C_SQ_SEL_Y 0x05
> +#define V_008F0C_SQ_SEL_Z 0x06
> +#define V_008F0C_SQ_SEL_W 0x07
> +#define S_008F0C_DST_SEL_Y(x) (((x) & 0x07) << 3)
> +#define G_008F0C_DST_SEL_Y(x) (((x) >> 3) & 0x07)
> +#define C_008F0C_DST_SEL_Y 0xFFFFFFC7
> +#define V_008F0C_SQ_SEL_0 0x00
> +#define V_008F0C_SQ_SEL_1 0x01
> +#define V_008F0C_SQ_SEL_RESERVED_0 0x02
> +#define V_008F0C_SQ_SEL_RESERVED_1 0x03
> +#define V_008F0C_SQ_SEL_X 0x04
> +#define V_008F0C_SQ_SEL_Y 0x05
> +#define V_008F0C_SQ_SEL_Z 0x06
> +#define V_008F0C_SQ_SEL_W 0x07
> +#define S_008F0C_DST_SEL_Z(x) (((x) & 0x07) << 6)
> +#define G_008F0C_DST_SEL_Z(x) (((x) >> 6) & 0x07)
> +#define C_008F0C_DST_SEL_Z 0xFFFFFE3F
> +#define V_008F0C_SQ_SEL_0 0x00
> +#define V_008F0C_SQ_SEL_1 0x01
> +#define V_008F0C_SQ_SEL_RESERVED_0 0x02
> +#define V_008F0C_SQ_SEL_RESERVED_1 0x03
> +#define V_008F0C_SQ_SEL_X 0x04
> +#define V_008F0C_SQ_SEL_Y 0x05
> +#define V_008F0C_SQ_SEL_Z 0x06
> +#define V_008F0C_SQ_SEL_W 0x07
> +#define S_008F0C_DST_SEL_W(x) (((x) & 0x07) << 9)
> +#define G_008F0C_DST_SEL_W(x) (((x) >> 9) & 0x07)
> +#define C_008F0C_DST_SEL_W 0xFFFFF1FF
> +#define V_008F0C_SQ_SEL_0 0x00
> +#define V_008F0C_SQ_SEL_1 0x01
> +#define V_008F0C_SQ_SEL_RESERVED_0 0x02
> +#define V_008F0C_SQ_SEL_RESERVED_1 0x03
> +#define V_008F0C_SQ_SEL_X 0x04
> +#define V_008F0C_SQ_SEL_Y 0x05
> +#define V_008F0C_SQ_SEL_Z 0x06
> +#define V_008F0C_SQ_SEL_W 0x07
> +#define S_008F0C_NUM_FORMAT(x) (((x) & 0x07) << 12)
> +#define G_008F0C_NUM_FORMAT(x) (((x) >> 12) & 0x07)
> +#define C_008F0C_NUM_FORMAT 0xFFFF8FFF
> +#define V_008F0C_BUF_NUM_FORMAT_UNORM 0x00
> +#define V_008F0C_BUF_NUM_FORMAT_SNORM 0x01
> +#define V_008F0C_BUF_NUM_FORMAT_USCALED 0x02
> +#define V_008F0C_BUF_NUM_FORMAT_SSCALED 0x03
> +#define V_008F0C_BUF_NUM_FORMAT_UINT 0x04
> +#define V_008F0C_BUF_NUM_FORMAT_SINT 0x05
> +#define V_008F0C_BUF_NUM_FORMAT_SNORM_OGL 0x06
> +#define V_008F0C_BUF_NUM_FORMAT_FLOAT 0x07
> +#define S_008F0C_DATA_FORMAT(x) (((x) & 0x0F) << 15)
> +#define G_008F0C_DATA_FORMAT(x) (((x) >> 15) & 0x0F)
> +#define C_008F0C_DATA_FORMAT 0xFFF87FFF
> +#define V_008F0C_BUF_DATA_FORMAT_INVALID 0x00
> +#define V_008F0C_BUF_DATA_FORMAT_8 0x01
> +#define V_008F0C_BUF_DATA_FORMAT_16 0x02
> +#define V_008F0C_BUF_DATA_FORMAT_8_8 0x03
> +#define V_008F0C_BUF_DATA_FORMAT_32 0x04
> +#define V_008F0C_BUF_DATA_FORMAT_16_16 0x05
> +#define V_008F0C_BUF_DATA_FORMAT_10_11_11 0x06
> +#define V_008F0C_BUF_DATA_FORMAT_11_11_10 0x07
> +#define V_008F0C_BUF_DATA_FORMAT_10_10_10_2 0x08
> +#define V_008F0C_BUF_DATA_FORMAT_2_10_10_10 0x09
> +#define V_008F0C_BUF_DATA_FORMAT_8_8_8_8 0x0A
> +#define V_008F0C_BUF_DATA_FORMAT_32_32 0x0B
> +#define V_008F0C_BUF_DATA_FORMAT_16_16_16_16 0x0C
> +#define V_008F0C_BUF_DATA_FORMAT_32_32_32 0x0D
> +#define V_008F0C_BUF_DATA_FORMAT_32_32_32_32 0x0E
> +#define V_008F0C_BUF_DATA_FORMAT_RESERVED_15 0x0F
> +#define S_008F0C_ELEMENT_SIZE(x) (((x) & 0x03) << 19)
> +#define G_008F0C_ELEMENT_SIZE(x) (((x) >> 19) & 0x03)
> +#define C_008F0C_ELEMENT_SIZE 0xFFE7FFFF
> +#define S_008F0C_INDEX_STRIDE(x) (((x) & 0x03) << 21)
> +#define G_008F0C_INDEX_STRIDE(x) (((x) >> 21) & 0x03)
> +#define C_008F0C_INDEX_STRIDE 0xFF9FFFFF
> +#define S_008F0C_ADD_TID_ENABLE(x) (((x) & 0x1) << 23)
> +#define G_008F0C_ADD_TID_ENABLE(x) (((x) >> 23) & 0x1)
> +#define C_008F0C_ADD_TID_ENABLE 0xFF7FFFFF
> +#define S_008F0C_HASH_ENABLE(x) (((x) & 0x1) << 25)
> +#define G_008F0C_HASH_ENABLE(x) (((x) >> 25) & 0x1)
> +#define C_008F0C_HASH_ENABLE 0xFDFFFFFF
> +#define S_008F0C_HEAP(x) (((x) & 0x1) << 26)
> +#define G_008F0C_HEAP(x) (((x) >> 26) & 0x1)
> +#define C_008F0C_HEAP 0xFBFFFFFF
> +#define S_008F0C_TYPE(x) (((x) & 0x03) << 30)
> +#define G_008F0C_TYPE(x) (((x) >> 30) & 0x03)
> +#define C_008F0C_TYPE 0x3FFFFFFF
> +#define V_008F0C_SQ_RSRC_BUF 0x00
> +#define V_008F0C_SQ_RSRC_BUF_RSVD_1 0x01
> +#define V_008F0C_SQ_RSRC_BUF_RSVD_2 0x02
> +#define V_008F0C_SQ_RSRC_BUF_RSVD_3 0x03
> +
> +#endif // R600DEFINES_H_
Oh no, please not! Do we really need that? For the indirect stuff I put
quite allot of effort into keeping the resource descriptors out of the
LLVM backend.
My main reason was that even if we today only use v1i32 or v2i32 (or
v2f32, etc...) buffers that stuff really belongs into the driver and not
the compile, cause the driver is really setting up the buffers
(including handling all the restrictions for tilling, alignment, padding
etc...).
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 74b1dc5..3f0e27d 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -16,6 +16,7 @@
> #include "AMDIL.h"
> #include "AMDGPU.h"
> #include "AMDILIntrinsicInfo.h"
> +#include "SIDefines.h"
> #include "SIInstrInfo.h"
> #include "SIMachineFunctionInfo.h"
> #include "SIRegisterInfo.h"
> @@ -49,6 +50,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
>
> addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
> addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
> + addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);
>
> addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
> addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
> @@ -68,6 +70,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
> setTargetDAGCombine(ISD::SELECT_CC);
>
> setTargetDAGCombine(ISD::SETCC);
> +
> + setOperationAction(ISD::STORE, MVT::i32, Custom);
> + setOperationAction(ISD::STORE, MVT::i64, Custom);
> }
>
> SDValue SITargetLowering::LowerFormalArguments(
> @@ -236,6 +241,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
> default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
> case ISD::BRCOND: return LowerBRCOND(Op, DAG);
> case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
> + case ISD::STORE: return LowerSTORE(Op, DAG);
> }
> return SDValue();
> }
> @@ -334,6 +340,63 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
> return Chain;
> }
>
> +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
> + StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
> + SDValue Chain = Op.getOperand(0);
> + SDValue Value = Op.getOperand(1);
> + SDValue VirtualAddress = Op.getOperand(2);
> + EVT VT = Value.getValueType();
> + DebugLoc DL = Op.getDebugLoc();
> +
> + if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) {
> + return SDValue();
> + }
> +
> + // 128-bit resource constants are used to identify buffers for loads and
> + // stores. The resource constant is composed of a 48-bit virtual address
> + // plus 80-bits of information about the buffer.
> +
> + // Set NumRecords to the maximum allowed value. This way we don't need to
> + // pass the number of records to he program and it should work for all
> + // buffer sizes.
> + //
> + // XXX: Is there any disadvantage to doing this?
> + //
> + uint64_t RsrcWord2 = 0xFFFFFFFF; //NUM_RECORDS
> +
> + unsigned DataFormat;
> + switch (VT.getSizeInBits()) {
> + case 32: DataFormat = V_008F0C_BUF_DATA_FORMAT_32; break;
> + case 64: DataFormat = V_008F0C_BUF_DATA_FORMAT_32_32; break;
> + default: llvm_unreachable("Unhandle type size in store");
> + }
> +
> + uint64_t RsrcWord3 =
> + // Always set the destination select values to XYZW.
> + // XXX: Is there any disadvantage to doing this?
> + S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> + // NUM_FORMAT_UINT appears to work even if the storage type is float,
> + // so we'll use it for everything.
> + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
> + S_008F0C_DATA_FORMAT(DataFormat);
> +
> + SDValue RsrcWord23 = DAG.getConstant(RsrcWord2 | (RsrcWord3 << 32), MVT::i64);
> +
> + SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
> + VirtualAddress, RsrcWord23);
> +
> + SDValue Ops[2];
> + Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain,
> + Value, SrcSrc, DAG.getConstant(0, MVT::i32));
> + Ops[1] = Chain;
> +
> + return DAG.getMergeValues(Ops, 2, DL);
> +
> +}
> +
> SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
> SDValue LHS = Op.getOperand(0);
> SDValue RHS = Op.getOperand(1);
> diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
> index 0411565..ef0a8dc 100644
> --- a/lib/Target/R600/SIISelLowering.h
> +++ b/lib/Target/R600/SIISelLowering.h
> @@ -27,6 +27,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
> void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
> MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
>
> + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
> SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
> SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
>
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index d0cd468..02452ca 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -26,6 +26,10 @@ def HI32 : SDNodeXForm<imm, [{
> return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
> }]>;
>
> +def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE",
> + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
> + [SDNPHasChain, SDNPMayStore]>;
> +
> def IMM8bitDWORD : ImmLeaf <
> i32, [{
> return (Imm & ~0x3FC) == 0;
> @@ -296,6 +300,28 @@ class MUBUF_Load_Helper <bits<7> op, string name, RegisterClass regClass,
> let soffset = 128; // ZERO
> }
>
> +class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
> + ValueType VT> :
> + MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr),
> + name#" $vdata, $srsrc + $vaddr",
> + [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc),
> + (i32 VReg_32:$vaddr))]> {
> +
> + let mayLoad = 0;
> + let mayStore = 1;
> +
> + // Encoding
> + let offset = 0;
> + let offen = 1;
> + let idxen = 0;
> + let glc = 0;
> + let addr64 = 0;
> + let lds = 0;
> + let slc = 0;
> + let tfe = 0;
> + let soffset = 128; // ZERO
> +}
> +
> class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
> op,
> (outs regClass:$dst),
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 44f35b9..a01e096 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -414,8 +414,14 @@ def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <
> //def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
> //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
> //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
> -//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
> -//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
> +
> +def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
> + 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32
> +>;
> +
> +def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
> + 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
> +>;
> //def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
> //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
> //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
> diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
> index 3dcad50..af74123 100644
> --- a/lib/Target/R600/SIRegisterInfo.td
> +++ b/lib/Target/R600/SIRegisterInfo.td
> @@ -151,7 +151,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
> (add SGPR_64, VCCReg, EXECReg)
> >;
>
> -def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
> +def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;
>
> def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
>
> diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
> index 20f9dec..0e19e78 100644
> --- a/test/CodeGen/R600/imm.ll
> +++ b/test/CodeGen/R600/imm.ll
> @@ -1,8 +1,5 @@
> ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
>
> -; XXX: Enable once SI supports buffer stores
> -; XFAIL: *
> -
> ; CHECK: @i64_imm
> ; CHECK-NOT: S_MOV_B64
> define void @i64_imm(i64 addrspace(1)* %out) {
> diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
> new file mode 100644
> index 0000000..e5811ea
> --- /dev/null
> +++ b/test/CodeGen/R600/store.ll
> @@ -0,0 +1,11 @@
> +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
> +; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
> +
> +; CHECK: @store_float
> +; EG-CHECK: RAT_WRITE_CACHELESS_32_eg T{{[0-9]+\.X, T[0-9]+\.X}}, 1
> +; SI-CHECK: BUFFER_STORE_DWORD
> +
> +define void @store_float(float addrspace(1)* %out, float %in) {
> + store float %in, float addrspace(1)* %out
> + ret void
> +}
> -- 1.7.3.4
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130311/1020a534/attachment.html>
More information about the llvm-commits
mailing list