SI indirect addressing
Tom Stellard
tom at stellard.net
Fri Mar 15 11:49:06 PDT 2013
On Thu, Mar 14, 2013 at 04:46:08PM +0100, Christian König wrote:
> Hi guys,
>
> attached is the LLVM part of my indirect addressing patchset for SI,
> and despite what I said yesterday in our team meeting it indeed
> fixes 16 piglit tests without any regression.
>
> The Mesa part will follow in a few minutes; please review and comment.
>
For the series:
Reviewed-by: Tom Stellard <thomas.stellard at amd.com>
> Regards,
> Christian.
> From a8d1e5d0e302b79d637b055d1e58d37362ccfdcc Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Thu, 14 Mar 2013 15:46:33 +0100
> Subject: [PATCH 1/7] R600/SI: fix inserting waits for all defines
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Unfortunately the previous fix for inserting waits for unordered
> defines wasn't sufficient, because it's possible that even ordered
> defines are only partially used (or not used at all).
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/SIInsertWaits.cpp | 16 +---------------
> 1 file changed, 1 insertion(+), 15 deletions(-)
>
> diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
> index 67fbdf7..98bd3db 100644
> --- a/lib/Target/R600/SIInsertWaits.cpp
> +++ b/lib/Target/R600/SIInsertWaits.cpp
> @@ -302,21 +302,8 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
> Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
> }
>
> -bool SIInsertWaits::unorderedDefines(MachineInstr &MI) {
> -
> - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
> - if (TSFlags & SIInstrFlags::LGKM_CNT)
> - return true;
> -
> - if (TSFlags & SIInstrFlags::EXP_CNT)
> - return ExpInstrTypesSeen == 3;
> -
> - return false;
> -}
> -
> Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
>
> - bool UnorderedDefines = unorderedDefines(MI);
> Counters Result = ZeroCounts;
>
> // For each register affected by this
> @@ -329,8 +316,7 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
>
> if (Op.isDef()) {
> increaseCounters(Result, UsedRegs[j]);
> - if (UnorderedDefines)
> - increaseCounters(Result, DefinedRegs[j]);
> + increaseCounters(Result, DefinedRegs[j]);
> }
>
> if (Op.isUse())
> --
> 1.7.10.4
>
> From cc732c836a019ab5bab6d57365d5fd1d69346822 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Tue, 26 Feb 2013 19:04:28 +0100
> Subject: [PATCH 2/7] R600/SI: enable all S_LOAD and S_BUFFER_LOAD opcodes
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/SIInstrInfo.td | 7 ++++---
> lib/Target/R600/SIInstructions.td | 36 +++++++++++++++++++++++++-----------
> 2 files changed, 29 insertions(+), 14 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 260c651..2f10c38 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -115,16 +115,17 @@ class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
> opName#" $dst, $src0", pattern
> >;
>
> -multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> {
> +multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
> + RegisterClass dstClass> {
> def _IMM : SMRD <
> op, 1, (outs dstClass:$dst),
> - (ins SReg_64:$sbase, i32imm:$offset),
> + (ins baseClass:$sbase, i32imm:$offset),
> asm#" $dst, $sbase, $offset", []
> >;
>
> def _SGPR : SMRD <
> op, 0, (outs dstClass:$dst),
> - (ins SReg_64:$sbase, SReg_32:$soff),
> + (ins baseClass:$sbase, SReg_32:$soff),
> asm#" $dst, $sbase, $soff", []
> >;
> }
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 0ab9e4e..e8247a8 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -458,17 +458,31 @@ def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORM
>
> let mayLoad = 1 in {
>
> -defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>;
> -
> -//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
> -defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>;
> -defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>;
> -//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
> -//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
> -//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
> -//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
> -//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
> -//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
> +defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
> +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
> +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
> +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
> +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
> +
> +defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
> + 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
> +>;
> +
> +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
> + 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
> +>;
> +
> +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
> + 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
> +>;
> +
> +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
> + 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
> +>;
> +
> +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
> + 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
> +>;
>
> } // mayLoad = 1
>
> --
> 1.7.10.4
>
> From 12a0a5bd073b3e9b4d421f3783915f32413ee86d Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Wed, 27 Feb 2013 10:39:50 +0100
> Subject: [PATCH 3/7] R600/SI: implement SI.load.const intrinsic
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/SIInstructions.td | 12 ++++++++++++
> lib/Target/R600/SIIntrinsics.td | 3 +--
> 2 files changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index e8247a8..5e8ab0c 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1403,6 +1403,18 @@ def : Pat <
> (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
> >;
>
> +// 1. Offset as 8bit DWORD immediate
> +def : Pat <
> + (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset),
> + (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset)
> +>;
> +
> +// 2. Offset loaded in an 32bit SGPR
> +def : Pat <
> + (int_SI_load_const SReg_128:$sbase, imm:$offset),
> + (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
> +>;
> +
> /********** ================== **********/
> /********** VOP3 Patterns **********/
> /********** ================== **********/
> diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
> index 7c23d17..33bb815 100644
> --- a/lib/Target/R600/SIIntrinsics.td
> +++ b/lib/Target/R600/SIIntrinsics.td
> @@ -16,8 +16,7 @@ let TargetPrefix = "SI", isTarget = 1 in {
>
> def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
> def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
> - /* XXX: We may need a seperate intrinsic here for loading integer values */
> - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
> + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrReadMem]>;
> def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
> def int_SI_wqm : Intrinsic <[], [], []>;
>
> --
> 1.7.10.4
>
> From 6e556f1c42d03fe9647efd47d91ffe8fa360a747 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Wed, 27 Feb 2013 13:30:33 +0100
> Subject: [PATCH 4/7] R600/SI: add BUFFER_LOAD_DWORD pattern
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/SIInstructions.td | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 5e8ab0c..c14e8a3 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -403,9 +403,9 @@ def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT
> //def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
> //def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
> //def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
> -//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
> -//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
> -//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
> +def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
> +def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
> +def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
> //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
> //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
> //def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
> @@ -1415,6 +1415,12 @@ def : Pat <
> (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
> >;
>
> +// 3. Offset in an 32Bit VGPR
> +def : Pat <
> + (int_SI_load_const SReg_128:$sbase, VReg_32:$voff),
> + (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0)
> +>;
> +
> /********** ================== **********/
> /********** VOP3 Patterns **********/
> /********** ================== **********/
> --
> 1.7.10.4
>
> From 2ffd8c39d9ab184070e5e6c284b490809c75a865 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Thu, 28 Feb 2013 15:41:23 +0100
> Subject: [PATCH 5/7] R600/SI: add shl pattern
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/SIISelLowering.cpp | 4 ++++
> lib/Target/R600/SIISelLowering.h | 1 +
> lib/Target/R600/SIInstructions.td | 4 +++-
> 3 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 063f5fa..7a1cd94 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -229,6 +229,10 @@ EVT SITargetLowering::getSetCCResultType(EVT VT) const {
> return MVT::i1;
> }
>
> +MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
> + return MVT::i32;
> +}
> +
> //===----------------------------------------------------------------------===//
> // Custom DAG Lowering Operations
> //===----------------------------------------------------------------------===//
> diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
> index 0411565..d656225 100644
> --- a/lib/Target/R600/SIISelLowering.h
> +++ b/lib/Target/R600/SIISelLowering.h
> @@ -48,6 +48,7 @@ public:
> virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
> MachineBasicBlock * BB) const;
> virtual EVT getSetCCResultType(EVT VT) const;
> + virtual MVT getScalarShiftAmountTy(EVT VT) const;
> virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
> virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
> virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index c14e8a3..7593b94 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -854,7 +854,9 @@ defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
> defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
> defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
> defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
> -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
> +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
> + [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))]
> +>;
> defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
>
> let isCommutable = 1 in {
> --
> 1.7.10.4
>
> From 1c5ef6a49767b100e0417ed88d94e816484153c7 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Wed, 6 Mar 2013 20:02:10 +0100
> Subject: [PATCH 6/7] R600/SI: add float vector types
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/AMDGPUInstructions.td | 4 +-
> lib/Target/R600/R600Instructions.td | 4 +-
> lib/Target/R600/SIInstructions.td | 81 +++++++++++++++++++++++++++++----
> lib/Target/R600/SIRegisterInfo.td | 14 +++---
> 4 files changed, 82 insertions(+), 21 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
> index a59c775..e740348 100644
> --- a/lib/Target/R600/AMDGPUInstructions.td
> +++ b/lib/Target/R600/AMDGPUInstructions.td
> @@ -202,8 +202,8 @@ class Vector2_Build <ValueType vecType, RegisterClass vectorClass,
> (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1)
> >;
>
> -class Vector_Build <ValueType vecType, RegisterClass vectorClass,
> - ValueType elemType, RegisterClass elemClass> : Pat <
> +class Vector4_Build <ValueType vecType, RegisterClass vectorClass,
> + ValueType elemType, RegisterClass elemClass> : Pat <
> (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
> (elemType elemClass:$z), (elemType elemClass:$w))),
> (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
> diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
> index c5fa334..8c50d54 100644
> --- a/lib/Target/R600/R600Instructions.td
> +++ b/lib/Target/R600/R600Instructions.td
> @@ -1979,8 +1979,8 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
> def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
> def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
>
> -def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
> -def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
> +def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
> +def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
>
> // bitconvert patterns
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 7593b94..5a3c901 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1257,22 +1257,83 @@ defm : SamplePatterns<VReg_128, v4i32>;
> defm : SamplePatterns<VReg_256, v8i32>;
> defm : SamplePatterns<VReg_512, v16i32>;
>
> -def : Extract_Element <f32, v4f32, VReg_128, 0, sub0>;
> -def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>;
> -def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>;
> -def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>;
> +/********** ============================================ **********/
> +/********** Extraction, Insertion, Building and Casting **********/
> +/********** ============================================ **********/
>
> -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>;
> -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>;
> -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>;
> -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>;
> +foreach Index = 0-2 in {
> + def Extract_Element_v2i32_#Index : Extract_Element <
> + i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v2i32_#Index : Insert_Element <
> + i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +
> + def Extract_Element_v2f32_#Index : Extract_Element <
> + f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v2f32_#Index : Insert_Element <
> + f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +}
> +
> +foreach Index = 0-3 in {
> + def Extract_Element_v4i32_#Index : Extract_Element <
> + i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v4i32_#Index : Insert_Element <
> + i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +
> + def Extract_Element_v4f32_#Index : Extract_Element <
> + f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v4f32_#Index : Insert_Element <
> + f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +}
> +
> +foreach Index = 0-7 in {
> + def Extract_Element_v8i32_#Index : Extract_Element <
> + i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v8i32_#Index : Insert_Element <
> + i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +
> + def Extract_Element_v8f32_#Index : Extract_Element <
> + f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v8f32_#Index : Insert_Element <
> + f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +}
> +
> +foreach Index = 0-15 in {
> + def Extract_Element_v16i32_#Index : Extract_Element <
> + i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v16i32_#Index : Insert_Element <
> + i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +
> + def Extract_Element_v16f32_#Index : Extract_Element <
> + f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> + def Insert_Element_v16f32_#Index : Insert_Element <
> + f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
> + >;
> +}
>
> def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
> def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
> -def : Vector_Build <v4f32, VReg_128, f32, VReg_32>;
> -def : Vector_Build <v4i32, VReg_128, i32, VReg_32>;
> +def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>;
> +def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>;
> +def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>;
> def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
> +def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>;
> def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
> +def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>;
>
> def : BitConvert <i32, f32, SReg_32>;
> def : BitConvert <i32, f32, VReg_32>;
> diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
> index 3dcad50..4f14931 100644
> --- a/lib/Target/R600/SIRegisterInfo.td
> +++ b/lib/Target/R600/SIRegisterInfo.td
> @@ -158,15 +158,15 @@ def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
> def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
>
> // Register class for all vector registers (VGPRs + Interploation Registers)
> -def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32, (add VGPR_32)>;
> +def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
>
> -def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>;
> +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
>
> -def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>;
> +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
>
> -def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>;
> +def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
>
> -def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
> +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
>
> //===----------------------------------------------------------------------===//
> // [SV]Src_* register classes, can have either an immediate or an register
> @@ -174,9 +174,9 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
>
> def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
>
> -def SSrc_64 : RegisterClass<"AMDGPU", [i64, i1], 64, (add SReg_64)>;
> +def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
>
> def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
>
> -def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add VReg_64, SReg_64)>;
> +def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
>
> --
> 1.7.10.4
>
> From e8cbc46824395e93b5983aaa7ec7d8809179331a Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig at amd.com>
> Date: Wed, 6 Mar 2013 20:33:42 +0100
> Subject: [PATCH 7/7] R600/SI: implement indirect adressing for SI
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> lib/Target/R600/SIISelLowering.cpp | 5 ++
> lib/Target/R600/SIInstructions.td | 69 +++++++++++++++++++
> lib/Target/R600/SILowerControlFlow.cpp | 117 +++++++++++++++++++++++++++++++-
> 3 files changed, 190 insertions(+), 1 deletion(-)
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 7a1cd94..93f8c38 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -58,6 +58,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
>
> computeRegisterProperties();
>
> + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
> + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
> + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
> + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
> +
> setOperationAction(ISD::ADD, MVT::i64, Legal);
> setOperationAction(ISD::ADD, MVT::i32, Legal);
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 5a3c901..05b04a9 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1149,6 +1149,31 @@ def SI_KILL : InstSI <
> } // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
> // Uses = [EXEC], Defs = [EXEC]
>
> +let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
> +
> +def SI_INDIRECT_SRC : InstSI <
> + (outs VReg_32:$dst, SReg_64:$temp),
> + (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
> + "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
> + []
> +>;
> +
> +class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
> + (outs rc:$dst, SReg_64:$temp),
> + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
> + "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
> + []
> +> {
> + let Constraints = "$src = $dst";
> +}
> +
> +def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
> +def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
> +def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
> +def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
> +
> +} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
> +
> } // end IsCodeGenOnly, isPseudo
>
> def : Pat<
> @@ -1521,4 +1546,48 @@ defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
> defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
> defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
>
> +/********** ====================== **********/
> +/********** Indirect adressing **********/
> +/********** ====================== **********/
> +
> +multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
> + SI_INDIRECT_DST IndDst> {
> + // 1. Extract with offset
> + def : Pat<
> + (vector_extract (vt rc:$vec),
> + (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
> + ),
> + (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
> + >;
> +
> + // 2. Extract without offset
> + def : Pat<
> + (vector_extract (vt rc:$vec),
> + (i64 (zext (i32 VReg_32:$idx)))
> + ),
> + (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
> + >;
> +
> + // 3. Insert with offset
> + def : Pat<
> + (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
> + (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
> + ),
> + (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
> + >;
> +
> + // 4. Insert without offset
> + def : Pat<
> + (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
> + (i64 (zext (i32 VReg_32:$idx)))
> + ),
> + (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
> + >;
> +}
> +
> +defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
> +defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
> +defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
> +defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
> +
> } // End isSI predicate
> diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
> index b215aa2..9a027e7 100644
> --- a/lib/Target/R600/SILowerControlFlow.cpp
> +++ b/lib/Target/R600/SILowerControlFlow.cpp
> @@ -66,6 +66,7 @@ private:
> static const unsigned SkipThreshold = 12;
>
> static char ID;
> + const TargetRegisterInfo *TRI;
> const TargetInstrInfo *TII;
>
> bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
> @@ -84,9 +85,14 @@ private:
> void Kill(MachineInstr &MI);
> void Branch(MachineInstr &MI);
>
> + void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
> + void IndirectSrc(MachineInstr &MI);
> + void IndirectDst(MachineInstr &MI);
> +
> public:
> SILowerControlFlowPass(TargetMachine &tm) :
> - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
> + MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
> + TII(tm.getInstrInfo()) { }
>
> virtual bool runOnMachineFunction(MachineFunction &MF);
>
> @@ -302,6 +308,104 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
> MI.eraseFromParent();
> }
>
> +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
> +
> + MachineBasicBlock &MBB = *MI.getParent();
> + DebugLoc DL = MI.getDebugLoc();
> + MachineBasicBlock::iterator I = MI;
> +
> + unsigned Save = MI.getOperand(1).getReg();
> + unsigned Idx = MI.getOperand(3).getReg();
> +
> + if (AMDGPU::SReg_32RegClass.contains(Idx)) {
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
> + .addReg(Idx);
> + MBB.insert(I, MovRel);
> + MI.eraseFromParent();
> + return;
> + }
> +
> + assert(AMDGPU::SReg_64RegClass.contains(Save));
> + assert(AMDGPU::VReg_32RegClass.contains(Idx));
> +
> + // Save the EXEC mask
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
> + .addReg(AMDGPU::EXEC);
> +
> + // Read the next variant into VCC (lower 32 bits) <- also loop target
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
> + .addReg(Idx);
> +
> + // Move index from VCC into M0
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
> + .addReg(AMDGPU::VCC);
> +
> + // Compare the just read M0 value to all possible Idx values
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
> + .addReg(AMDGPU::M0)
> + .addReg(Idx);
> +
> + // Update EXEC, save the original EXEC value to VCC
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
> + .addReg(AMDGPU::VCC);
> +
> + // Do the actual move
> + MBB.insert(I, MovRel);
> +
> + // Update EXEC, switch all done bits to 0 and all todo bits to 1
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
> + .addReg(AMDGPU::EXEC)
> + .addReg(AMDGPU::VCC);
> +
> + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> + .addImm(-7)
> + .addReg(AMDGPU::EXEC);
> +
> + // Restore EXEC
> + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
> + .addReg(Save);
> +
> + MI.eraseFromParent();
> +}
> +
> +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
> +
> + MachineBasicBlock &MBB = *MI.getParent();
> + DebugLoc DL = MI.getDebugLoc();
> +
> + unsigned Dst = MI.getOperand(0).getReg();
> + unsigned Vec = MI.getOperand(2).getReg();
> + unsigned Off = MI.getOperand(4).getImm();
> +
> + MachineInstr *MovRel =
> + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
> + .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
> + .addReg(AMDGPU::M0, RegState::Implicit)
> + .addReg(Vec, RegState::Implicit);
> +
> + LoadM0(MI, MovRel);
> +}
> +
> +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
> +
> + MachineBasicBlock &MBB = *MI.getParent();
> + DebugLoc DL = MI.getDebugLoc();
> +
> + unsigned Dst = MI.getOperand(0).getReg();
> + unsigned Off = MI.getOperand(4).getImm();
> + unsigned Val = MI.getOperand(5).getReg();
> +
> + MachineInstr *MovRel =
> + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
> + .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
> + .addReg(Val)
> + .addReg(AMDGPU::M0, RegState::Implicit)
> + .addReg(Dst, RegState::Implicit);
> +
> + LoadM0(MI, MovRel);
> +}
> +
> bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>
> bool HaveKill = false;
> @@ -363,6 +467,17 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
> case AMDGPU::S_BRANCH:
> Branch(MI);
> break;
> +
> + case AMDGPU::SI_INDIRECT_SRC:
> + IndirectSrc(MI);
> + break;
> +
> + case AMDGPU::SI_INDIRECT_DST_V2:
> + case AMDGPU::SI_INDIRECT_DST_V4:
> + case AMDGPU::SI_INDIRECT_DST_V8:
> + case AMDGPU::SI_INDIRECT_DST_V16:
> + IndirectDst(MI);
> + break;
> }
> }
> }
> --
> 1.7.10.4
>
More information about the llvm-commits
mailing list