[PATCH] R600/SI: Use VOP2 K instructions
Tom Stellard
tom at stellard.net
Wed Dec 31 19:35:59 PST 2014
On Sun, Dec 21, 2014 at 03:40:15PM -0500, Matt Arsenault wrote:
> Hi,
>
> These enable using the v_madak_f32 and v_madmk_f32 instructions
>
>
> From 735dc8a34b77ef5be66faf603f95dd45de31ff1b Mon Sep 17 00:00:00 2001
> From: Matt Arsenault <Matthew.Arsenault at amd.com>
> Date: Wed, 30 Jul 2014 23:00:19 -0700
> Subject: [PATCH 2/3] R600/SI: Try to use v_madak_f32
>
> ---
> lib/Target/R600/SIInstrFormats.td | 15 ++++
> lib/Target/R600/SIInstrInfo.cpp | 75 +++++++++++++++++++
> lib/Target/R600/SIInstrInfo.h | 3 +
> lib/Target/R600/SIInstrInfo.td | 13 +++-
> lib/Target/R600/SIInstructions.td | 4 +-
> test/CodeGen/R600/madak.ll | 154 ++++++++++++++++++++++++++++++++++++++
> 6 files changed, 261 insertions(+), 3 deletions(-)
> create mode 100644 test/CodeGen/R600/madak.ll
>
> diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
> index ff8db67..27fce82 100644
> --- a/lib/Target/R600/SIInstrFormats.td
> +++ b/lib/Target/R600/SIInstrFormats.td
> @@ -313,6 +313,12 @@ class VOP2e <bits<6> op> : Enc32 {
> let Inst{31} = 0x0; //encoding
> }
>
> +// Special case of VOP2 with a 3rd, pseudo operand that is a 32-bit
> +// literal.
> +class VOP2Ke<bits<6> op> : VOP2e<op> {
> + field bits<32> src2;
> +}
> +
I don't think src2 is actually being encoded: VOP2Ke only declares the
field and never maps it onto any bits of the instruction. See the attached
patch, which I wrote while working on the assembler. I think it may address
Marek's concerns too.
-Tom
> class VOP3e <bits<9> op> : Enc64 {
>
> bits<8> dst;
> @@ -560,6 +566,15 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
> class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
> VOP2Common <outs, ins, asm, pattern>, VOP2e<op>;
>
> +class VOP2K <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
> + InstSI <outs, ins, asm, pattern>, VOP2Ke<op> {
> + let UseNamedOperandTable = 1;
> + let VOP2 = 1;
> +}
> +
> +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
> + VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
> +
> class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
> VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index d0e8a6a..4bea990 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -801,6 +801,81 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
> return RC != &AMDGPU::EXECRegRegClass;
> }
>
> +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
> + unsigned Reg, MachineRegisterInfo *MRI) const {
> + if (!MRI->hasOneNonDBGUse(Reg))
> + return false;
> +
> + unsigned Opc = UseMI->getOpcode();
> + if (Opc == AMDGPU::V_MAD_F32) {
> + int Src2Idx = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> + AMDGPU::OpName::src2);
> + MachineOperand &Src2 = UseMI->getOperand(Src2Idx);
> +
> + if (Src2.isReg() && Src2.getReg() == Reg) {
> + const MachineOperand *Src0 = getNamedOperand(*UseMI,
> + AMDGPU::OpName::src0);
> + // The VOP2 src0 can't be an SGPR since the constant bus use will be the
> + // literal constant.
> + if (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))
> + return false;
> +
> + const MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
> +
> + // The VOP2 src1 can't be an inline immediate, and can't be an SGPR for
> + // the same reason as src0.
> + if (!Src1->isReg() ||
> + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
> + return false;
> +
> + // Don't fold if we are using source modifiers. VOP2 doesn't have them.
> + int Src0ModIdx
> + = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> + AMDGPU::OpName::src0_modifiers);
> + if (UseMI->getOperand(Src0ModIdx).getImm() != 0)
> + return false;
> +
> + int Src1ModIdx
> + = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> + AMDGPU::OpName::src1_modifiers);
> + if (UseMI->getOperand(Src1ModIdx).getImm() != 0)
> + return false;
> +
> +
> + const ConstantFP *Imm = DefMI->getOperand(1).getFPImm();
> +
> + // FIXME: This would be a lot easier if we could return a new instruction
> + // instead of having to modify in place.
> +
> + // Remove these first since they are at the end.
> + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> + AMDGPU::OpName::omod));
> + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> + AMDGPU::OpName::clamp));
> +
> + Src2.ChangeToFPImmediate(Imm);
> +
> + // These come before src2.
> + int Src2ModIdx
> + = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> + AMDGPU::OpName::src2_modifiers);
> + UseMI->RemoveOperand(Src2ModIdx);
> +
> + UseMI->RemoveOperand(Src1ModIdx);
> + UseMI->RemoveOperand(Src0ModIdx);
> + UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
> +
> + bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
> + if (DeleteDef)
> + DefMI->eraseFromParent();
> +
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> bool
> SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
> AliasAnalysis *AA) const {
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index 6d63816..65c09cd 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -132,6 +132,9 @@ public:
>
> bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
>
> + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
> + unsigned Reg, MachineRegisterInfo *MRI) const final;
> +
> bool isSALU(uint16_t Opcode) const {
> return get(Opcode).TSFlags & SIInstrFlags::SALU;
> }
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 0471c6d..b4220ce 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -661,7 +661,8 @@ class hasModifiers<ValueType SrcVT> {
> class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
> dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
> !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
> - (ins)));
> + !if(!eq(NumSrcArgs, 3), (ins Src0RC:$src0, Src1RC:$src1, f32imm:$src2), // VOP2K
> + (ins))));
> }
>
> // Returns the input arguments for VOP3 instructions for the given SrcVT.
> @@ -1036,6 +1037,16 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
> revOpSI, revOpVI, P.HasModifiers
> >;
>
> +// VOP2 K instructions are not available in a VOP3 encoding. The SI
> +// documentation lists it, but it is not in the CI documentation, and
> +// wouldn't really make sense anyway.
> +class VOP2KInst <vop2 op, string opName, string revOp = opName> : VOP2K <
> + op.SI,
> + (outs VReg_32:$dst),
> + (ins VSrc_32:$src0, VReg_32:$src1, f32imm:$src2),
> + opName#" $dst, $src0, $src1, $src2", []
> +>;
> +
> multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
> dag ins32, string asm32, list<dag> pat32,
> dag ins64, string asm64, list<dag> pat64,
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 463287e..6f9b33b 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1454,10 +1454,10 @@ defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32",
> defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
> } // End isCommutable = 1
>
> -defm V_MADMK_F32 : VOP2Inst <vop2<0x20, 0x17>, "v_madmk_f32", VOP_F32_F32_F32>;
> +def V_MADMK_F32 : VOP2KInst <vop2<0x20, 0x17>, "v_madmk_f32">;
>
> let isCommutable = 1 in {
> -defm V_MADAK_F32 : VOP2Inst <vop2<0x21, 0x18>, "v_madak_f32", VOP_F32_F32_F32>;
> +def V_MADAK_F32 : VOP2KInst <vop2<0x21, 0x18>, "v_madak_f32">;
> } // End isCommutable = 1
>
> let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
> diff --git a/test/CodeGen/R600/madak.ll b/test/CodeGen/R600/madak.ll
> new file mode 100644
> index 0000000..98e49f7
> --- /dev/null
> +++ b/test/CodeGen/R600/madak.ll
> @@ -0,0 +1,154 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +declare float @llvm.fabs.f32(float) nounwind readnone
> +
> +; SI-LABEL: {{^}}madak_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_madak_f32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
> +define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> + %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %a = load float addrspace(1)* %in.a.gep, align 4
> + %b = load float addrspace(1)* %in.b.gep, align 4
> +
> + %mul = fmul float %a, %b
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> +
> +; SI-LABEL: {{^}}madak_m_inline_imm_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
> +define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %a = load float addrspace(1)* %in.a.gep, align 4
> +
> + %mul = fmul float 4.0, %a
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> +
> +; Make sure nothing weird happens with a value that is also allowed as
> +; an inline immediate.
> +
> +; SI-LABEL: {{^}}madak_inline_imm_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], 4.0
> +define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> + %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %a = load float addrspace(1)* %in.a.gep, align 4
> + %b = load float addrspace(1)* %in.b.gep, align 4
> +
> + %mul = fmul float %a, %b
> + %madak = fadd float %mul, 4.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> +
> +; We can't use an SGPR when forming madak
> +; SI-LABEL: {{^}}s_v_madak_f32:
> +; SI: s_load_dword [[SB:s[0-9]+]]
> +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
> +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
> +; SI-NOT: v_madak_f32
> +; SI: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
> +define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %a = load float addrspace(1)* %in.a.gep, align 4
> +
> + %mul = fmul float %a, %b
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> +
> +; SI-LABEL: @v_s_madak_f32
> +; SI: s_load_dword [[SB:s[0-9]+]]
> +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
> +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
> +; SI-NOT: v_madak_f32
> +; SI: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
> +define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %b = load float addrspace(1)* %in.b.gep, align 4
> +
> + %mul = fmul float %a, %b
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> +
> +; SI-LABEL: {{^}}s_s_madak_f32:
> +; SI-NOT: v_madak_f32
> +; SI: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
> +define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
> + %mul = fmul float %a, %b
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; SI-LABEL: {{^}}no_madak_src0_modifier_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
> +; SI: s_endpgm
> +define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> + %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %a = load float addrspace(1)* %in.a.gep, align 4
> + %b = load float addrspace(1)* %in.b.gep, align 4
> +
> + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
> +
> + %mul = fmul float %a.fabs, %b
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> +
> +; SI-LABEL: {{^}}no_madak_src1_modifier_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
> +; SI: s_endpgm
> +define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> + %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> + %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> + %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> + %a = load float addrspace(1)* %in.a.gep, align 4
> + %b = load float addrspace(1)* %in.b.gep, align 4
> +
> + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
> +
> + %mul = fmul float %a, %b.fabs
> + %madak = fadd float %mul, 10.0
> + store float %madak, float addrspace(1)* %out.gep, align 4
> + ret void
> +}
> --
> 2.2.1
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-SI-Fix-mad-k-definitions.patch
Type: text/x-diff
Size: 3487 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20141231/b4a726ca/attachment.patch>
More information about the llvm-commits mailing list