[PATCH] R600/SI: Use VOP2 K instructions

Wed Dec 31 19:35:59 PST 2014

On Sun, Dec 21, 2014 at 03:40:15PM -0500, Matt Arsenault wrote:
> Hi,
> 
> These enable using the v_madak_f32 and v_madmk_f32 instructions.
> 
> 

> From 735dc8a34b77ef5be66faf603f95dd45de31ff1b Mon Sep 17 00:00:00 2001
> From: Matt Arsenault <Matthew.Arsenault at amd.com>
> Date: Wed, 30 Jul 2014 23:00:19 -0700
> Subject: [PATCH 2/3] R600/SI: Try to use v_madak_f32
> 
> ---
>  lib/Target/R600/SIInstrFormats.td |  15 ++++
>  lib/Target/R600/SIInstrInfo.cpp   |  75 +++++++++++++++++++
>  lib/Target/R600/SIInstrInfo.h     |   3 +
>  lib/Target/R600/SIInstrInfo.td    |  13 +++-
>  lib/Target/R600/SIInstructions.td |   4 +-
>  test/CodeGen/R600/madak.ll        | 154 ++++++++++++++++++++++++++++++++++++++
>  6 files changed, 261 insertions(+), 3 deletions(-)
>  create mode 100644 test/CodeGen/R600/madak.ll
> 
> diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
> index ff8db67..27fce82 100644
> --- a/lib/Target/R600/SIInstrFormats.td
> +++ b/lib/Target/R600/SIInstrFormats.td
> @@ -313,6 +313,12 @@ class VOP2e <bits<6> op> : Enc32 {
>    let Inst{31} = 0x0; //encoding
>  }
>  
> +// Special case of VOP2 with a 3rd, pseudo operand that is a 32-bit
> +// literal.
> +class VOP2Ke<bits<6> op> : VOP2e<op> {
> +  field bits<32> src2;
> +}
> +

I don't think src2 is actually being encoded: VOP2Ke declares the field
but never assigns it to any bits of Inst.  See the attached patch,
which I wrote while working on the assembler.  I think it may address
Marek's concerns too.

-Tom

>  class VOP3e <bits<9> op> : Enc64 {
>  
>    bits<8> dst;
> @@ -560,6 +566,15 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
>  class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
>      VOP2Common <outs, ins, asm, pattern>, VOP2e<op>;
>  
> +class VOP2K <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
> +    InstSI <outs, ins, asm, pattern>, VOP2Ke<op> {
> +  let UseNamedOperandTable = 1;
> +  let VOP2 = 1;
> +}
> +
> +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
> +    VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
> +
>  class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
>      VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
>  
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index d0e8a6a..4bea990 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -801,6 +801,81 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
>    return RC != &AMDGPU::EXECRegRegClass;
>  }
>  
> +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
> +                                unsigned Reg, MachineRegisterInfo *MRI) const {
> +  if (!MRI->hasOneNonDBGUse(Reg))
> +    return false;
> +
> +  unsigned Opc = UseMI->getOpcode();
> +  if (Opc == AMDGPU::V_MAD_F32) {
> +    int Src2Idx = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> +                                             AMDGPU::OpName::src2);
> +    MachineOperand &Src2 = UseMI->getOperand(Src2Idx);
> +
> +    if (Src2.isReg() && Src2.getReg() == Reg) {
> +      const MachineOperand *Src0 = getNamedOperand(*UseMI,
> +                                                   AMDGPU::OpName::src0);
> +      // The VOP2 src0 can't be an SGPR since the constant bus use will be the
> +      // literal constant.
> +      if (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))
> +        return false;
> +
> +      const MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
> +
> +      // The VOP2 src1 can't be an inline immediate, and can't be an SGPR for
> +      // the same reason as src0.
> +      if (!Src1->isReg() ||
> +          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
> +        return false;
> +
> +      // Don't fold if we are using source modifiers. VOP2 doesn't have them.
> +      int Src0ModIdx
> +        = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> +                                     AMDGPU::OpName::src0_modifiers);
> +      if (UseMI->getOperand(Src0ModIdx).getImm() != 0)
> +        return false;
> +
> +      int Src1ModIdx
> +        = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> +                                     AMDGPU::OpName::src1_modifiers);
> +      if (UseMI->getOperand(Src1ModIdx).getImm() != 0)
> +        return false;
> +
> +
> +      const ConstantFP *Imm = DefMI->getOperand(1).getFPImm();
> +
> +      // FIXME: This would be a lot easier if we could return a new instruction
> +      // instead of having to modify in place.
> +
> +      // Remove these first since they are at the end.
> +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> +                                                      AMDGPU::OpName::omod));
> +      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> +                                                      AMDGPU::OpName::clamp));
> +
> +      Src2.ChangeToFPImmediate(Imm);
> +
> +      // These come before src2.
> +      int Src2ModIdx
> +        = AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
> +                                     AMDGPU::OpName::src2_modifiers);
> +      UseMI->RemoveOperand(Src2ModIdx);
> +
> +      UseMI->RemoveOperand(Src1ModIdx);
> +      UseMI->RemoveOperand(Src0ModIdx);
> +      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
> +
> +      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
> +      if (DeleteDef)
> +        DefMI->eraseFromParent();
> +
> +      return true;
> +    }
> +  }
> +
> +  return false;
> +}
> +
>  bool
>  SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
>                                           AliasAnalysis *AA) const {
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index 6d63816..65c09cd 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -132,6 +132,9 @@ public:
>  
>    bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
>  
> +  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
> +                     unsigned Reg, MachineRegisterInfo *MRI) const final;
> +
>    bool isSALU(uint16_t Opcode) const {
>      return get(Opcode).TSFlags & SIInstrFlags::SALU;
>    }
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 0471c6d..b4220ce 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -661,7 +661,8 @@ class hasModifiers<ValueType SrcVT> {
>  class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
>    dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0),               // VOP1
>              !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
> -                                    (ins)));
> +            !if(!eq(NumSrcArgs, 3), (ins Src0RC:$src0, Src1RC:$src1, f32imm:$src2), // VOP2K
> +                                    (ins))));
>  }
>  
>  // Returns the input arguments for VOP3 instructions for the given SrcVT.
> @@ -1036,6 +1037,16 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
>    revOpSI, revOpVI, P.HasModifiers
>  >;
>  
> +// VOP2 K instructions are not available in a VOP3 encoding. The SI
> +// documentation lists it, but it is not in the CI documentation, and
> +// wouldn't really make sense anyway.
> +class VOP2KInst <vop2 op, string opName, string revOp = opName> : VOP2K <
> +  op.SI,
> +  (outs VReg_32:$dst),
> +  (ins VSrc_32:$src0, VReg_32:$src1, f32imm:$src2),
> +  opName#" $dst, $src0, $src1, $src2", []
> +>;
> +
>  multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
>                           dag ins32, string asm32, list<dag> pat32,
>                           dag ins64, string asm64, list<dag> pat64,
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 463287e..6f9b33b 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1454,10 +1454,10 @@ defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32",
>  defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
>  } // End isCommutable = 1
>  
> -defm V_MADMK_F32 : VOP2Inst <vop2<0x20, 0x17>, "v_madmk_f32", VOP_F32_F32_F32>;
> +def V_MADMK_F32 : VOP2KInst <vop2<0x20, 0x17>, "v_madmk_f32">;
>  
>  let isCommutable = 1 in {
> -defm V_MADAK_F32 : VOP2Inst <vop2<0x21, 0x18>, "v_madak_f32", VOP_F32_F32_F32>;
> +def V_MADAK_F32 : VOP2KInst <vop2<0x21, 0x18>, "v_madak_f32">;
>  } // End isCommutable = 1
>  
>  let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
> diff --git a/test/CodeGen/R600/madak.ll b/test/CodeGen/R600/madak.ll
> new file mode 100644
> index 0000000..98e49f7
> --- /dev/null
> +++ b/test/CodeGen/R600/madak.ll
> @@ -0,0 +1,154 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> +declare float @llvm.fabs.f32(float) nounwind readnone
> +
> +; SI-LABEL: {{^}}madak_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_madak_f32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
> +define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %a = load float addrspace(1)* %in.a.gep, align 4
> +  %b = load float addrspace(1)* %in.b.gep, align 4
> +
> +  %mul = fmul float %a, %b
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: {{^}}madak_m_inline_imm_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
> +define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %a = load float addrspace(1)* %in.a.gep, align 4
> +
> +  %mul = fmul float 4.0, %a
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> +
> +; Make sure nothing weird happens with a value that is also allowed as
> +; an inline immediate.
> +
> +; SI-LABEL: {{^}}madak_inline_imm_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], 4.0
> +define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %a = load float addrspace(1)* %in.a.gep, align 4
> +  %b = load float addrspace(1)* %in.b.gep, align 4
> +
> +  %mul = fmul float %a, %b
> +  %madak = fadd float %mul, 4.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> +
> +; We can't use an SGPR when forming madak
> +; SI-LABEL: {{^}}s_v_madak_f32:
> +; SI: s_load_dword [[SB:s[0-9]+]]
> +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
> +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
> +; SI-NOT: v_madak_f32
> +; SI: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
> +define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %a = load float addrspace(1)* %in.a.gep, align 4
> +
> +  %mul = fmul float %a, %b
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @v_s_madak_f32
> +; SI: s_load_dword [[SB:s[0-9]+]]
> +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
> +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
> +; SI-NOT: v_madak_f32
> +; SI: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
> +define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %b = load float addrspace(1)* %in.b.gep, align 4
> +
> +  %mul = fmul float %a, %b
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: {{^}}s_s_madak_f32:
> +; SI-NOT: v_madak_f32
> +; SI: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
> +define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
> +  %mul = fmul float %a, %b
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: {{^}}no_madak_src0_modifier_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
> +; SI: s_endpgm
> +define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %a = load float addrspace(1)* %in.a.gep, align 4
> +  %b = load float addrspace(1)* %in.b.gep, align 4
> +
> +  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
> +
> +  %mul = fmul float %a.fabs, %b
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: {{^}}no_madak_src1_modifier_f32:
> +; SI: buffer_load_dword [[VA:v[0-9]+]]
> +; SI: buffer_load_dword [[VB:v[0-9]+]]
> +; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
> +; SI: s_endpgm
> +define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
> +
> +  %a = load float addrspace(1)* %in.a.gep, align 4
> +  %b = load float addrspace(1)* %in.b.gep, align 4
> +
> +  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
> +
> +  %mul = fmul float %a, %b.fabs
> +  %madak = fadd float %mul, 10.0
> +  store float %madak, float addrspace(1)* %out.gep, align 4
> +  ret void
> +}
> -- 
> 2.2.1
> 

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-SI-Fix-mad-k-definitions.patch
Type: text/x-diff
Size: 3487 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20141231/b4a726ca/attachment.patch>