[PATCHES] AMDGPU/SI: Small ISel improvements

Wed Oct 14 11:37:29 PDT 2015

On Wed, Oct 14, 2015 at 8:32 PM, Matt Arsenault
<Matthew.Arsenault at amd.com> wrote:
> On 10/14/2015 11:15 AM, Marek Olšák via llvm-commits wrote:
>
> Please review.
>
> Marek
>
>
> 0001-AMDGPU-SI-use-S_AND-for-i1-trunc.patch
>
> From d7b62060d11f2d1c303438b45bb7d29b8ba012ca Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Wed, 7 Oct 2015 02:51:54 +0200
> Subject: [PATCH 1/4] AMDGPU/SI: use S_AND for i1 trunc
>
> ---
>  lib/Target/AMDGPU/SIInstructions.td | 4 ++--
>  test/CodeGen/AMDGPU/trunc.ll        | 8 ++++----
>  2 files changed, 6 insertions(+), 6 deletions(-)
>
> LGTM
>
>
> 0002-AMDGPU-SI-use-S_OR-for-fneg-fabs-f32.patch
>
> From 17a7e3f389813d7823179c5f6f5d7794edd97650 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Wed, 7 Oct 2015 03:02:42 +0200
> Subject: [PATCH 2/4] AMDGPU/SI: use S_OR for fneg (fabs f32)
>
> ---
>  lib/Target/AMDGPU/SIInstructions.td |  3 +--
>  test/CodeGen/AMDGPU/fneg-fabs.ll    | 27 +++++++++------------------
>  2 files changed, 10 insertions(+), 20 deletions(-)
>
> LGTM
>
>
>
>
> 0003-AMDGPU-SI-select-S_ABS_I32-when-possible.patch
>
> From 96d649f157b8e77d2313f8d2f7a1b27a15279145 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Sat, 10 Oct 2015 21:23:23 +0200
> Subject: [PATCH 3/4] AMDGPU/SI: select S_ABS_I32 when possible
>
> ---
>  lib/Target/AMDGPU/SIInstrInfo.cpp      | 25 +++++++++++++++++++++++++
>  lib/Target/AMDGPU/SIInstructions.td    |  5 +++++
>  test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll |  4 +---
>  test/CodeGen/AMDGPU/sminmax.ll         | 24 ++++++++++++++++++++++++
>  4 files changed, 55 insertions(+), 3 deletions(-)
>  create mode 100644 test/CodeGen/AMDGPU/sminmax.ll
>
> diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp
> b/lib/Target/AMDGPU/SIInstrInfo.cpp
> index 1af08a8..d7904b0 100644
> --- a/lib/Target/AMDGPU/SIInstrInfo.cpp
> +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
> @@ -2290,6 +2290,31 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst)
> const {
>        }
>        break;
>
> +    case AMDGPU::S_ABS_I32: {
> +      MachineBasicBlock &MBB = *Inst->getParent();
> +      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
> +      MachineBasicBlock::iterator MII = Inst;
> +      DebugLoc DL = Inst->getDebugLoc();
> +
> +      MachineOperand &Dest = Inst->getOperand(0);
> +      MachineOperand &Src = Inst->getOperand(1);
> +      unsigned TmpReg =
> MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +      unsigned ResultReg =
> MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> +
> +      BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
> +        .addImm(0)
> +        .addReg(Src.getReg());
>
>
> I think this could break if Src is a subregister. This probably needs what
> splitScalar64BitUnaryOp does to handle subregisters. I would also prefer
> splitting this into a separate function
>
> +
> +      BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
> +        .addReg(Src.getReg())
> +        .addReg(TmpReg);
> +
> +      MRI.replaceRegWith(Dest.getReg(), ResultReg);
> +      addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
> +      Inst->eraseFromParent();
> +      continue;
> +    }
> +
>      case AMDGPU::S_BFE_U64:
>      case AMDGPU::S_BFM_B64:
>        llvm_unreachable("Moving this op to VALU not implemented");
> diff --git a/lib/Target/AMDGPU/SIInstructions.td
> b/lib/Target/AMDGPU/SIInstructions.td
> index ed75b4d..9e0046b 100644
> --- a/lib/Target/AMDGPU/SIInstructions.td
> +++ b/lib/Target/AMDGPU/SIInstructions.td
> @@ -2177,6 +2177,11 @@ def : Pat <
>       (S_MOV_B32 0), sub1))
>  >;
>
> +def : Pat <
> +  (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
> +  (S_ABS_I32 $x)
> +>;
> +
>
> //===----------------------------------------------------------------------===//
>  // SOP2 Patterns
>
> //===----------------------------------------------------------------------===//
> diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
> b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
> index 8bf094b..ca8ddba 100644
> --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
> +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
> @@ -8,9 +8,7 @@ declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
>  declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
>
>  ; FUNC-LABEL: {{^}}s_abs_i32:
> -; SI: s_sub_i32
> -; SI: s_max_i32
> -; SI: s_endpgm
> +; SI: s_abs_i32
>
>  ; EG: SUB_INT
>  ; EG: MAX_INT
> diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
> new file mode 100644
> index 0000000..a481965
> --- /dev/null
> +++ b/test/CodeGen/AMDGPU/sminmax.ll
> @@ -0,0 +1,24 @@
> +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck
> -check-prefix=GCN -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck
> -check-prefix=GCN -check-prefix=FUNC %s
> +
> +; FUNC-LABEL: {{^}}s_abs_i32:
> +; GCN: s_abs_i32
> +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
> +  %neg = sub i32 0, %val
> +  %cond = icmp sgt i32 %val, %neg
> +  %res = select i1 %cond, i32 %val, i32 %neg
> +  store i32 %res, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}v_abs_i32:
> +; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
> +; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
> +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src)
> nounwind {
> +  %val = load i32, i32 addrspace(1)* %src, align 4
> +  %neg = sub i32 0, %val
> +  %cond = icmp sgt i32 %val, %neg
> +  %res = select i1 %cond, i32 %val, i32 %neg
> +  store i32 %res, i32 addrspace(1)* %out, align 4
> +  ret void
> +}
> --
> 2.1.4
>
>
> Can you add another testcase where the output has another scalar instruction
> use before the store, and another for v2i32/v4i32 to make sure subregister
> sources work
>
>
>
>
> 0004-AMDGPU-SI-handle-undef-for-llvm.SI.packf16.patch
>
> From 0d0216ee98873d1c47af89d330a1af1fef9fcd18 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Sun, 11 Oct 2015 21:40:37 +0200
> Subject: [PATCH 4/4] AMDGPU/SI: handle undef for llvm.SI.packf16
>
> ---
>  lib/Target/AMDGPU/SIISelLowering.cpp   |  5 +++++
>  test/CodeGen/AMDGPU/llvm.SI.packf16.ll | 29 +++++++++++++++++++++++++++++
>  2 files changed, 34 insertions(+)
>  create mode 100644 test/CodeGen/AMDGPU/llvm.SI.packf16.ll
>
> diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp
> b/lib/Target/AMDGPU/SIISelLowering.cpp
> index 804b5e6..8d7a4d1 100644
> --- a/lib/Target/AMDGPU/SIISelLowering.cpp
> +++ b/lib/Target/AMDGPU/SIISelLowering.cpp
> @@ -1091,6 +1091,11 @@ SDValue
> SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
>                         DAG.getConstant(2, DL, MVT::i32), // P0
>                         Op.getOperand(1), Op.getOperand(2), Glue);
>    }
> +  case AMDGPUIntrinsic::SI_packf16:
> +    if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
>
> Should this be ||?

No. If either operand is undef, only the corresponding 16 bits are
undefined, the other 16 bits are defined and should be preserved.

Marek