[PATCHES] R600/SI: New V_FRACT fix, intrinsic for S_FLBIT_I32, and more
Tom Stellard
tom at stellard.net
Mon Mar 23 08:25:19 PDT 2015
LGTM.
On Fri, Mar 13, 2015 at 07:13:11PM +0100, Marek Olšák wrote:
> New patches are attached. I only added -enable-unsafe-fp-math to the
> tests, so that folding NEG modifiers can be tested. Please review.
>
> Marek
>
> On Thu, Mar 12, 2015 at 4:48 PM, Marek Olšák <maraeo at gmail.com> wrote:
> > So can I consider that patches 2 & 3 have your "LGTM"?
> >
> > Marek
> >
> > On Tue, Mar 10, 2015 at 11:28 PM, Matt Arsenault
> > <Matthew.Arsenault at amd.com> wrote:
> >> On 03/10/2015 03:21 PM, Marek Olšák wrote:
> >>>
> >>> If we didn't have to deal with fsub, only one pattern would be needed.
> >>> A possible solution is to expand fsub, so that it's translated into
> >>> (v_add_f32 a, -b), and then convert it to v_sub_f32 in the shrinking
> >>> pass if it's possible.
> >>>
> >>> The hardware internally expands (v_sub_f32 a, b) into (v_add_f32 a, -b)
> >>> anyway.
> >>>
> >>> Marek
> >>
> >> When I removed the fsub pseudo, I was originally going to handle it with the
> >> expanded pattern, but then I noticed the expansion for fsub did the same
> >> thing anyway and was slightly easier. Another option might be to do that
> From bd77870e24d575ed59ea591fb2d7c83ac98cadf0 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Sun, 1 Mar 2015 23:07:48 +0100
> Subject: [PATCH 1/2] R600/SI: Expand fract to floor, then only select V_FRACT
> on CI
>
> V_FRACT is buggy on SI.
>
> R600-specific code is left intact.
>
> v2: drop the multiclass, use complex VOP3 patterns
> ---
> lib/Target/R600/AMDGPUISelLowering.cpp | 3 ---
> lib/Target/R600/R600ISelLowering.cpp | 4 +++
> lib/Target/R600/SIISelLowering.cpp | 6 +++++
> lib/Target/R600/SIInstructions.td | 22 ++++++++++++++++
> test/CodeGen/R600/llvm.AMDGPU.fract.ll | 47 ++++++++++++++++++++++++++++++----
> 5 files changed, 74 insertions(+), 8 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> index 4707279..62a33fa 100644
> --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -885,9 +885,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
> return LowerIntrinsicIABS(Op, DAG);
> case AMDGPUIntrinsic::AMDGPU_lrp:
> return LowerIntrinsicLRP(Op, DAG);
> - case AMDGPUIntrinsic::AMDGPU_fract:
> - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
> - return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
>
> case AMDGPUIntrinsic::AMDGPU_clamp:
> case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
> diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> index d4f3145..06b7217 100644
> --- a/lib/Target/R600/R600ISelLowering.cpp
> +++ b/lib/Target/R600/R600ISelLowering.cpp
> @@ -837,6 +837,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
> case Intrinsic::AMDGPU_rsq:
> // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
> return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
> +
> + case AMDGPUIntrinsic::AMDGPU_fract:
> + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
> + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
> }
> // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
> break;
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index af38c94..80074c0 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -928,6 +928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
> Op.getOperand(1),
> Op.getOperand(2),
> Op.getOperand(3));
> +
> + case AMDGPUIntrinsic::AMDGPU_fract:
> + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
> + return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
> + DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
> +
> default:
> return AMDGPUTargetLowering::LowerOperation(Op, DAG);
> }
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 5f02a31..1c9e11c 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -3344,6 +3344,28 @@ def : Pat <
> (V_CNDMASK_B32_e64 $src0, $src1, $src2)
> >;
>
> +//===----------------------------------------------------------------------===//
> +// Fract Patterns
> +//===----------------------------------------------------------------------===//
> +
> +let Predicates = [isCI] in {
> +
> +// Convert (x - floor(x)) to fract(x)
> +def : Pat <
> + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
> + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
> + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
> +>;
> +
> +// Convert (x + (-floor(x))) to fract(x)
> +def : Pat <
> + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
> + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
> + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
> +>;
> +
> +} // End Predicates = [isCI]
> +
> //============================================================================//
> // Miscellaneous Optimization Patterns
> //============================================================================//
> diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
> index f4cf7fc..7501b4b 100644
> --- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll
> +++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
> @@ -1,14 +1,19 @@
> -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>
> +declare float @llvm.fabs.f32(float %Val)
> declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
>
> ; Legacy name
> declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
>
> ; FUNC-LABEL: {{^}}fract_f32:
> -; SI: v_fract_f32
> +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
> +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
> +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
> +; GCN: buffer_store_dword [[RESULT]]
> ; EG: FRACT
> define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
> %val = load float, float addrspace(1)* %src, align 4
> @@ -18,7 +23,10 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounw
> }
>
> ; FUNC-LABEL: {{^}}fract_f32_legacy_amdil:
> -; SI: v_fract_f32
> +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
> +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
> +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
> +; GCN: buffer_store_dword [[RESULT]]
> ; EG: FRACT
> define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
> %val = load float, float addrspace(1)* %src, align 4
> @@ -26,3 +34,32 @@ define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)
> store float %fract, float addrspace(1)* %out, align 4
> ret void
> }
> +
> +; FUNC-LABEL: {{^}}fract_f32_neg:
> +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
> +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]]
> +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]]
> +; GCN: buffer_store_dword [[RESULT]]
> +; EG: FRACT
> +define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
> + %val = load float, float addrspace(1)* %src, align 4
> + %neg = fsub float 0.0, %val
> + %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
> + store float %fract, float addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: {{^}}fract_f32_neg_abs:
> +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
> +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
> +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]]
> +; GCN: buffer_store_dword [[RESULT]]
> +; EG: FRACT
> +define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
> + %val = load float, float addrspace(1)* %src, align 4
> + %abs = call float @llvm.fabs.f32(float %val)
> + %neg = fsub float 0.0, %abs
> + %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
> + store float %fract, float addrspace(1)* %out, align 4
> + ret void
> +}
> --
> 2.1.0
>
> From 7400ce890a5d648bc347c1bf7eb996d2f8189fe2 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Sun, 1 Mar 2015 23:39:34 +0100
> Subject: [PATCH 2/2] R600/SI: Use V_FRACT_F64 for faster 64-bit floor on SI
>
> Other f64 opcodes not supported on SI can be lowered in a similar way.
>
> v2: use complex VOP3 patterns
> ---
> lib/Target/R600/SIISelLowering.cpp | 2 +-
> lib/Target/R600/SIInstrInfo.cpp | 20 ++++++++++
> lib/Target/R600/SIInstrInfo.td | 1 +
> lib/Target/R600/SIInstructions.td | 51 ++++++++++++++++++++++++
> test/CodeGen/R600/ffloor.f64.ll | 63 ++++++++++++++++++++----------
> test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll | 60 ++++++++++++++++++++++++++++
> 6 files changed, 175 insertions(+), 22 deletions(-)
> create mode 100644 test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 80074c0..48f5ede 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -202,10 +202,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
> if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
> setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
> setOperationAction(ISD::FCEIL, MVT::f64, Legal);
> - setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
> setOperationAction(ISD::FRINT, MVT::f64, Legal);
> }
>
> + setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
> setOperationAction(ISD::FDIV, MVT::f32, Custom);
> setOperationAction(ISD::FDIV, MVT::f64, Custom);
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 95334c3..61a5057 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -717,6 +717,26 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
> MI->eraseFromParent();
> break;
> }
> +
> + case AMDGPU::V_CNDMASK_B64_PSEUDO: {
> + unsigned Dst = MI->getOperand(0).getReg();
> + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
> + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
> + unsigned Src0 = MI->getOperand(1).getReg();
> + unsigned Src1 = MI->getOperand(2).getReg();
> + const MachineOperand &SrcCond = MI->getOperand(3);
> +
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
> + .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
> + .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
> + .addOperand(SrcCond);
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
> + .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
> + .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
> + .addOperand(SrcCond);
> + MI->eraseFromParent();
> + break;
> + }
> }
> return true;
> }
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index b557b06..902c17d 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -320,6 +320,7 @@ def SIOperand {
>
> def SRCMODS {
> int NONE = 0;
> + int NEG = 1;
> }
>
> def DSTCLAMP {
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 1c9e11c..21f4efd 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -28,6 +28,8 @@ def SendMsgImm : Operand<i32> {
>
> def isGCN : Predicate<"Subtarget->getGeneration() "
> ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
> +def isSI : Predicate<"Subtarget->getGeneration() "
> + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
> def isSICI : Predicate<
> "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
> "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
> @@ -1902,6 +1904,11 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
> //===----------------------------------------------------------------------===//
> let isCodeGenOnly = 1, isPseudo = 1 in {
>
> +// For use in patterns
> +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst),
> + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
> +>;
> +
> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
> // 64-bit vector move instruction. This is mainly used by the SIFoldOperands
> // pass to enable folding of inline immediates.
> @@ -3348,6 +3355,50 @@ def : Pat <
> // Fract Patterns
> //===----------------------------------------------------------------------===//
>
> +let Predicates = [isSI] in {
> +
> +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
> +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
> +// way to implement it is using V_FRACT_F64.
> +// The workaround for the V_FRACT bug is:
> +// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
> +
> +// Convert (x + (-floor(x))) to fract(x)
> +def : Pat <
> + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
> + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
> + (V_CNDMASK_B64_PSEUDO
> + $x,
> + (V_MIN_F64
> + SRCMODS.NONE,
> + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
> + SRCMODS.NONE,
> + (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
> + DSTCLAMP.NONE, DSTOMOD.NONE),
> + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
> +>;
> +
> +// Convert floor(x) to (x - fract(x))
> +def : Pat <
> + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
> + (V_ADD_F64
> + $mods,
> + $x,
> + SRCMODS.NEG,
> + (V_CNDMASK_B64_PSEUDO
> + $x,
> + (V_MIN_F64
> + SRCMODS.NONE,
> + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
> + SRCMODS.NONE,
> + (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
> + DSTCLAMP.NONE, DSTOMOD.NONE),
> + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
> + DSTCLAMP.NONE, DSTOMOD.NONE)
> +>;
> +
> +} // End Predicates = [isSI]
> +
> let Predicates = [isCI] in {
>
> // Convert (x - floor(x)) to fract(x)
> diff --git a/test/CodeGen/R600/ffloor.f64.ll b/test/CodeGen/R600/ffloor.f64.ll
> index 745ad3b..45f8382 100644
> --- a/test/CodeGen/R600/ffloor.f64.ll
> +++ b/test/CodeGen/R600/ffloor.f64.ll
> @@ -1,7 +1,8 @@
> -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
> -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
>
> +declare double @llvm.fabs.f64(double %Val)
> declare double @llvm.floor.f64(double) nounwind readnone
> declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
> declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
> @@ -11,24 +12,11 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
>
> ; FUNC-LABEL: {{^}}ffloor_f64:
> ; CI: v_floor_f64_e32
> -
> -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
> -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
> -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
> -; SI: s_lshr_b64
> -; SI: s_not_b64
> -; SI: s_and_b64
> -; SI: cmp_lt_i32
> -; SI: cndmask_b32
> -; SI: cndmask_b32
> -; SI: cmp_gt_i32
> -; SI: cndmask_b32
> -; SI: cndmask_b32
> -; SI-DAG: v_cmp_lt_f64
> -; SI-DAG: v_cmp_lg_f64
> -; SI-DAG: s_and_b64
> -; SI-DAG: v_cndmask_b32
> -; SI-DAG: v_cndmask_b32
> +; SI: v_fract_f64_e32
> +; SI: v_min_f64
> +; SI: v_cmp_class_f64_e64
> +; SI: v_cndmask_b32_e64
> +; SI: v_cndmask_b32_e64
> ; SI: v_add_f64
> ; SI: s_endpgm
> define void @ffloor_f64(double addrspace(1)* %out, double %x) {
> @@ -37,6 +25,39 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
> ret void
> }
>
> +; FUNC-LABEL: {{^}}ffloor_f64_neg:
> +; CI: v_floor_f64_e64
> +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
> +; SI: v_min_f64
> +; SI: v_cmp_class_f64_e64
> +; SI: v_cndmask_b32_e64
> +; SI: v_cndmask_b32_e64
> +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
> +; SI: s_endpgm
> +define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
> + %neg = fsub double 0.0, %x
> + %y = call double @llvm.floor.f64(double %neg) nounwind readnone
> + store double %y, double addrspace(1)* %out
> + ret void
> +}
> +
> +; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
> +; CI: v_floor_f64_e64
> +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
> +; SI: v_min_f64
> +; SI: v_cmp_class_f64_e64
> +; SI: v_cndmask_b32_e64
> +; SI: v_cndmask_b32_e64
> +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
> +; SI: s_endpgm
> +define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
> + %abs = call double @llvm.fabs.f64(double %x)
> + %neg = fsub double 0.0, %abs
> + %y = call double @llvm.floor.f64(double %neg) nounwind readnone
> + store double %y, double addrspace(1)* %out
> + ret void
> +}
> +
> ; FUNC-LABEL: {{^}}ffloor_v2f64:
> ; CI: v_floor_f64_e32
> ; CI: v_floor_f64_e32
> diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll
> new file mode 100644
> index 0000000..e098dd3
> --- /dev/null
> +++ b/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll
> @@ -0,0 +1,60 @@
> +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
> +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
> +
> +declare double @llvm.fabs.f64(double %Val)
> +declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone
> +
> +; FUNC-LABEL: {{^}}fract_f64:
> +; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
> +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
> +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
> +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
> +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
> +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
> +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
> +; CI: buffer_store_dwordx2 [[FRC]]
> +define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
> + %val = load double, double addrspace(1)* %src, align 4
> + %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone
> + store double %fract, double addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: {{^}}fract_f64_neg:
> +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
> +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
> +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
> +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
> +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
> +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
> +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
> +; CI: buffer_store_dwordx2 [[FRC]]
> +define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
> + %val = load double, double addrspace(1)* %src, align 4
> + %neg = fsub double 0.0, %val
> + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
> + store double %fract, double addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: {{^}}fract_f64_neg_abs:
> +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
> +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
> +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
> +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
> +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
> +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
> +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
> +; CI: buffer_store_dwordx2 [[FRC]]
> +define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
> + %val = load double, double addrspace(1)* %src, align 4
> + %abs = call double @llvm.fabs.f64(double %val)
> + %neg = fsub double 0.0, %abs
> + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
> + store double %fract, double addrspace(1)* %out, align 4
> + ret void
> +}
> --
> 2.1.0
>
More information about the llvm-commits
mailing list