[PATCH] R600/SI: Fix div_scale intrinsic
Tom Stellard
tom at stellard.net
Mon Jun 23 09:07:50 PDT 2014
On Mon, Jun 23, 2014 at 12:08:47AM +0000, Matt Arsenault wrote:
> The choice of 1st operand does matter, so add it back to the intrinsic. Implement selection for it, and add verifier check for operand restriction that the first operand must equal one of the other 2.
>
LGTM.
> http://reviews.llvm.org/D4247
>
> Files:
> include/llvm/IR/IntrinsicsR600.td
> lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> lib/Target/R600/AMDGPUISelLowering.cpp
> lib/Target/R600/SIInstrInfo.cpp
> lib/Target/R600/SIInstrInfo.td
> lib/Target/R600/SIInstructions.td
> test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> Index: include/llvm/IR/IntrinsicsR600.td
> ===================================================================
> --- include/llvm/IR/IntrinsicsR600.td
> +++ include/llvm/IR/IntrinsicsR600.td
> @@ -38,8 +38,12 @@
>
> let TargetPrefix = "AMDGPU" in {
> def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">,
> + // 1st parameter: Numerator
> + // 2nd parameter: Denominator
> + // 3rd parameter: Constant to select select between first and
> + // second. (0 = first, 1 = second).
> Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
> - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
> + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
> [IntrNoMem]>;
>
> def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">,
> Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> +++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> @@ -85,6 +85,8 @@
> bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
> bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
>
> + SDNode *SelectDIV_SCALE(SDNode *N);
> +
> // Include the pieces autogenerated from the target description.
> #include "AMDGPUGenDAGISel.inc"
> };
> @@ -489,6 +491,9 @@
> PackedOffsetWidth);
>
> }
> + case AMDGPUISD::DIV_SCALE: {
> + return SelectDIV_SCALE(N);
> + }
> }
> return SelectCode(N);
> }
> @@ -682,6 +687,31 @@
> return true;
> }
>
> +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
> + SDLoc SL(N);
> + EVT VT = N->getValueType(0);
> +
> + assert(VT == MVT::f32 || VT == MVT::f64);
> +
> + unsigned Opc
> + = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
> +
> + const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
> +
> + SDValue Ops[] = {
> + N->getOperand(0),
> + N->getOperand(1),
> + N->getOperand(2),
> + Zero,
> + Zero,
> + Zero,
> + Zero
> + };
> +
> + return CurDAG->SelectNodeTo(N, Opc, MVT::f64, MVT::i1, Ops);
> +}
> +
> +
> void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
> const AMDGPUTargetLowering& Lowering =
> *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -849,9 +849,21 @@
> return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
> Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
>
> - case Intrinsic::AMDGPU_div_scale:
> + case Intrinsic::AMDGPU_div_scale: {
> + // 3rd parameter required to be a constant.
> + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
> + if (!Param)
> + return DAG.getUNDEF(VT);
> +
> + // Translate to the operands expected by the machine instruction. The
> + // first parameter must be the same as the first instruction.
> + SDValue Numerator = Op.getOperand(1);
> + SDValue Denominator = Op.getOperand(2);
> + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
> +
> return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
> - Op.getOperand(1), Op.getOperand(2));
> + Src0, Denominator, Numerator);
> + }
>
> case Intrinsic::AMDGPU_div_fmas:
> return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
> Index: lib/Target/R600/SIInstrInfo.cpp
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.cpp
> +++ lib/Target/R600/SIInstrInfo.cpp
> @@ -524,6 +524,23 @@
> return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
> }
>
> +static bool compareMachineOp(const MachineOperand &Op0,
> + const MachineOperand &Op1) {
> + if (Op0.getType() != Op1.getType())
> + return false;
> +
> + switch (Op0.getType()) {
> + case MachineOperand::MO_Register:
> + return Op0.getReg() == Op1.getReg();
> + case MachineOperand::MO_Immediate:
> + return Op0.getImm() == Op1.getImm();
> + case MachineOperand::MO_FPImmediate:
> + return Op0.getFPImm() == Op1.getFPImm();
> + default:
> + llvm_unreachable("Didn't expect to be comparing these operand types");
> + }
> +}
> +
> bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
> StringRef &ErrInfo) const {
> uint16_t Opcode = MI->getOpcode();
> @@ -630,6 +647,24 @@
> return false;
> }
> }
> +
> + // Verify misc. restrictions on specific instructions.
> + if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
> + Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
> + MI->dump();
> +
> + const MachineOperand &Src0 = MI->getOperand(2);
> + const MachineOperand &Src1 = MI->getOperand(3);
> + const MachineOperand &Src2 = MI->getOperand(4);
> + if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
> + if (!compareMachineOp(Src0, Src1) &&
> + !compareMachineOp(Src0, Src2)) {
> + ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
> + return false;
> + }
> + }
> + }
> +
> return true;
> }
>
> Index: lib/Target/R600/SIInstrInfo.td
> ===================================================================
> --- lib/Target/R600/SIInstrInfo.td
> +++ lib/Target/R600/SIInstrInfo.td
> @@ -433,6 +433,22 @@
> opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
> >, VOP <opName>;
>
> +
> +class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
> + string opName, list<dag> pattern> : VOP3 <
> + op, (outs vrc:$dst0, SReg_64:$dst1),
> + (ins arc:$src0, arc:$src1, arc:$src2,
> + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
> + opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
> +>, VOP <opName>;
> +
> +
> +class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
> + VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
> +
> +class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
> + VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
> +
> //===----------------------------------------------------------------------===//
> // Vector I/O classes
> //===----------------------------------------------------------------------===//
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -1459,8 +1459,10 @@
>
> } // isCommutable = 1
>
> -defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
> -def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
> +def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
> +
> +// Double precision division pre-scale.
> +def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
>
> defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
> [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
> Index: test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> @@ -1,41 +1,48 @@
> -; XFAIL: *
> ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
>
> -declare float @llvm.AMDGPU.div.scale.f32(float, float, float) nounwind readnone
> -declare double @llvm.AMDGPU.div.scale.f64(double, double, double) nounwind readnone
> +declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
> +declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
>
> ; SI-LABEL @test_div_scale_f32_1:
> +; SI: V_DIV_SCALE_F32
> define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
> %a = load float addrspace(1)* %aptr, align 4
> %b = load float addrspace(1)* %bptr, align 4
> - %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b, float %a) nounwind readnone
> - store float %result, float addrspace(1)* %out, align 4
> + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
> + %result0 = extractvalue { float, i1 } %result, 0
> + store float %result0, float addrspace(1)* %out, align 4
> ret void
> }
>
> ; SI-LABEL @test_div_scale_f32_2:
> +; SI: V_DIV_SCALE_F32
> define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
> %a = load float addrspace(1)* %aptr, align 4
> %b = load float addrspace(1)* %bptr, align 4
> - %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b, float %b) nounwind readnone
> - store float %result, float addrspace(1)* %out, align 4
> + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
> + %result0 = extractvalue { float, i1 } %result, 0
> + store float %result0, float addrspace(1)* %out, align 4
> ret void
> }
>
> ; SI-LABEL @test_div_scale_f64_1:
> +; SI: V_DIV_SCALE_F64
> define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
> %a = load double addrspace(1)* %aptr, align 8
> %b = load double addrspace(1)* %bptr, align 8
> - %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b, double %a) nounwind readnone
> - store double %result, double addrspace(1)* %out, align 8
> + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
> + %result0 = extractvalue { double, i1 } %result, 0
> + store double %result0, double addrspace(1)* %out, align 8
> ret void
> }
>
> ; SI-LABEL @test_div_scale_f64_1:
> +; SI: V_DIV_SCALE_F64
> define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
> %a = load double addrspace(1)* %aptr, align 8
> %b = load double addrspace(1)* %bptr, align 8
> - %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b, double %b) nounwind readnone
> - store double %result, double addrspace(1)* %out, align 8
> + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
> + %result0 = extractvalue { double, i1 } %result, 0
> + store double %result0, double addrspace(1)* %out, align 8
> ret void
> }
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list