[llvm] r295797 - AMDGPU: Add cvt.pkrtz intrinsic
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 21 16:27:35 PST 2017
Author: arsenm
Date: Tue Feb 21 18:27:34 2017
New Revision: 295797
URL: http://llvm.org/viewvc/llvm-project?rev=295797&view=rev
Log:
AMDGPU: Add cvt.pkrtz intrinsic
Convert llvm.SI.packf16 test uses
Added:
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
llvm/trunk/test/CodeGen/AMDGPU/commute-shifts.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll
llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll
llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll
llvm/trunk/test/Transforms/InstCombine/amdgcn-intrinsics.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Tue Feb 21 18:27:34 2017
@@ -206,6 +206,10 @@ def int_amdgcn_fract : Intrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
>;
+def int_amdgcn_cvt_pkrtz : Intrinsic<
+ [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
+>;
+
def int_amdgcn_class : Intrinsic<
[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]
>;
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Tue Feb 21 18:27:34 2017
@@ -3458,6 +3458,7 @@ const char* AMDGPUTargetLowering::getTar
NODE_NAME_CASE(CVT_F32_UBYTE1)
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
+ NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Tue Feb 21 18:27:34 2017
@@ -318,6 +318,11 @@ enum NodeType : unsigned {
CVT_F32_UBYTE1,
CVT_F32_UBYTE2,
CVT_F32_UBYTE3,
+
+ // Convert two float 32 numbers into a single register holding two packed f16
+ // with round to zero.
+ CVT_PKRTZ_F16_F32,
+
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td Tue Feb 21 18:27:34 2017
@@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2
[SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
>;
+def AMDGPUFPPackOp : SDTypeProfile<1, 2,
+ [SDTCisFP<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -78,6 +82,8 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD:
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+
def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
// out = max(a, b) a and b are floats, where a nan comparison fails.
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Tue Feb 21 18:27:34 2017
@@ -188,6 +188,7 @@ SITargetLowering::SITargetLowering(const
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -2014,6 +2015,23 @@ void SITargetLowering::ReplaceNodeResult
Results.push_back(Res);
return;
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
+ Src0, Src1);
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
+ return;
+ }
+ default:
+ break;
+ }
+ }
default:
break;
}
@@ -2691,10 +2709,6 @@ SDValue SITargetLowering::LowerINTRINSIC
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
- case AMDGPUIntrinsic::SI_packf16:
- if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
- return DAG.getUNDEF(MVT::i32);
- return Op;
case Intrinsic::amdgcn_interp_mov: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
SDValue Glue = M0.getValue(1);
@@ -2811,6 +2825,18 @@ SDValue SITargetLowering::LowerINTRINSIC
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ // FIXME: Stop adding cast if v2f16 legal.
+ EVT VT = Op.getValueType();
+ SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Node);
+ }
+ case AMDGPUIntrinsic::SI_packf16: { // Legacy name
+ EVT VT = Op.getValueType();
+ return DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
@@ -4154,6 +4180,15 @@ SDValue SITargetLowering::performFMed3Co
return SDValue();
}
+SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ if (Src0.isUndef() && Src1.isUndef())
+ return DCI.DAG.getUNDEF(N->getValueType(0));
+ return SDValue();
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -4422,6 +4457,8 @@ SDValue SITargetLowering::PerformDAGComb
return performCvtF32UByteNCombine(N, DCI);
case AMDGPUISD::FMED3:
return performFMed3Combine(N, DCI);
+ case AMDGPUISD::CVT_PKRTZ_F16_F32:
+ return performCvtPkRTZCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Tue Feb 21 18:27:34 2017
@@ -88,6 +88,7 @@ class SITargetLowering final : public AM
SDValue Op0, SDValue Op1) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Tue Feb 21 18:27:34 2017
@@ -1151,6 +1151,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64,
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
Modified: llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td Tue Feb 21 18:27:34 2017
@@ -354,7 +354,7 @@ defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f3
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
Modified: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp Tue Feb 21 18:27:34 2017
@@ -3206,6 +3206,31 @@ Instruction *InstCombiner::visitCallInst
return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
}
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ const fltSemantics &HalfSem
+ = II->getType()->getScalarType()->getFltSemantics();
+ bool LosesInfo;
+ APFloat Val0 = C0->getValueAPF();
+ APFloat Val1 = C1->getValueAPF();
+ Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+ Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+ Constant *Folded = ConstantVector::get({
+ ConstantFP::get(II->getContext(), Val0),
+ ConstantFP::get(II->getContext(), Val1) });
+ return replaceInstUsesWith(*II, Folded);
+ }
+ }
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
Modified: llvm/trunk/test/CodeGen/AMDGPU/commute-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/commute-shifts.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/commute-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/commute-shifts.ll Tue Feb 21 18:27:34 2017
@@ -15,13 +15,13 @@ bb:
%tmp5 = and i32 %tmp2, %tmp4
%tmp6 = icmp eq i32 %tmp5, 0
%tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
- %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
- %tmp9 = bitcast i32 %tmp8 to float
+ %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7)
+ %tmp9 = bitcast <2 x half> %tmp8 to float
ret float %tmp9
}
declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare i32 @llvm.SI.packf16(float, float) #1
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll?rev=295797&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll Tue Feb 21 18:27:34 2017
@@ -0,0 +1,163 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: s_load_dword [[Y:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
+define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
+define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_undef_undef:
+; GCN-NEXT: ; BB#0
+; GCN-NEXT: s_endpgm
+define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]]
+define void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0
+define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; VI: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]]
+define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]]
+define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]]
+define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]]
+define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %neg.fabs.a = fsub float -0.0, %fabs.a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll Tue Feb 21 18:27:34 2017
@@ -204,11 +204,9 @@ main_body:
%tmp = call float @llvm.fabs.f32(float %p2.i)
%tmp34 = call float @llvm.fabs.f32(float %p2.i12)
%tmp35 = call float @llvm.fabs.f32(float %p2.i6)
- %tmp36 = call i32 @llvm.SI.packf16(float %tmp, float %tmp34)
- %tmp37 = bitcast i32 %tmp36 to <2 x half>
- %tmp38 = call i32 @llvm.SI.packf16(float %tmp35, float 1.000000e+00)
- %tmp39 = bitcast i32 %tmp38 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 true) #0
+ %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp, float %tmp34)
+ %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp35, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 true) #0
ret void
}
@@ -218,7 +216,7 @@ declare float @llvm.amdgcn.interp.p2(flo
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
-declare i32 @llvm.SI.packf16(float, float) #1
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll Tue Feb 21 18:27:34 2017
@@ -164,11 +164,9 @@ ENDIF24:
%tmp110 = fmul float %tmp109, %tmp106
%tmp111 = fsub float -0.000000e+00, %tmp105
%tmp112 = fmul float %tmp111, %tmp106
- %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110)
- %tmp114 = bitcast i32 %tmp113 to <2 x half>
- %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00)
- %tmp116 = bitcast i32 %tmp115 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp114, <2 x half> %tmp116, i1 true, i1 true) #0
+ %tmp113 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp108, float %tmp110)
+ %tmp115 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp112, float 1.000000e+00)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp113, <2 x half> %tmp115, i1 true, i1 true) #0
ret void
}
@@ -388,9 +386,8 @@ bb:
%tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
%tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp10 = extractelement <4 x float> %tmp9, i32 0
- %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
- %tmp13 = bitcast i32 %tmp12 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0
+ %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
ret void
}
@@ -404,9 +401,8 @@ bb:
%tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
%tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp10 = extractelement <4 x float> %tmp9, i32 0
- %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
- %tmp13 = bitcast i32 %tmp12 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0
+ %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
ret void
}
@@ -419,8 +415,8 @@ declare float @llvm.amdgcn.interp.p1(flo
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
-declare i32 @llvm.SI.packf16(float, float) #1
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-scheduler.ll Tue Feb 21 18:27:34 2017
@@ -45,11 +45,9 @@ main_body:
%tmp33 = extractelement <4 x float> %tmp31, i32 1
%tmp34 = extractelement <4 x float> %tmp31, i32 2
%tmp35 = extractelement <4 x float> %tmp31, i32 3
- %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33)
- %tmp37 = bitcast i32 %tmp36 to <2 x half>
- %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35)
- %tmp39 = bitcast i32 %tmp38 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 false) #0
+ %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp32, float %tmp33)
+ %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp34, float %tmp35)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 false) #0
ret void
}
@@ -58,7 +56,7 @@ declare float @llvm.amdgcn.interp.p2(flo
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare i32 @llvm.SI.packf16(float, float) #1
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll Tue Feb 21 18:27:34 2017
@@ -731,11 +731,9 @@ IF67:
%tmp578 = fadd float %tmp577, %tmp554
%tmp579 = fmul float %tmp574, %tmp45
%tmp580 = fadd float %tmp579, %tmp556
- %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578)
- %tmp582 = bitcast i32 %tmp581 to <2 x half>
- %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282)
- %tmp584 = bitcast i32 %tmp583 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp582, <2 x half> %tmp584, i1 true, i1 true) #0
+ %tmp581 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp576, float %tmp578)
+ %tmp583 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp580, float %tmp282)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp581, <2 x half> %tmp583, i1 true, i1 true) #0
ret void
ENDIF66: ; preds = %LOOP65
@@ -1813,11 +1811,9 @@ ENDIF209:
%tmp774 = fadd float %tmp773, %tmp52
%max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00)
%clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
- %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770)
- %tmp777 = bitcast i32 %tmp776 to <2 x half>
- %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %clamp.i2)
- %tmp779 = bitcast i32 %tmp778 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp777, <2 x half> %tmp779, i1 true, i1 true) #0
+ %tmp776 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp768, float %tmp770)
+ %tmp778 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp772, float %clamp.i2)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp776, <2 x half> %tmp778, i1 true, i1 true) #0
ret void
ELSE214: ; preds = %ELSE211
@@ -1849,13 +1845,13 @@ declare float @llvm.amdgcn.interp.p2(flo
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-declare i32 @llvm.SI.packf16(float, float) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll Tue Feb 21 18:27:34 2017
@@ -105,9 +105,8 @@ else:
endif: ; preds = %else, %if
%export = phi float [ %lds_data, %if ], [ %interp, %else ]
- %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export)
- %tmp5 = bitcast i32 %tmp4 to float
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp5, float %tmp5, float %tmp5, float %tmp5, i1 true, i1 true) #0
+ %tmp4 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %export, float %export)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp4, <2 x half> %tmp4, i1 true, i1 true) #0
ret void
}
@@ -207,7 +206,8 @@ ret:
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-declare i32 @llvm.SI.packf16(float, float) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll Tue Feb 21 18:27:34 2017
@@ -23,17 +23,16 @@ bb3:
%tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
%tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tmp10 = extractelement <4 x float> %tmp9, i32 0
- %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
- %tmp13 = bitcast i32 %tmp12 to <2 x half>
- call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0
+ %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
ret void
}
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare i32 @llvm.SI.packf16(float, float) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/Transforms/InstCombine/amdgcn-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/amdgcn-intrinsics.ll?rev=295797&r1=295796&r2=295797&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/amdgcn-intrinsics.ll (original)
+++ llvm/trunk/test/Transforms/InstCombine/amdgcn-intrinsics.ll Tue Feb 21 18:27:34 2017
@@ -633,3 +633,73 @@ define float @cos_fabs_fneg_f32(float %x
%cos = call float @llvm.amdgcn.cos.f32(float %x.fabs.fneg)
ret float %cos
}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pkrtz
+; --------------------------------------------------------------------
+
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone
+
+; CHECK-LABEL: @vars_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %y)
+define <2 x half> @constant_lhs_cvt_pkrtz(float %y) {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_rhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.000000e+00)
+define <2 x half> @constant_rhs_cvt_pkrtz(float %x) {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y)
+define <2 x half> @undef_lhs_cvt_pkrtz(float %y) {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef)
+define <2 x half> @undef_rhs_cvt_pkrtz(float %x) {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pkrtz(
+; CHECK: ret <2 x half> undef
+define <2 x half> @undef_cvt_pkrtz() {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_splat0_cvt_pkrtz(
+; CHECK: ret <2 x half> zeroinitializer
+define <2 x half> @constant_splat0_cvt_pkrtz() {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0)
+ ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_cvt_pkrtz(
+; CHECK: ret <2 x half> <half 0xH4000, half 0xH4400>
+define <2 x half> @constant_cvt_pkrtz() {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0)
+ ret <2 x half> %cvt
+}
+
+; Test constant values where rtz changes result
+; CHECK-LABEL: @constant_rtz_pkrtz(
+; CHECK: ret <2 x half> <half 0xH7BFF, half 0xH7BFF>
+define <2 x half> @constant_rtz_pkrtz() {
+ %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0)
+ ret <2 x half> %cvt
+}
More information about the llvm-commits
mailing list