[llvm] r339072 - AMDGPU: Push fcanonicalize through partially constant build_vector
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 6 15:30:44 PDT 2018
Author: arsenm
Date: Mon Aug 6 15:30:44 2018
New Revision: 339072
URL: http://llvm.org/viewvc/llvm-project?rev=339072&view=rev
Log:
AMDGPU: Push fcanonicalize through partially constant build_vector
This usually avoids some re-packing code, and may
help find canonical sources.
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=339072&r1=339071&r2=339072&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Mon Aug 6 15:30:44 2018
@@ -6879,15 +6879,19 @@ SDValue SITargetLowering::getCanonicalCo
return DAG.getConstantFP(C, SL, VT);
}
+static bool vectorEltWillFoldAway(SDValue Op) {
+ return Op.isUndef() || isa<ConstantFPSDNode>(Op);
+}
+
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
// fcanonicalize undef -> qnan
if (N0.isUndef()) {
- EVT VT = N->getValueType(0);
APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
return DAG.getConstantFP(QNaN, SDLoc(N), VT);
}
@@ -6897,6 +6901,38 @@ SDValue SITargetLowering::performFCanoni
return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
}
+ // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
+ // (fcanonicalize k)
+ //
+ // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
+
+ // TODO: This could be better with wider vectors that will be split to v2f16,
+ // and to consider uses since there aren't that many packed operations.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
+ SDLoc SL(N);
+ SDValue NewElts[2];
+ SDValue Lo = N0.getOperand(0);
+ SDValue Hi = N0.getOperand(1);
+ if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
+ for (unsigned I = 0; I != 2; ++I) {
+ SDValue Op = N0.getOperand(I);
+ EVT EltVT = Op.getValueType();
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
+ CFP->getValueAPF());
+ } else if (Op.isUndef()) {
+ // This would ordinarily be folded to a qNaN. Since this may be half
+ // of a packed operation, it may be cheaper to use a 0.
+ NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
+ } else {
+ NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
+ }
+ }
+
+ return DAG.getBuildVector(VT, SL, NewElts);
+ }
+ }
+
return isCanonicalized(DAG, N0) ? N0 : SDValue();
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll?rev=339072&r1=339071&r2=339072&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll Mon Aug 6 15:30:44 2018
@@ -10,6 +10,14 @@ declare <3 x half> @llvm.canonicalize.v3
declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
+; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
+; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
+; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
+ %canonicalized = call half @llvm.canonicalize.f16(half undef)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
@@ -34,6 +42,21 @@ define amdgpu_kernel void @s_test_canoni
ret void
}
+; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
+; GFX9: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+
+; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_max_f16_e32 v0, v0, v0
+; VI: v_or_b32_e32 v0, v0, v1
+define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
+ %ins0 = insertelement <2 x half> undef, half %lo, i32 0
+ %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
+ ret <2 x half> %canonicalized
+}
+
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
@@ -515,6 +538,156 @@ define <4 x half> @v_test_canonicalize_v
ret <4 x half> %canonicalized
}
+; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: s_setpc_b64
+
+; High bits known zero
+; FIXME: Should also be true on gfx9 by default?
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
+ %vec = insertelement <2 x half> undef, half %val, i32 0
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
+; GFX9-NEXT: s_setpc_b64
+
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
+ %vec = insertelement <2 x half> undef, half %val, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
+; GFX9: s_waitcnt
+; GFX9-DAG: v_max_f16_e32 v0, v0, v0
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
+; GFX9: v_and_b32_e32 v0, 0xffff, v0
+; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
+; GFX9: s_setpc_b64
+
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
+; VI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
+ %vec0 = insertelement <2 x half> undef, half %val, i32 0
+ %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
+; GFX9: v_max_f16_e32 v0, v0, v0
+; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
+; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
+; GFX9: s_setpc_b64
+
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
+; VI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
+ %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
+ %vec1 = insertelement <2 x half> %vec0, half %val, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16:
+; GCN: v_mov_b32_e32 v0, 0x7e007e00
+; GCN: v_mov_b32_e32 v1, v0
+define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 {
+ %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+ store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_setpc_b64
+
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
+; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT: s_setpc_b64
+define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
+ %vec = insertelement <4 x half> undef, half %val, i32 0
+ %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
+ ret <4 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_setpc_b64
+
+; VI: s_waitcnt
+; VI-DAG: v_max_f16_e32 v0, v0, v0
+; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT: s_setpc_b64
+define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
+ %vec0 = insertelement <4 x half> undef, half %val0, i32 0
+ %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
+ %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
+ ret <4 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX9-NEXT: v_and_b32_e32 v1, [[MASK]], v1
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v0, [[MASK]], v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT: s_setpc_b64
+
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: v_max_f16_e32 v1, v1, v1
+; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: s_setpc_b64
+define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
+ %vec0 = insertelement <4 x half> undef, half %val0, i32 0
+ %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
+ %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
+ %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
+ ret <4 x half> %canonicalized
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "target-features"="-fp64-fp16-denormals" }
More information about the llvm-commits mailing list