[llvm] r339512 - AMDGPU: Use splat vectors for undefs when folding canonicalize

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 12 01:42:54 PDT 2018


Author: arsenm
Date: Sun Aug 12 01:42:54 2018
New Revision: 339512

URL: http://llvm.org/viewvc/llvm-project?rev=339512&view=rev
Log:
AMDGPU: Use splat vectors for undefs when folding canonicalize

If one of the elements is undef, use the canonicalized constant
from the other element instead of 0.

Splat vectors are more useful for other optimizations, such
as matching vector clamps. This was breaking on clamps
of half3 from the undef 4th component.

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/clamp.ll
    llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=339512&r1=339511&r2=339512&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Sun Aug 12 01:42:54 2018
@@ -6989,27 +6989,42 @@ SDValue SITargetLowering::performFCanoni
 
   // TODO: This could be better with wider vectors that will be split to v2f16,
   // and to consider uses since there aren't that many packed operations.
-  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
+      isTypeLegal(MVT::v2f16)) {
     SDLoc SL(N);
     SDValue NewElts[2];
     SDValue Lo = N0.getOperand(0);
     SDValue Hi = N0.getOperand(1);
+    EVT EltVT = Lo.getValueType();
+
     if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
       for (unsigned I = 0; I != 2; ++I) {
         SDValue Op = N0.getOperand(I);
-        EVT EltVT = Op.getValueType();
         if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
           NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                               CFP->getValueAPF());
         } else if (Op.isUndef()) {
-          // This would ordinarily be folded to a qNaN. Since this may be half
-          // of a packed operation, it may be cheaper to use a 0.
-          NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
+          // Handled below based on what the other operand is.
+          NewElts[I] = Op;
         } else {
           NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
         }
       }
 
+      // If one half is undef, and one is constant, prefer a splat vector rather
+      // than the normal qNaN. If it's a register, prefer 0.0 since that's
+      // cheaper to use and may be free with a packed operation.
+      if (NewElts[0].isUndef()) {
+        if (isa<ConstantFPSDNode>(NewElts[1]))
+          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
+            NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
+      }
+
+      if (NewElts[1].isUndef()) {
+        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
+          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
+      }
+
       return DAG.getBuildVector(VT, SL, NewElts);
     }
   }

Modified: llvm/trunk/test/CodeGen/AMDGPU/clamp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/clamp.ll?rev=339512&r1=339511&r2=339512&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/clamp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/clamp.ll Sun Aug 12 01:42:54 2018
@@ -688,6 +688,38 @@ define amdgpu_kernel void @v_clamp_v2f16
   ret void
 }
 
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
+  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
+
+  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
+  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
+
+  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
 ; GCN: v_add_f32_e32 [[A:v[0-9]+]]
 ; GCN: v_add_f32_e32 [[B:v[0-9]+]]

Modified: llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll?rev=339512&r1=339511&r2=339512&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fcanonicalize.f16.ll Sun Aug 12 01:42:54 2018
@@ -565,19 +565,70 @@ define <2 x half> @v_test_canonicalize_r
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_setpc_b64
+; GFX89: s_waitcnt
+; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
   %vec = insertelement <2 x half> undef, half %val, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
+
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 1.0
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 1.0, i32 1
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 1.0
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 1.0, i32 0
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 16.0, i32 1
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 16.0, i32 0
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
 ; GFX9: s_waitcnt




More information about the llvm-commits mailing list