[llvm] c08d556 - AMDGPU: Fix creating illegal f16 fp_class

Tue Nov 29 15:37:08 PST 2022

Author: Matt Arsenault
Date: 2022-11-29T18:24:30-05:00
New Revision: c08d55623d209070883b420e70b070009d447bca

URL: https://github.com/llvm/llvm-project/commit/c08d55623d209070883b420e70b070009d447bca
DIFF: https://github.com/llvm/llvm-project/commit/c08d55623d209070883b420e70b070009d447bca.diff

LOG: AMDGPU: Fix creating illegal f16 fp_class

We were missing legality checks. The device library build was broken
for targets without f16 support. Technically the first pattern isn't
tested by this patch; it only triggers if the isBeforeLegalize check
in performAndCombine is removed. I'm not sure how to trick this into
appearing post-legalization.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/fp-classify.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d38ae3dee7bf4..dac4ba708ee79 100644

--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9709,7 +9709,8 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
 
     SDValue X = LHS.getOperand(0);
     SDValue Y = RHS.getOperand(0);
-    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
+    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
+        !isTypeLegal(X.getValueType()))
       return SDValue();
 
     if (LCC == ISD::SETO) {
@@ -11471,8 +11472,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
     }
   }
 
-  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
-                                           VT != MVT::f16))
+  if (VT != MVT::f32 && VT != MVT::f64 &&
+      (!Subtarget->has16BitInsts() || VT != MVT::f16))
     return SDValue();
 
   // Match isinf/isfinite pattern

diff  --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 32dd8bfc2de22..447e22dce3a2b 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -180,5 +180,68 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
   ret void
 }
 
+; GCN-LABEL: {{^}}test_isinf_pattern_f16:
+; SI-DAG: s_mov_b32 [[INF:s[0-9]+]], 0x7f800000
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |s{{[0-9]+}}|
+; SI: v_cmp_eq_f32_e32 vcc, [[INF]], [[CVT]]
+; SI-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
+
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}}
+; VI: v_cmp_class_f16_e32 vcc, s{{[0-9]+}}, [[MASK]]
+; VI-NOT: v_cmp
+
+; GCN: s_endpgm
+define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
+  %fabs = tail call half @llvm.fabs.f16(half %x) #1
+  %cmp = fcmp oeq half %fabs, 0xH7C00
+  %ext = zext i1 %cmp to i32
+  store i32 %ext, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_isfinite_pattern_0_f16:
+; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1f8
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
+; SI: v_cmp_class_f32_e64 [[CLASS:s\[[0-9]+:[0-9]+\]]], [[CVT]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CLASS]]
+
+; VI-NOT: v_cmp
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}}
+; VI: v_cmp_class_f16_e32 vcc, s{{[0-9]+}}, [[MASK]]
+; VI-NOT: v_cmp
+
+; GCN: s_endpgm
+define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
+  %ord = fcmp ord half %x, 0.0
+  %x.fabs = tail call half @llvm.fabs.f16(half %x) #1
+  %ninf = fcmp une half %x.fabs, 0xH7C00
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_isfinite_pattern_4_f16:
+; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1f8
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], s{{[0-9]+}}
+; SI: v_cmp_class_f32_e64 [[CLASS:s\[[0-9]+:[0-9]+\]]], [[CVT]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CLASS]]
+
+; VI-DAG: s_load_dword [[X:s[0-9]+]]
+; VI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8
+; VI: v_cmp_class_f16_e32 vcc, [[X]], [[MASK]]
+; VI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
+define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
+  %ord = fcmp ord half %x, 0.0
+  %x.fabs = tail call half @llvm.fabs.f16(half %x) #1
+  %ninf = fcmp one half %x.fabs, 0xH7C00
+  %and = and i1 %ord, %ninf
+  %ext = zext i1 %and to i32
+  store i32 %ext, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }