[llvm] Enable Custom Lowering for fabs.v8f16 on AVX (PR #71730)

Wed Nov 8 12:09:06 PST 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: David Li (david-xl)

<details>
<summary>Changes</summary>

[X86]: Enable custom lowering for fabs.v8f16 on AVX

Currently, custom lowering of fabs.v8f16 requires AVX512FP16, which is too restrictive. For v8f16 fabs lowering, no instructions in AVX512FP16 are needed.  Without the fix, horribly inefficient code is generated without AVX512FP16. Note instcombiner generates calls to intrinsics @llvm.fabs.v8f16 when simplifyping AND <8 x half> operations.

---
Full diff: https://github.com/llvm/llvm-project/pull/71730.diff


2 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+3) 
- (modified) llvm/test/CodeGen/X86/vec_fabs.ll (+41) 


``````````diff

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 22fba5601ccfd38..b3b5a0c1b68ec82 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2238,6 +2238,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
   }
 
+  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX())
+    setOperationAction(ISD::FABS, MVT::v8f16, Custom);
+
   if (!Subtarget.useSoftFloat() &&
       (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
     addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index 982062d8907542a..08364449ab1a378 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX512VL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX512VLDQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VLDQ
 
@@ -111,6 +113,45 @@ define <4 x double> @fabs_v4f64(<4 x double> %p) {
 }
 declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
 
+define <8 x half> @fabs_v8f16(ptr %p) {
+; X86-AVX-LABEL: fabs_v8f16:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl 4(%esp), [[ADDRREG:%.*]]
+; X86-AVX-NEXT:    vmovaps ([[ADDRREG]]), %xmm0
+; X86-AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    retl
+
+; X86-AVX2-LABEL: fabs_v8f16:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    movl 4(%esp), [[REG:%.*]]
+; X86-AVX2-NEXT:    vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-AVX2-NEXT:    vpand ([[REG]]), %xmm0, %xmm0
+; X86-AVX2-NEXT:    retl
+
+; X64-AVX512VL-LABEL: fabs_v8f16:
+; X64-AVX512VL:       # %bb.0:
+; X64-AVX512VL-NEXT:    vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-AVX512VL-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; X64-AVX512VL-NEXT:    retq
+
+; X64-AVX-LABEL: fabs_v8f16:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    retq
+
+; X64-AVX2-LABEL: fabs_v8f16:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-AVX2-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; X64-AVX2-NEXT:    retq
+
+  %v = load <8 x half>, ptr %p, align 16
+  %nnv = call <8 x half> @llvm.fabs.v8f16(<8 x half> %v)
+  ret <8 x half> %nnv
+}
+declare <8 x half> @llvm.fabs.v8f16(<8 x half> %p)
+
 define <8 x float> @fabs_v8f32(<8 x float> %p) {
 ; X86-AVX-LABEL: fabs_v8f32:
 ; X86-AVX:       # %bb.0:

``````````

</details>


https://github.com/llvm/llvm-project/pull/71730