[llvm] 68f34e4 - [AArch64] Add tablegen patterns for faddp of two extracts

Sun Jun 18 23:48:36 PDT 2023

Author: David Green
Date: 2023-06-19T07:48:31+01:00
New Revision: 68f34e4d39690cb22f11269a128cae420feaf212

URL: https://github.com/llvm/llvm-project/commit/68f34e4d39690cb22f11269a128cae420feaf212
DIFF: https://github.com/llvm/llvm-project/commit/68f34e4d39690cb22f11269a128cae420feaf212.diff

LOG: [AArch64] Add tablegen patterns for faddp of two extracts

This adds some simple tablegen patterns for converting
`faddp v2f32 extractlow(Rn), v2f32 extracthigh(Rn)` to
`faddp v4f32 Rn, v4f32 Rn` using the q variants of the
instructions, avoiding the extra ext needed to extract
the high lanes. Only the bottom lanes of the new faddp
are used; the second Rn operand is just a placeholder.
It uses Rn to prevent any false dependencies, but could
equally be undef.

Differential Revision: https://reviews.llvm.org/D152245

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/faddp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 29d023bf2570e..9c1dc83a3e1a1 100644

--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8517,6 +8517,15 @@ def : Pat<(any_fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)),
                     (vector_extract (v8f16 FPR128:$Rn), (i64 1))),
           (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
 
+// Prefer using the bottom lanes of faddp Rn, Rn compared to
+// faddp extractlow(Rn), extracthigh(Rn)
+def : Pat<(AArch64faddp (v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 0))),
+                        (v2f32 (extract_subvector (v4f32 FPR128:$Rn), (i64 2)))),
+          (v2f32 (EXTRACT_SUBREG (FADDPv4f32 $Rn, $Rn), dsub))>;
+def : Pat<(AArch64faddp (v4f16 (extract_subvector (v8f16 FPR128:$Rn), (i64 0))),
+                        (v4f16 (extract_subvector (v8f16 FPR128:$Rn), (i64 4)))),
+          (v4f16 (EXTRACT_SUBREG (FADDPv8f16 $Rn, $Rn), dsub))>;
+
 // Scalar 64-bit shifts in FPR64 registers.
 def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
           (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;

diff  --git a/llvm/test/CodeGen/AArch64/faddp.ll b/llvm/test/CodeGen/AArch64/faddp.ll
index 2da2e8f699d75..d933d7fb7554d 100644
--- a/llvm/test/CodeGen/AArch64/faddp.ll
+++ b/llvm/test/CodeGen/AArch64/faddp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple aarch64 < %s | FileCheck %s
+; RUN: llc --mtriple aarch64 -mattr=+fullfp16 < %s | FileCheck %s
 
 define float @faddp_2xfloat(<2 x float> %a) {
 ; CHECK-LABEL: faddp_2xfloat:
@@ -256,6 +256,39 @@ entry:
   ret <16 x float> %b
 }
 
+define float @faddp_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: faddp_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    faddp s0, v0.2s
+; CHECK-NEXT:    ret
+  %1 = fadd <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+  %3 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+  %4 = tail call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %2, <2 x float> %3)
+  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+  %6 = fadd <2 x float> %4, %5
+  %7 = extractelement <2 x float> %6, i64 0
+  ret float %7
+}
+
+define <4 x half> @faddp_v8f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: faddp_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    faddp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %1 = fadd <8 x half> %a, %b
+  %2 = shufflevector <8 x half> %1, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = shufflevector <8 x half> %1, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %4 = tail call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> %2, <4 x half> %3)
+  ret <4 x half> %4
+}
+
+declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>)
+declare <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half>, <4 x half>)
 
 attributes #0 = { strictfp }