[PATCH] D109001: [AArch64] Fold an sqadd of a sqdmull at lane 0 into an sqdmlal

Tue Aug 31 08:36:39 PDT 2021

samtebbs created this revision.
samtebbs added reviewers: dmgreen, SjoerdMeijer, NickGuy.
Herald added subscribers: hiraditya, kristof.beyls.
samtebbs requested review of this revision.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

This patch folds an sqadd of the form (sqadd i32, (vector_extract (sqdmull v4i16, v4i16), 0)) into an sqdmlal. We already generate an sqdmlal for lanes greater than 0; this patch adds a pattern that emits an sqdmlal of the same format for lane 0, which is necessary because the existing pattern doesn't match that case.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D109001

Files:
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll


Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================

--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -52,6 +52,8 @@
 
 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
 
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) #1
+
 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
 
 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
@@ -3235,6 +3237,25 @@
   ret <4 x i32> %vqdmlal4.i
 }
 
+define i32 @test_vqdmlal_lane_s16_0_i32(i32 %a, i16 %b, <4 x i16> %c)  {
+; CHECK-LABEL: test_vqdmlal_lane_s16_0_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, w1
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    sqdmlal v2.4s, v1.4h, v0.h[0]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    ret
+entry:
+  %0 = insertelement <4 x i16> undef, i16 %b, i64 0
+  %1 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %vqdmlXl = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %0, <4 x i16> %1)
+  %lane0 = extractelement <4 x i32> %vqdmlXl, i64 0
+  %vqdmlXl1 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %lane0)
+  ret i32 %vqdmlXl1
+}
+
+
 define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s32_0:
 ; CHECK:       // %bb.0: // %entry
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1201,6 +1201,17 @@
             (ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
   def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
             (ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;
+
+  def : Pat<(i32 (int_aarch64_neon_sqadd (i32 FPR32Op:$Rd),
+                  (i32 (vector_extract (v4i32 (int_aarch64_neon_sqdmull
+                    (v4i16 V64:$Rm),
+                    (v4i16 V64:$Rn))),
+                    (i64 0))))),
+            (EXTRACT_SUBREG (SQDMLALv4i16_indexed
+              (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub),
+              V64:$Rm, (INSERT_SUBREG
+                (v8i16 (IMPLICIT_DEF)), V64:$Rn, dsub),
+              (i64 0)), ssub)>;
 } // End HasSVE
 
 let Predicates = [HasSVEorStreamingSVE] in {


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D109001.369711.patch
Type: text/x-patch
Size: 2491 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210831/f53558a7/attachment.bin>