[llvm] e38392b - [AArch64] Transform add(x, abs(y)) -> saba(x, y, 0) (#156615)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 8 06:14:28 PDT 2025


Author: Hari Limaye
Date: 2025-09-08T14:14:24+01:00
New Revision: e38392b19b3989222b1d2b248069f3fb36bfea7a

URL: https://github.com/llvm/llvm-project/commit/e38392b19b3989222b1d2b248069f3fb36bfea7a
DIFF: https://github.com/llvm/llvm-project/commit/e38392b19b3989222b1d2b248069f3fb36bfea7a.diff

LOG: [AArch64] Transform add(x, abs(y)) -> saba(x, y, 0) (#156615)

Add instruction selection patterns to perform the following transformations (illustrated below):
- add(x, abs(y)) -> saba(x, y, 0)
- add(x, zext(abs(y))) -> sabal(x, y, 0)
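
For example, IR of the following shape (a minimal sketch with an illustrative
function name; it mirrors the saba_abs_zeros_4s test added below) is now
selected to a single SABA against a zero vector instead of a separate ABS
and ADD:

  define <4 x i32> @saba_of_abs(<4 x i32> %x, <4 x i32> %y) {
    %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %y, i1 true)
    %add = add <4 x i32> %x, %abs
    ret <4 x i32> %add
  }

  ; Selected AArch64 code:
  ;   movi v2.2d, #0000000000000000
  ;   saba v0.4s, v1.4s, v2.4s
  ;   ret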

As well as being a useful generic transformation, this also fixes an
issue where LLVM de-optimises the [US]ABA NEON ACLE intrinsics into
separate ABD and ADD instructions when one of the operands is a zero
vector.
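
As an illustration of the intrinsics case, a hedged C sketch using the
standard arm_neon.h ACLE intrinsics (the function name is made up):

  #include <arm_neon.h>

  // Computes a + |b - 0| via the SABA intrinsic with an all-zeros vector.
  int32x4_t acc_abs_s32(int32x4_t a, int32x4_t b) {
    return vabaq_s32(a, b, vdupq_n_s32(0));
  }

Previously this lowered to a separate ABD (or ABS) plus ADD; with these
patterns it keeps a single saba against a movi zero, matching the
saba_sabd_zeros_* and saba_abs_zeros_* tests added below.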

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/neon-saba.ll
    llvm/test/CodeGen/AArch64/vecreduce-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2a90288c35751..f0020a9a3c91d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8323,6 +8323,29 @@ def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
                             (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
 }
 
+// SABA patterns for add(x, abs(y)) -> saba(x, y, 0)
+def : Pat<(v8i8 (add V64:$Vn, (abs V64:$Vm))),
+          (SABAv8i8 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v4i16 (add V64:$Vn, (abs V64:$Vm))),
+          (SABAv4i16 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v2i32 (add V64:$Vn, (abs V64:$Vm))),
+          (SABAv2i32 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v16i8 (add V128:$Vn, (abs V128:$Vm))),
+          (SABAv16i8 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+def : Pat<(v8i16 (add V128:$Vn, (abs V128:$Vm))),
+          (SABAv8i16 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+def : Pat<(v4i32 (add V128:$Vn, (abs V128:$Vm))),
+          (SABAv4i32 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
+
+// SABAL patterns for add(x, zext(abs(y))) -> sabal(x, y, 0)
+def : Pat<(v8i16 (add V128:$Vn, (zext (abs (v8i8 V64:$Vm))))),
+          (SABALv8i8_v8i16 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v4i32 (add V128:$Vn, (zext (abs (v4i16 V64:$Vm))))),
+          (SABALv4i16_v4i32 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+def : Pat<(v2i64 (add V128:$Vn, (zext (abs (v2i32 V64:$Vm))))),
+          (SABALv2i32_v2i64 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
+
+
 //----------------------------------------------------------------------------
 // AdvSIMD indexed element
 //----------------------------------------------------------------------------

diff  --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll
index 19967bd1a69ec..ddb85d6dee03c 100644
--- a/llvm/test/CodeGen/AArch64/neon-saba.ll
+++ b/llvm/test/CodeGen/AArch64/neon-saba.ll
@@ -12,9 +12,9 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
 ;
 ; CHECK-GI-LABEL: saba_abs_4s:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    sub v1.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT:    abs v1.4s, v1.4s
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    saba v0.4s, v1.4s, v3.4s
 ; CHECK-GI-NEXT:    ret
   %sub = sub nsw <4 x i32> %b, %c
   %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
@@ -30,9 +30,9 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
 ;
 ; CHECK-GI-LABEL: saba_abs_2s:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    sub v1.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    abs v1.2s, v1.2s
-; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    saba v0.2s, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    ret
   %sub = sub nsw <2 x i32> %b, %c
   %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true)
@@ -48,9 +48,9 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
 ;
 ; CHECK-GI-LABEL: saba_abs_8h:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    sub v1.8h, v1.8h, v2.8h
-; CHECK-GI-NEXT:    abs v1.8h, v1.8h
-; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    saba v0.8h, v1.8h, v3.8h
 ; CHECK-GI-NEXT:    ret
   %sub = sub nsw <8 x i16> %b, %c
   %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
@@ -66,9 +66,9 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
 ;
 ; CHECK-GI-LABEL: saba_abs_4h:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    sub v1.4h, v1.4h, v2.4h
-; CHECK-GI-NEXT:    abs v1.4h, v1.4h
-; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    saba v0.4h, v1.4h, v3.4h
 ; CHECK-GI-NEXT:    ret
   %sub = sub nsw <4 x i16> %b, %c
   %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true)
@@ -84,9 +84,9 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
 ;
 ; CHECK-GI-LABEL: saba_abs_16b:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    sub v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    abs v1.16b, v1.16b
-; CHECK-GI-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    saba v0.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
   %sub = sub nsw <16 x i8> %b, %c
   %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true)
@@ -102,9 +102,9 @@ define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
 ;
 ; CHECK-GI-LABEL: saba_abs_8b:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-GI-NEXT:    sub v1.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT:    abs v1.8b, v1.8b
-; CHECK-GI-NEXT:    add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saba v0.8b, v1.8b, v3.8b
 ; CHECK-GI-NEXT:    ret
   %sub = sub nsw <8 x i8> %b, %c
   %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true)
@@ -174,6 +174,214 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
   ret <8 x i8> %add
 }
 
+; SABA from ADD(SABD(X, ZEROS))
+
+define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK-LABEL: saba_sabd_zeros_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+  %sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer)
+  %add = add <4 x i32> %sabd, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: saba_sabd_zeros_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.2s, v1.2s, v2.2s
+; CHECK-NEXT:    ret
+  %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer)
+  %add = add <2 x i32> %sabd, %a
+  ret <2 x i32> %add
+}
+
+define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: saba_sabd_zeros_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.8h, v1.8h, v2.8h
+; CHECK-NEXT:    ret
+  %sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer)
+  %add = add <8 x i16> %sabd, %a
+  ret <8 x i16> %add
+}
+
+define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 {
+; CHECK-LABEL: saba_sabd_zeros_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.4h, v1.4h, v2.4h
+; CHECK-NEXT:    ret
+  %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer)
+  %add = add <4 x i16> %sabd, %a
+  ret <4 x i16> %add
+}
+
+define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 {
+; CHECK-LABEL: saba_sabd_zeros_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer)
+  %add = add <16 x i8> %sabd, %a
+  ret <16 x i8> %add
+}
+
+define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 {
+; CHECK-LABEL: saba_sabd_zeros_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer)
+  %add = add <8 x i8> %sabd, %a
+  ret <8 x i8> %add
+}
+
+define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK-LABEL: saba_abs_zeros_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+  %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true)
+  %add = add <4 x i32> %a, %abs
+  ret <4 x i32> %add
+}
+
+define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: saba_abs_zeros_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.2s, v1.2s, v2.2s
+; CHECK-NEXT:    ret
+  %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true)
+  %add = add <2 x i32> %a, %abs
+  ret <2 x i32> %add
+}
+
+define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: saba_abs_zeros_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.8h, v1.8h, v2.8h
+; CHECK-NEXT:    ret
+  %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true)
+  %add = add <8 x i16> %a, %abs
+  ret <8 x i16> %add
+}
+
+define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 {
+; CHECK-LABEL: saba_abs_zeros_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.4h, v1.4h, v2.4h
+; CHECK-NEXT:    ret
+  %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true)
+  %add = add <4 x i16> %a, %abs
+  ret <4 x i16> %add
+}
+
+define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 {
+; CHECK-LABEL: saba_abs_zeros_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %b, i1 true)
+  %add = add <16 x i8> %a, %abs
+  ret <16 x i8> %add
+}
+
+define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 {
+; CHECK-LABEL: saba_abs_zeros_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    saba v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true)
+  %add = add <8 x i8> %a, %abs
+  ret <8 x i8> %add
+}
+
+; SABAL from ADD(ZEXT(SABD(X, ZEROS)))
+
+define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: sabal_sabd_zeros_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    sabal v0.2d, v1.2s, v2.2s
+; CHECK-NEXT:    ret
+  %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer)
+  %sabd.zext = zext <2 x i32> %sabd to <2 x i64>
+  %add = add <2 x i64> %sabd.zext, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 {
+; CHECK-LABEL: sabal_sabd_zeros_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    sabal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT:    ret
+  %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer)
+  %sabd.zext = zext <4 x i16> %sabd to <4 x i32>
+  %add = add <4 x i32> %sabd.zext, %a
+  ret <4 x i32> %add
+}
+
+define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 {
+; CHECK-LABEL: sabal_sabd_zeros_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    sabal v0.8h, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer)
+  %sabd.zext = zext <8 x i8> %sabd to <8 x i16>
+  %add = add <8 x i16> %sabd.zext, %a
+  ret <8 x i16> %add
+}
+
+define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: sabal_abs_zeros_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    sabal v0.2d, v1.2s, v2.2s
+; CHECK-NEXT:    ret
+  %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true)
+  %abs.zext = zext <2 x i32> %abs to <2 x i64>
+  %add = add <2 x i64> %a, %abs.zext
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 {
+; CHECK-LABEL: sabal_abs_zeros_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    sabal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT:    ret
+  %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true)
+  %abs.zext = zext <4 x i16> %abs to <4 x i32>
+  %add = add <4 x i32> %a, %abs.zext
+  ret <4 x i32> %add
+}
+
+define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 {
+; CHECK-LABEL: sabal_abs_zeros_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    sabal v0.8h, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true)
+  %abs.zext = zext <8 x i8> %abs to <8 x i16>
+  %add = add <8 x i16> %a, %abs.zext
+  ret <8 x i16> %add
+}
+
 declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
 declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
 declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 74d1165d99b82..fb504028a161b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4535,96 +4535,89 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI-NEXT:    ldr d1, [x2]
 ; CHECK-GI-NEXT:    add x10, x0, x9
 ; CHECK-GI-NEXT:    add x11, x2, x8
-; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d2, [x11]
+; CHECK-GI-NEXT:    ldr d3, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
-; CHECK-GI-NEXT:    ldr d3, [x10]
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    ldr d2, [x10]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ldr d6, [x11]
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
-; CHECK-GI-NEXT:    abs v5.4s, v5.4s
-; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT:    usubl v3.8h, v1.8b, v3.8b
 ; CHECK-GI-NEXT:    ldr d4, [x10]
-; CHECK-GI-NEXT:    ldr d16, [x11]
-; CHECK-GI-NEXT:    abs v7.4s, v7.4s
-; CHECK-GI-NEXT:    abs v1.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d5, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
 ; CHECK-GI-NEXT:    ldr d6, [x10]
-; CHECK-GI-NEXT:    ldr d17, [x11]
+; CHECK-GI-NEXT:    ldr d7, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
-; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT:    sshll2 v16.4s, v3.8h, #0
+; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v5.8b
 ; CHECK-GI-NEXT:    ldr d5, [x10]
-; CHECK-GI-NEXT:    ldr d7, [x11]
-; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
-; CHECK-GI-NEXT:    ldr d17, [x11, x8]
-; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
-; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
-; CHECK-GI-NEXT:    ldr d7, [x10, x9]
-; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    ldr d17, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v7.8b
+; CHECK-GI-NEXT:    ldr d7, [x10]
+; CHECK-GI-NEXT:    ldr d19, [x11]
+; CHECK-GI-NEXT:    abs v2.4s, v2.4s
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    abs v16.4s, v16.4s
-; CHECK-GI-NEXT:    abs v3.4s, v3.4s
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v17.8b
+; CHECK-GI-NEXT:    ldr d17, [x10]
+; CHECK-GI-NEXT:    ldr d20, [x11]
+; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v19.8b
+; CHECK-GI-NEXT:    ldr d19, [x10, x9]
+; CHECK-GI-NEXT:    ldr d21, [x11, x8]
+; CHECK-GI-NEXT:    sshll2 v18.4s, v4.8h, #0
+; CHECK-GI-NEXT:    saba v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    sshll2 v1.4s, v6.8h, #0
+; CHECK-GI-NEXT:    usubl v17.8h, v17.8b, v20.8b
+; CHECK-GI-NEXT:    saba v16.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    sshll2 v3.4s, v5.8h, #0
+; CHECK-GI-NEXT:    usubl v19.8h, v19.8b, v21.8b
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    abs v18.4s, v18.4s
-; CHECK-GI-NEXT:    abs v2.4s, v2.4s
-; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
-; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
-; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
-; CHECK-GI-NEXT:    abs v19.4s, v19.4s
-; CHECK-GI-NEXT:    abs v4.4s, v4.4s
-; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
-; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
-; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
-; CHECK-GI-NEXT:    abs v17.4s, v17.4s
+; CHECK-GI-NEXT:    sshll2 v20.4s, v7.8h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT:    abs v1.4s, v1.4s
+; CHECK-GI-NEXT:    sshll2 v21.4s, v17.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT:    abs v3.4s, v3.4s
+; CHECK-GI-NEXT:    sshll2 v22.4s, v19.8h, #0
+; CHECK-GI-NEXT:    saba v18.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    sshll v4.4s, v7.4h, #0
+; CHECK-GI-NEXT:    abs v7.4s, v20.4s
+; CHECK-GI-NEXT:    saba v1.4s, v6.4s, v0.4s
+; CHECK-GI-NEXT:    sshll v6.4s, v17.4h, #0
+; CHECK-GI-NEXT:    abs v17.4s, v21.4s
+; CHECK-GI-NEXT:    saba v3.4s, v5.4s, v0.4s
+; CHECK-GI-NEXT:    sshll v5.4s, v19.4h, #0
+; CHECK-GI-NEXT:    abs v19.4s, v22.4s
+; CHECK-GI-NEXT:    saba v7.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    saba v17.4s, v6.4s, v0.4s
+; CHECK-GI-NEXT:    saba v19.4s, v5.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    addv s2, v16.4s
+; CHECK-GI-NEXT:    addv s4, v18.4s
 ; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    abs v6.4s, v6.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
-; CHECK-GI-NEXT:    addv s3, v3.4s
-; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
-; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
-; CHECK-GI-NEXT:    abs v16.4s, v16.4s
-; CHECK-GI-NEXT:    abs v5.4s, v5.4s
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
-; CHECK-GI-NEXT:    addv s2, v2.4s
+; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    addv s4, v4.4s
-; CHECK-GI-NEXT:    fmov w10, s3
-; CHECK-GI-NEXT:    abs v18.4s, v18.4s
-; CHECK-GI-NEXT:    abs v7.4s, v7.4s
-; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
+; CHECK-GI-NEXT:    addv s0, v3.4s
+; CHECK-GI-NEXT:    fmov w10, s4
+; CHECK-GI-NEXT:    addv s2, v7.4s
 ; CHECK-GI-NEXT:    add w8, w8, w9
-; CHECK-GI-NEXT:    addv s3, v6.4s
-; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    addv s1, v17.4s
 ; CHECK-GI-NEXT:    add w8, w10, w8
-; CHECK-GI-NEXT:    fmov w10, s4
-; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
-; CHECK-GI-NEXT:    fmov w9, s3
-; CHECK-GI-NEXT:    add w8, w10, w8
-; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    addv s0, v19.4s
+; CHECK-GI-NEXT:    add w8, w9, w8
+; CHECK-GI-NEXT:    fmov w9, s2
 ; CHECK-GI-NEXT:    add w8, w9, w8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w8, w9, w8


        

