[llvm] [AArch64] Avoid NEON dot product in streaming[-compatible] functions (PR #101677)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 2 06:54:49 PDT 2024


https://github.com/sdesmalen-arm created https://github.com/llvm/llvm-project/pull/101677

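The NEON dot product instructions (udot/sdot) are not valid in streaming mode,
so this patch makes performVecReduceAddCombine bail out when NEON is not
available, i.e. in streaming[-compatible] functions.
A follow-up patch will improve codegen for these operations.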

>From 79d074a3d5476cea53e98739cb5865752cbb4ab7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 2 Aug 2024 14:39:31 +0100
Subject: [PATCH 1/2] Precommit test

---
 ...-streaming-mode-fixed-length-reductions.ll | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
new file mode 100644
index 0000000000000..7d0918c7023cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+dotprod,+sve < %s | FileCheck %s -check-prefix=DOT
+; RUN: llc -mattr=+dotprod,+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=STREAMING-SVE
+; RUN: llc -mattr=+dotprod,+sme -force-streaming < %s | FileCheck %s --check-prefix=STREAMING-SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
+; CHECK-LABEL: reduce_uaddv_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT:    ushll2 v3.8h, v0.16b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    uaddl2 v4.4s, v3.8h, v2.8h
+; CHECK-NEXT:    uaddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    uaddl2 v5.4s, v0.8h, v1.8h
+; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; DOT-LABEL: reduce_uaddv_v16i8:
+; DOT:       // %bb.0:
+; DOT-NEXT:    movi v2.16b, #1
+; DOT-NEXT:    movi v3.2d, #0000000000000000
+; DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
+; DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
+; DOT-NEXT:    addv s0, v3.4s
+; DOT-NEXT:    fmov w0, s0
+; DOT-NEXT:    ret
+;
+; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
+; STREAMING-SVE:       // %bb.0:
+; STREAMING-SVE-NEXT:    mov z2.b, #1 // =0x1
+; STREAMING-SVE-NEXT:    mov z3.s, #0 // =0x0
+; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
+; STREAMING-SVE-NEXT:    udot v3.4s, v1.16b, v2.16b
+; STREAMING-SVE-NEXT:    udot v3.4s, v0.16b, v2.16b
+; STREAMING-SVE-NEXT:    uaddv d0, p0, z3.s
+; STREAMING-SVE-NEXT:    fmov x0, d0
+; STREAMING-SVE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT:    ret
+  %1 = zext <32 x i8> %a to <32 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+  ret i32 %2
+}
+
+define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
+; CHECK-LABEL: reduce_saddv_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll2 v2.8h, v1.16b, #0
+; CHECK-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    saddl2 v4.4s, v3.8h, v2.8h
+; CHECK-NEXT:    saddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    saddl2 v5.4s, v0.8h, v1.8h
+; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+;
+; DOT-LABEL: reduce_saddv_v16i8:
+; DOT:       // %bb.0:
+; DOT-NEXT:    movi v2.16b, #1
+; DOT-NEXT:    movi v3.2d, #0000000000000000
+; DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
+; DOT-NEXT:    addv s0, v3.4s
+; DOT-NEXT:    fmov w0, s0
+; DOT-NEXT:    ret
+;
+; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
+; STREAMING-SVE:       // %bb.0:
+; STREAMING-SVE-NEXT:    mov z2.b, #1 // =0x1
+; STREAMING-SVE-NEXT:    mov z3.s, #0 // =0x0
+; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
+; STREAMING-SVE-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; STREAMING-SVE-NEXT:    sdot v3.4s, v0.16b, v2.16b
+; STREAMING-SVE-NEXT:    uaddv d0, p0, z3.s
+; STREAMING-SVE-NEXT:    fmov x0, d0
+; STREAMING-SVE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-SVE-NEXT:    ret
+  %1 = sext <32 x i8> %a to <32 x i32>
+  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+  ret i32 %2
+}

>From 0cc64169015f469e174506c7eb92ee01032c5d11 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 2 Aug 2024 14:24:47 +0100
Subject: [PATCH 2/2] [AArch64] Avoid NEON dot product in
 streaming[-compatible] functions.

A follow-up patch will improve codegen for these functions.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 +
 ...-streaming-mode-fixed-length-reductions.ll | 68 ++++++++++++++++---
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 01079a95b4746..2e869f11b8431 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17866,6 +17866,9 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
                                           const AArch64Subtarget *ST) {
+  if (!ST->isNeonAvailable())
+    return SDValue();
+
   if (!ST->hasDotProd())
     return performVecReduceAddCombineWithUADDLP(N, DAG);
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
index 7d0918c7023cc..00a15f4bcd639 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -36,12 +36,36 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
 ;
 ; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
 ; STREAMING-SVE:       // %bb.0:
-; STREAMING-SVE-NEXT:    mov z2.b, #1 // =0x1
-; STREAMING-SVE-NEXT:    mov z3.s, #0 // =0x0
+; STREAMING-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; STREAMING-SVE-NEXT:    uunpklo z2.h, z1.b
+; STREAMING-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-SVE-NEXT:    uunpklo z3.h, z0.b
 ; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
-; STREAMING-SVE-NEXT:    udot v3.4s, v1.16b, v2.16b
-; STREAMING-SVE-NEXT:    udot v3.4s, v0.16b, v2.16b
-; STREAMING-SVE-NEXT:    uaddv d0, p0, z3.s
+; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
+; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT:    uunpklo z1.h, z1.b
+; STREAMING-SVE-NEXT:    uunpklo z0.h, z0.b
+; STREAMING-SVE-NEXT:    uunpklo z4.s, z2.h
+; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z2.b, #8
+; STREAMING-SVE-NEXT:    uunpklo z6.s, z3.h
+; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z3.b, #8
+; STREAMING-SVE-NEXT:    mov z5.d, z1.d
+; STREAMING-SVE-NEXT:    uunpklo z7.s, z0.h
+; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT:    uunpklo z2.s, z2.h
+; STREAMING-SVE-NEXT:    uunpklo z3.s, z3.h
+; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
+; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z1.b, #8
+; STREAMING-SVE-NEXT:    uunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT:    uunpklo z0.s, z0.h
+; STREAMING-SVE-NEXT:    add z2.s, z3.s, z2.s
+; STREAMING-SVE-NEXT:    uunpklo z5.s, z5.h
+; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
+; STREAMING-SVE-NEXT:    add z1.s, z4.s, z1.s
+; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
+; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
+; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
 ; STREAMING-SVE-NEXT:    fmov x0, d0
 ; STREAMING-SVE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; STREAMING-SVE-NEXT:    ret
@@ -80,12 +104,36 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
 ;
 ; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
 ; STREAMING-SVE:       // %bb.0:
-; STREAMING-SVE-NEXT:    mov z2.b, #1 // =0x1
-; STREAMING-SVE-NEXT:    mov z3.s, #0 // =0x0
+; STREAMING-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; STREAMING-SVE-NEXT:    sunpklo z2.h, z1.b
+; STREAMING-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-SVE-NEXT:    sunpklo z3.h, z0.b
 ; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
-; STREAMING-SVE-NEXT:    sdot v3.4s, v1.16b, v2.16b
-; STREAMING-SVE-NEXT:    sdot v3.4s, v0.16b, v2.16b
-; STREAMING-SVE-NEXT:    uaddv d0, p0, z3.s
+; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
+; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT:    sunpklo z1.h, z1.b
+; STREAMING-SVE-NEXT:    sunpklo z0.h, z0.b
+; STREAMING-SVE-NEXT:    sunpklo z4.s, z2.h
+; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z2.b, #8
+; STREAMING-SVE-NEXT:    sunpklo z6.s, z3.h
+; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z3.b, #8
+; STREAMING-SVE-NEXT:    mov z5.d, z1.d
+; STREAMING-SVE-NEXT:    sunpklo z7.s, z0.h
+; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
+; STREAMING-SVE-NEXT:    sunpklo z2.s, z2.h
+; STREAMING-SVE-NEXT:    sunpklo z3.s, z3.h
+; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
+; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z1.b, #8
+; STREAMING-SVE-NEXT:    sunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT:    sunpklo z0.s, z0.h
+; STREAMING-SVE-NEXT:    add z2.s, z3.s, z2.s
+; STREAMING-SVE-NEXT:    sunpklo z5.s, z5.h
+; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
+; STREAMING-SVE-NEXT:    add z1.s, z4.s, z1.s
+; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
+; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
+; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
 ; STREAMING-SVE-NEXT:    fmov x0, d0
 ; STREAMING-SVE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; STREAMING-SVE-NEXT:    ret



More information about the llvm-commits mailing list