[llvm] [AArch64][GlobalISel] Improve non-SVE popcount for 32-bit and 64-bit using udot (PR #96409)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 25 05:46:47 PDT 2024
================
@@ -67,25 +140,338 @@ Entry:
declare i256 @llvm.ctpop.i256(i256)
define <1 x i128> @popcount1x128(<1 x i128> %0) {
+; CHECKO0-LABEL: popcount1x128:
+; CHECKO0: // %bb.0: // %Entry
+; CHECKO0-NEXT: // implicit-def: $q0
+; CHECKO0-NEXT: mov v0.d[0], x0
+; CHECKO0-NEXT: mov v0.d[1], x1
+; CHECKO0-NEXT: cnt v0.16b, v0.16b
+; CHECKO0-NEXT: uaddlv h0, v0.16b
+; CHECKO0-NEXT: // kill: def $q0 killed $h0
+; CHECKO0-NEXT: mov x1, xzr
+; CHECKO0-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECKO0-NEXT: fmov w0, s0
+; CHECKO0-NEXT: mov w8, wzr
+; CHECKO0-NEXT: // kill: def $x0 killed $w0
+; CHECKO0-NEXT: // kill: def $x8 killed $w8
+; CHECKO0-NEXT: bfi x0, x8, #32, #32
+; CHECKO0-NEXT: ret
+;
+; GISEL-LABEL: popcount1x128:
+; GISEL: // %bb.0: // %Entry
+; GISEL-NEXT: // implicit-def: $q0
+; GISEL-NEXT: mov v0.d[0], x0
+; GISEL-NEXT: mov v0.d[1], x1
+; GISEL-NEXT: cnt v0.16b, v0.16b
+; GISEL-NEXT: uaddlv h0, v0.16b
+; GISEL-NEXT: // kill: def $q0 killed $h0
+; GISEL-NEXT: mov x1, xzr
+; GISEL-NEXT: // kill: def $s0 killed $s0 killed $q0
+; GISEL-NEXT: fmov w0, s0
+; GISEL-NEXT: mov w8, wzr
+; GISEL-NEXT: // kill: def $x0 killed $w0
+; GISEL-NEXT: // kill: def $x8 killed $w8
+; GISEL-NEXT: bfi x0, x8, #32, #32
+; GISEL-NEXT: ret
+;
; CHECK-LABEL: popcount1x128:
; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: // implicit-def: $q0
; CHECK-NEXT: mov v0.d[0], x0
; CHECK-NEXT: mov v0.d[1], x1
+; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: cnt v0.16b, v0.16b
; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: // kill: def $q0 killed $h0
-; CHECK-NEXT: mov x1, xzr
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: // kill: def $x0 killed $w0
-; CHECK-NEXT: // kill: def $x8 killed $w8
-; CHECK-NEXT: bfi x0, x8, #32, #32
+; CHECK-NEXT: mov w0, v0.s[0]
; CHECK-NEXT: ret
Entry:
- %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
+ %1 = tail call <1 x i128> @llvm.ctpop.v1i128(<1 x i128> %0)
ret <1 x i128> %1
}
-declare <1 x i128> @llvm.ctpop.v1.i128(<1 x i128>)
+declare <1 x i128> @llvm.ctpop.v1i128(<1 x i128>)
+
+define <2 x i64> @popcount2x64(<2 x i64> %0) {
+; CHECKO0-LABEL: popcount2x64:
+; CHECKO0: // %bb.0: // %Entry
+; CHECKO0-NEXT: cnt v0.16b, v0.16b
+; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
+; CHECKO0-NEXT: uaddlp v0.4s, v0.8h
+; CHECKO0-NEXT: uaddlp v0.2d, v0.4s
+; CHECKO0-NEXT: ret
+;
+; GISEL-LABEL: popcount2x64:
+; GISEL: // %bb.0: // %Entry
+; GISEL-NEXT: cnt v0.16b, v0.16b
+; GISEL-NEXT: uaddlp v0.8h, v0.16b
+; GISEL-NEXT: uaddlp v0.4s, v0.8h
+; GISEL-NEXT: uaddlp v0.2d, v0.4s
+; GISEL-NEXT: ret
+;
+; NEON-GISEL-LABEL: popcount2x64:
+; NEON-GISEL: // %bb.0: // %Entry
+; NEON-GISEL-NEXT: cnt v0.16b, v0.16b
+; NEON-GISEL-NEXT: uaddlp v0.8h, v0.16b
+; NEON-GISEL-NEXT: uaddlp v0.4s, v0.8h
+; NEON-GISEL-NEXT: uaddlp v0.2d, v0.4s
+; NEON-GISEL-NEXT: ret
+;
+; DOT-GISEL-LABEL: popcount2x64:
+; DOT-GISEL: // %bb.0: // %Entry
+; DOT-GISEL-NEXT: movi v1.2d, #0000000000000000
+; DOT-GISEL-NEXT: cnt v0.16b, v0.16b
+; DOT-GISEL-NEXT: movi v2.16b, #1
+; DOT-GISEL-NEXT: udot v1.4s, v2.16b, v0.16b
+; DOT-GISEL-NEXT: uaddlp v0.2d, v1.4s
+; DOT-GISEL-NEXT: ret
+;
+; SVE-GISEL-LABEL: popcount2x64:
+; SVE-GISEL: // %bb.0: // %Entry
+; SVE-GISEL-NEXT: cnt v0.16b, v0.16b
+; SVE-GISEL-NEXT: uaddlp v0.8h, v0.16b
+; SVE-GISEL-NEXT: uaddlp v0.4s, v0.8h
+; SVE-GISEL-NEXT: uaddlp v0.2d, v0.4s
+; SVE-GISEL-NEXT: ret
+; NEON-LABEL: popcount2x64:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: uaddlp v0.8h, v0.16b
+; NEON-NEXT: uaddlp v0.4s, v0.8h
+; NEON-NEXT: uaddlp v0.2d, v0.4s
+; NEON-NEXT: ret
+; DOT-LABEL: popcount2x64:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: movi v1.16b, #1
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: movi v2.2d, #0000000000000000
+; DOT-NEXT: udot v2.4s, v1.16b, v0.16b
+; DOT-NEXT: uaddlp v0.2d, v2.4s
+; DOT-NEXT: ret
+; SVE-LABEL: popcount2x64:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: cnt v0.16b, v0.16b
+; SVE-NEXT: uaddlp v0.8h, v0.16b
+; SVE-NEXT: uaddlp v0.4s, v0.8h
+; SVE-NEXT: uaddlp v0.2d, v0.4s
+; SVE-NEXT: ret
+Entry:
+ %1 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+define <1 x i64> @popcount1x64(<1 x i64> %0) {
+; CHECKO0-LABEL: popcount1x64:
+; CHECKO0: // %bb.0: // %Entry
+; CHECKO0-NEXT: fmov x0, d0
+; CHECKO0-NEXT: fmov d0, x0
+; CHECKO0-NEXT: cnt v0.8b, v0.8b
+; CHECKO0-NEXT: uaddlv h0, v0.8b
+; CHECKO0-NEXT: // kill: def $q0 killed $h0
+; CHECKO0-NEXT: mov w8, v0.s[0]
+; CHECKO0-NEXT: // kill: def $x8 killed $w8
+; CHECKO0-NEXT: fmov d0, x8
+; CHECKO0-NEXT: ret
+;
+; GISEL-LABEL: popcount1x64:
+; GISEL: // %bb.0: // %Entry
+; GISEL-NEXT: fmov x0, d0
+; GISEL-NEXT: fmov d0, x0
+; GISEL-NEXT: cnt v0.8b, v0.8b
+; GISEL-NEXT: uaddlv h0, v0.8b
+; GISEL-NEXT: // kill: def $q0 killed $h0
+; GISEL-NEXT: mov w8, v0.s[0]
+; GISEL-NEXT: // kill: def $x8 killed $w8
+; GISEL-NEXT: fmov d0, x8
+; GISEL-NEXT: ret
+;
+; CHECK-LABEL: popcount1x64:
+; CHECK: // %bb.0: // %Entry
+; CHECK-NEXT: cnt v0.8b, v0.8b
+; CHECK-NEXT: uaddlv h0, v0.8b
+; CHECK-NEXT: mov w8, v0.s[0]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+Entry:
+ %1 = tail call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %0)
+ ret <1 x i64> %1
+}
+
+declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>)
+
+define <4 x i32> @popcount4x32(<4 x i32> %0) {
+; CHECKO0-LABEL: popcount4x32:
+; CHECKO0: // %bb.0: // %Entry
+; CHECKO0-NEXT: cnt v0.16b, v0.16b
+; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
+; CHECKO0-NEXT: uaddlp v0.4s, v0.8h
+; CHECKO0-NEXT: ret
+;
+; GISEL-LABEL: popcount4x32:
+; GISEL: // %bb.0: // %Entry
+; GISEL-NEXT: cnt v0.16b, v0.16b
+; GISEL-NEXT: uaddlp v0.8h, v0.16b
+; GISEL-NEXT: uaddlp v0.4s, v0.8h
+; GISEL-NEXT: ret
+;
+; NEON-GISEL-LABEL: popcount4x32:
+; NEON-GISEL: // %bb.0: // %Entry
+; NEON-GISEL-NEXT: cnt v0.16b, v0.16b
+; NEON-GISEL-NEXT: uaddlp v0.8h, v0.16b
+; NEON-GISEL-NEXT: uaddlp v0.4s, v0.8h
+; NEON-GISEL-NEXT: ret
+;
+; DOT-GISEL-LABEL: popcount4x32:
+; DOT-GISEL: // %bb.0: // %Entry
+; DOT-GISEL-NEXT: movi v1.2d, #0000000000000000
+; DOT-GISEL-NEXT: cnt v0.16b, v0.16b
+; DOT-GISEL-NEXT: movi v2.16b, #1
+; DOT-GISEL-NEXT: udot v1.4s, v2.16b, v0.16b
+; DOT-GISEL-NEXT: mov v0.16b, v1.16b
+; DOT-GISEL-NEXT: ret
+;
+; SVE-GISEL-LABEL: popcount4x32:
+; SVE-GISEL: // %bb.0: // %Entry
+; SVE-GISEL-NEXT: cnt v0.16b, v0.16b
+; SVE-GISEL-NEXT: uaddlp v0.8h, v0.16b
+; SVE-GISEL-NEXT: uaddlp v0.4s, v0.8h
+; SVE-GISEL-NEXT: ret
+; NEON-LABEL: popcount4x32:
----------------
tgymnich wrote:
done
https://github.com/llvm/llvm-project/pull/96409
More information about the llvm-commits mailing list