[llvm] [AArch64] Avoid NEON ctpop in Streaming-SVE mode (PR #93826)

via llvm-commits llvm-commits at lists.llvm.org
Thu May 30 07:06:40 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Sander de Smalen (sdesmalen-arm)

<details>
<summary>Changes</summary>

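The NEON ctpop instruction (`cnt`) is also used to lower scalar ctpop, so this lowering must be avoided whenever NEON is not available (e.g. in Streaming-SVE mode), not just when the target lacks the NEON feature.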

---

Patch is 106.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93826.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll (+1499-833) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 365ef68dcb19b..ac6f1e07c4184 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9571,13 +9571,17 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
           Attribute::NoImplicitFloat))
     return SDValue();
 
-  if (!Subtarget->hasNEON())
+  EVT VT = Op.getValueType();
+  if (VT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
+
+  if (!Subtarget->isNeonAvailable())
     return SDValue();
 
   bool IsParity = Op.getOpcode() == ISD::PARITY;
   SDValue Val = Op.getOperand(0);
   SDLoc DL(Op);
-  EVT VT = Op.getValueType();
 
   // for i32, general parity function using EORs is more efficient compared to
   // using floating point
@@ -9626,10 +9630,6 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
 
   assert(!IsParity && "ISD::PARITY of vector types not supported");
 
-  if (VT.isScalableVector() ||
-      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
-
   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
          "Unexpected type for custom ctpop lowering");
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index f920efeb4892d..f662140327135 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -741,37 +741,63 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
-; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #70]
-; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #68]
-; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #66]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    fmov d1, x9
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
-; NONEON-NOSVE-NEXT:    fmov d2, x10
-; NONEON-NOSVE-NEXT:    fmov d3, x8
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
-; NONEON-NOSVE-NEXT:    cnt v2.8b, v2.8b
-; NONEON-NOSVE-NEXT:    cnt v3.8b, v3.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
-; NONEON-NOSVE-NEXT:    uaddlv h2, v2.8b
-; NONEON-NOSVE-NEXT:    uaddlv h3, v3.8b
-; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    stp q3, q2, [sp]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp]
+; NONEON-NOSVE-NEXT:    lsr w13, w9, #1
+; NONEON-NOSVE-NEXT:    lsr w14, w11, #1
+; NONEON-NOSVE-NEXT:    lsr w15, w10, #1
+; NONEON-NOSVE-NEXT:    lsr w16, w12, #1
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w13
+; NONEON-NOSVE-NEXT:    and w13, w14, #0x55555555
+; NONEON-NOSVE-NEXT:    and w14, w15, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w11, w11, w13
+; NONEON-NOSVE-NEXT:    lsr w13, w9, #2
+; NONEON-NOSVE-NEXT:    and w15, w16, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w10, w10, w14
+; NONEON-NOSVE-NEXT:    sub w12, w12, w15
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x33333333
+; NONEON-NOSVE-NEXT:    lsr w14, w11, #2
+; NONEON-NOSVE-NEXT:    lsr w15, w10, #2
+; NONEON-NOSVE-NEXT:    add w9, w9, w13
+; NONEON-NOSVE-NEXT:    lsr w13, w12, #2
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    and w14, w14, #0x33333333
+; NONEON-NOSVE-NEXT:    and w15, w15, #0x33333333
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x33333333
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x33333333
+; NONEON-NOSVE-NEXT:    add w11, w11, w14
+; NONEON-NOSVE-NEXT:    add w10, w10, w15
+; NONEON-NOSVE-NEXT:    add w12, w12, w13
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    add w11, w11, w11, lsr #4
+; NONEON-NOSVE-NEXT:    add w10, w10, w10, lsr #4
+; NONEON-NOSVE-NEXT:    add w12, w12, w12, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    and w11, w11, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    and w12, w12, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w8
+; NONEON-NOSVE-NEXT:    mul w8, w12, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #24
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
@@ -788,67 +814,115 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #144
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
-; NONEON-NOSVE-NEXT:    str d0, [sp, #128]
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #135]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #134]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #133]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #132]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #131]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #130]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #129]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #143]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #141]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #139]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    str q0, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #136]
-; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
@@ -865,126 +939,219 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #304
-; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
-; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
-; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
-; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #271]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #270]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #269]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #268]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #267]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #266]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #265]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #264]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #263]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #262]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #261]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #260]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #259]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #258]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #257]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #256]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fmov d0, x8
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #287]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
-; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #286]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #285]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
-; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #284]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #283]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
-; NONEON-NOSVE-NEXT:    str q0, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #282]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #280]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #278]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #276]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #274]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
-; NONEON-NOSVE-NEXT:    ldr w8, [sp]
-; NONEON-NOSVE-NEXT:    strb w8, [sp, #272]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #272]
-; NONEON-NOSVE-NEXT:    add sp, sp, #304
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x33333333
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x33333333
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w9, lsr #4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0xf0f0f0f
+; NONEON-NOSVE-NEXT:    mul w9, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
+; NONEON-NOSVE-NEXT:    ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/93826


More information about the llvm-commits mailing list