[llvm-branch-commits] [llvm] release/20.x: [AArch64] Fix BE popcount casts. (#129879) (PR #129996)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Tue Mar 11 13:52:24 PDT 2025


https://github.com/llvmbot updated https://github.com/llvm/llvm-project/pull/129996

>From 05be3ca72e392ba5055d2c3e617adaeab89e258b Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 5 Mar 2025 11:23:33 +0000
Subject: [PATCH 1/2] [AArch64] Add BE test coverage for popcount. NFC

For #129843

(cherry picked from commit b673a59c9ae5583aa08a8d34a48f9409b660d826)
---
 llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 161 ++++++++++++++++++++++
 llvm/test/CodeGen/AArch64/popcount.ll     | 104 ++++++++++++++
 2 files changed, 265 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index ad0904ff98080..369667ec33f66 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
 ; RUN: llc < %s -mtriple=aarch64 -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
+; RUN: llc < %s -mtriple=aarch64_be-none-eabi | FileCheck %s --check-prefix=CHECK-BE
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-LABEL: cnt32_advsimd:
@@ -32,6 +33,14 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt w0, w0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt32_advsimd:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov s0, w0
+; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
+; CHECK-BE-NEXT:    addv b0, v0.8b
+; CHECK-BE-NEXT:    fmov w0, s0
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -69,6 +78,16 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
 ; CHECK-CSSC-NEXT:    fmov w8, s0
 ; CHECK-CSSC-NEXT:    cnt w0, w8
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt32_advsimd_2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    fmov w8, s0
+; CHECK-BE-NEXT:    fmov s0, w8
+; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
+; CHECK-BE-NEXT:    addv b0, v0.8b
+; CHECK-BE-NEXT:    fmov w0, s0
+; CHECK-BE-NEXT:    ret
   %1 = extractelement <2 x i32> %x, i64 0
   %2 = tail call i32 @llvm.ctpop.i32(i32 %1)
   ret i32 %2
@@ -103,6 +122,16 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt x0, x0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt64_advsimd:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
+; CHECK-BE-NEXT:    addv b0, v0.8b
+; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
+; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -147,6 +176,22 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt w0, w0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt32:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    lsr w9, w0, #1
+; CHECK-BE-NEXT:    mov w8, #16843009 // =0x1010101
+; CHECK-BE-NEXT:    and w9, w9, #0x55555555
+; CHECK-BE-NEXT:    sub w9, w0, w9
+; CHECK-BE-NEXT:    lsr w10, w9, #2
+; CHECK-BE-NEXT:    and w9, w9, #0x33333333
+; CHECK-BE-NEXT:    and w10, w10, #0x33333333
+; CHECK-BE-NEXT:    add w9, w9, w10
+; CHECK-BE-NEXT:    add w9, w9, w9, lsr #4
+; CHECK-BE-NEXT:    and w9, w9, #0xf0f0f0f
+; CHECK-BE-NEXT:    mul w8, w9, w8
+; CHECK-BE-NEXT:    lsr w0, w8, #24
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -188,6 +233,22 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt x0, x0
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt64:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    lsr x9, x0, #1
+; CHECK-BE-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
+; CHECK-BE-NEXT:    and x9, x9, #0x5555555555555555
+; CHECK-BE-NEXT:    sub x9, x0, x9
+; CHECK-BE-NEXT:    lsr x10, x9, #2
+; CHECK-BE-NEXT:    and x9, x9, #0x3333333333333333
+; CHECK-BE-NEXT:    and x10, x10, #0x3333333333333333
+; CHECK-BE-NEXT:    add x9, x9, x10
+; CHECK-BE-NEXT:    add x9, x9, x9, lsr #4
+; CHECK-BE-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
+; CHECK-BE-NEXT:    mul x8, x9, x8
+; CHECK-BE-NEXT:    lsr x0, x8, #56
+; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -215,6 +276,14 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
 ; CHECK-CSSC-NEXT:    cmp x8, #1
 ; CHECK-CSSC-NEXT:    cset w0, eq
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop_eq_one:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    sub x8, x0, #1
+; CHECK-BE-NEXT:    eor x9, x0, x8
+; CHECK-BE-NEXT:    cmp x9, x8
+; CHECK-BE-NEXT:    cset w0, hi
+; CHECK-BE-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp eq i64 %count, 1
   %conv = zext i1 %cmp to i32
@@ -244,6 +313,14 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 ; CHECK-CSSC-NEXT:    cmp x8, #1
 ; CHECK-CSSC-NEXT:    cset w0, ne
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop_ne_one:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    sub x8, x0, #1
+; CHECK-BE-NEXT:    eor x9, x0, x8
+; CHECK-BE-NEXT:    cmp x9, x8
+; CHECK-BE-NEXT:    cset w0, ls
+; CHECK-BE-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp ne i64 %count, 1
   %conv = zext i1 %cmp to i32
@@ -273,6 +350,14 @@ define i1 @ctpop32_ne_one(i32 %x) nounwind readnone {
 ; CHECK-CSSC-NEXT:    cmp w8, #1
 ; CHECK-CSSC-NEXT:    cset w0, ne
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop32_ne_one:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    sub w8, w0, #1
+; CHECK-BE-NEXT:    eor w9, w0, w8
+; CHECK-BE-NEXT:    cmp w9, w8
+; CHECK-BE-NEXT:    cset w0, ls
+; CHECK-BE-NEXT:    ret
   %count = tail call i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp ne i32 %count, 1
   ret i1 %cmp
@@ -299,6 +384,13 @@ define i1 @ctpop32_eq_one_nonzero(i32 %x) {
 ; CHECK-CSSC-NEXT:    tst w0, w8
 ; CHECK-CSSC-NEXT:    cset w0, eq
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop32_eq_one_nonzero:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    sub w8, w0, #1
+; CHECK-BE-NEXT:    tst w0, w8
+; CHECK-BE-NEXT:    cset w0, eq
+; CHECK-BE-NEXT:    ret
 entry:
   %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp eq i32 %popcnt, 1
@@ -326,11 +418,80 @@ define i1 @ctpop32_ne_one_nonzero(i32 %x) {
 ; CHECK-CSSC-NEXT:    tst w0, w8
 ; CHECK-CSSC-NEXT:    cset w0, ne
 ; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: ctpop32_ne_one_nonzero:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    sub w8, w0, #1
+; CHECK-BE-NEXT:    tst w0, w8
+; CHECK-BE-NEXT:    cset w0, ne
+; CHECK-BE-NEXT:    ret
 entry:
   %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp ne i32 %popcnt, 1
   ret i1 %cmp
 }
 
+define i128 @cnt128(i128 %x) nounwind readnone {
+; CHECK-LABEL: cnt128:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov.d v0[1], x1
+; CHECK-NEXT:    cnt.16b v0, v0
+; CHECK-NEXT:    addv.16b b0, v0
+; CHECK-NEXT:    mov.d x1, v0[1]
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+;
+; CHECK-NONEON-LABEL: cnt128:
+; CHECK-NONEON:       // %bb.0:
+; CHECK-NONEON-NEXT:    lsr x9, x0, #1
+; CHECK-NONEON-NEXT:    lsr x10, x1, #1
+; CHECK-NONEON-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
+; CHECK-NONEON-NEXT:    and x9, x9, #0x5555555555555555
+; CHECK-NONEON-NEXT:    and x10, x10, #0x5555555555555555
+; CHECK-NONEON-NEXT:    sub x9, x0, x9
+; CHECK-NONEON-NEXT:    sub x10, x1, x10
+; CHECK-NONEON-NEXT:    mov x1, xzr
+; CHECK-NONEON-NEXT:    lsr x11, x9, #2
+; CHECK-NONEON-NEXT:    lsr x12, x10, #2
+; CHECK-NONEON-NEXT:    and x9, x9, #0x3333333333333333
+; CHECK-NONEON-NEXT:    and x10, x10, #0x3333333333333333
+; CHECK-NONEON-NEXT:    and x11, x11, #0x3333333333333333
+; CHECK-NONEON-NEXT:    add x9, x9, x11
+; CHECK-NONEON-NEXT:    and x11, x12, #0x3333333333333333
+; CHECK-NONEON-NEXT:    add x9, x9, x9, lsr #4
+; CHECK-NONEON-NEXT:    add x10, x10, x11
+; CHECK-NONEON-NEXT:    add x10, x10, x10, lsr #4
+; CHECK-NONEON-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON-NEXT:    mul x9, x9, x8
+; CHECK-NONEON-NEXT:    and x10, x10, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON-NEXT:    mul x8, x10, x8
+; CHECK-NONEON-NEXT:    lsr x9, x9, #56
+; CHECK-NONEON-NEXT:    add x0, x9, x8, lsr #56
+; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x1
+; CHECK-CSSC-NEXT:    cnt x9, x0
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    add x0, x9, x8
+; CHECK-CSSC-NEXT:    ret
+;
+; CHECK-BE-LABEL: cnt128:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    mov v0.d[1], x1
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    cnt v0.16b, v0.16b
+; CHECK-BE-NEXT:    addv b0, v0.16b
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    mov x1, v0.d[1]
+; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    ret
+  %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
+  ret i128 %cnt
+}
+
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 89b1ac0a0edf1..6cc925f0ae91f 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+neon,+dotprod | FileCheck %s --check-prefixes=CHECK,DOT
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mattr=+sve | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown | FileCheck %s --check-prefixes=BE
 ; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISEL
 ; RUN: llc < %s -O0 -global-isel -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=GISELO0
 ; RUN: llc < %s -global-isel -mtriple=aarch64-unknown-unknown -mattr=+neon | FileCheck %s --check-prefixes=GISEL,NEON-GISEL
@@ -32,6 +33,18 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount128:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    ldr d0, [x0]
+; BE-NEXT:    add x8, x0, #8
+; BE-NEXT:    ld1 { v0.d }[1], [x8]
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    addv b0, v0.16b
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    mov w0, v0.s[3]
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount128:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    ldr q0, [x0]
@@ -111,6 +124,27 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; CHECK-NEXT:    add w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount256:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    ldr d0, [x0]
+; BE-NEXT:    ldr d1, [x0, #16]
+; BE-NEXT:    add x8, x0, #24
+; BE-NEXT:    add x9, x0, #8
+; BE-NEXT:    ld1 { v0.d }[1], [x9]
+; BE-NEXT:    ld1 { v1.d }[1], [x8]
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    rev64 v1.16b, v1.16b
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    cnt v1.16b, v1.16b
+; BE-NEXT:    addv b0, v0.16b
+; BE-NEXT:    addv b1, v1.16b
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    rev32 v1.16b, v1.16b
+; BE-NEXT:    mov w8, v0.s[3]
+; BE-NEXT:    mov w9, v1.s[3]
+; BE-NEXT:    add w0, w9, w8
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount256:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    ldp x8, x9, [x0]
@@ -199,6 +233,18 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount1x128:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    mov v0.d[1], x1
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    addv b0, v0.16b
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    mov x1, v0.d[1]
+; BE-NEXT:    fmov x0, d0
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount1x128:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    mov v0.d[0], x0
@@ -266,6 +312,17 @@ define <2 x i64> @popcount2x64(<2 x i64> %0) {
 ; SVE-NEXT:    uaddlp v0.2d, v0.4s
 ; SVE-NEXT:    ret
 ;
+; BE-LABEL: popcount2x64:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    uaddlp v0.8h, v0.16b
+; BE-NEXT:    uaddlp v0.4s, v0.8h
+; BE-NEXT:    uaddlp v0.2d, v0.4s
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+;
 ; GISELO0-LABEL: popcount2x64:
 ; GISELO0:       // %bb.0: // %Entry
 ; GISELO0-NEXT:    cnt v0.16b, v0.16b
@@ -326,6 +383,15 @@ define <1 x i64> @popcount1x64(<1 x i64> %0) {
 ; CHECK-NEXT:    uaddlp v0.1d, v0.2s
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount1x64:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.8b, v0.8b
+; BE-NEXT:    cnt v0.8b, v0.8b
+; BE-NEXT:    uaddlp v0.4h, v0.8b
+; BE-NEXT:    uaddlp v0.2s, v0.4h
+; BE-NEXT:    uaddlp v0.1d, v0.2s
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount1x64:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    cnt v0.8b, v0.8b
@@ -382,6 +448,17 @@ define <4 x i32> @popcount4x32(<4 x i32> %0) {
 ; SVE-NEXT:    uaddlp v0.4s, v0.8h
 ; SVE-NEXT:    ret
 ;
+; BE-LABEL: popcount4x32:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    uaddlp v0.8h, v0.16b
+; BE-NEXT:    uaddlp v0.4s, v0.8h
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+;
 ; GISELO0-LABEL: popcount4x32:
 ; GISELO0:       // %bb.0: // %Entry
 ; GISELO0-NEXT:    cnt v0.16b, v0.16b
@@ -449,6 +526,15 @@ define <2 x i32> @popcount2x32(<2 x i32> %0) {
 ; SVE-NEXT:    uaddlp v0.2s, v0.4h
 ; SVE-NEXT:    ret
 ;
+; BE-LABEL: popcount2x32:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.8b, v0.8b
+; BE-NEXT:    cnt v0.8b, v0.8b
+; BE-NEXT:    uaddlp v0.4h, v0.8b
+; BE-NEXT:    uaddlp v0.2s, v0.4h
+; BE-NEXT:    rev64 v0.2s, v0.2s
+; BE-NEXT:    ret
+;
 ; GISELO0-LABEL: popcount2x32:
 ; GISELO0:       // %bb.0: // %Entry
 ; GISELO0-NEXT:    cnt v0.8b, v0.8b
@@ -498,6 +584,16 @@ define <8 x i16> @popcount8x16(<8 x i16> %0) {
 ; CHECK-NEXT:    uaddlp v0.8h, v0.16b
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount8x16:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.16b, v0.16b
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    cnt v0.16b, v0.16b
+; BE-NEXT:    uaddlp v0.8h, v0.16b
+; BE-NEXT:    rev64 v0.8h, v0.8h
+; BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount8x16:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    cnt v0.16b, v0.16b
@@ -529,6 +625,14 @@ define <4 x i16> @popcount4x16(<4 x i16> %0) {
 ; CHECK-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-NEXT:    ret
 ;
+; BE-LABEL: popcount4x16:
+; BE:       // %bb.0: // %Entry
+; BE-NEXT:    rev64 v0.8b, v0.8b
+; BE-NEXT:    cnt v0.8b, v0.8b
+; BE-NEXT:    uaddlp v0.4h, v0.8b
+; BE-NEXT:    rev64 v0.4h, v0.4h
+; BE-NEXT:    ret
+;
 ; GISEL-LABEL: popcount4x16:
 ; GISEL:       // %bb.0: // %Entry
 ; GISEL-NEXT:    cnt v0.8b, v0.8b

>From 32ce5b043c2b6e628e85d89df5461238632d9211 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 5 Mar 2025 20:08:34 +0000
Subject: [PATCH 2/2] [AArch64] Fix BE popcount casts. (#129879)

A bitcast, being defined as a load and a store, can change the lane
order. We need to use a NVCAST instead to keep the lanes out of the
VADDV the same in big-endian. The extracting from a v2i64 vector is
to keep the types of the nvcast legal, but also allow us to replace a
lane mov with a mov 0.

Fixes #129843

(cherry picked from commit ab811e75734a77247dae6df1579fa6f29394f200)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 10 ++++++++--
 llvm/test/CodeGen/AArch64/arm64-popcnt.ll     |  8 +++-----
 llvm/test/CodeGen/AArch64/parity.ll           |  2 +-
 llvm/test/CodeGen/AArch64/popcount.ll         | 19 +++++++++----------
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a2824d6a8f5af..16fcd589cecd1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10785,7 +10785,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
     if (VT == MVT::i32)
       AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
                          DAG.getConstant(0, DL, MVT::i64));
-    AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+    else
+      AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                         DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
+                         DAG.getConstant(0, DL, MVT::i64));
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
@@ -10794,7 +10797,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
 
     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
     SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
-    AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+    AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+                       DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
+                       DAG.getConstant(0, DL, MVT::i64));
+    AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 369667ec33f66..d06e42f5405ef 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -129,7 +129,6 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-BE-NEXT:    addv b0, v0.8b
-; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    fmov x0, d0
 ; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
@@ -436,9 +435,9 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov.d v0[1], x1
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    cnt.16b v0, v0
 ; CHECK-NEXT:    addv.16b b0, v0
-; CHECK-NEXT:    mov.d x1, v0[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
@@ -481,13 +480,12 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; CHECK-BE-LABEL: cnt128:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    mov x0, xzr
 ; CHECK-BE-NEXT:    mov v0.d[1], x1
 ; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
 ; CHECK-BE-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-BE-NEXT:    addv b0, v0.16b
-; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-BE-NEXT:    mov x1, v0.d[1]
-; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    fmov x1, d0
 ; CHECK-BE-NEXT:    ret
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index 1e51793fb5f91..91515277cb3f6 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -159,7 +159,7 @@ define i32 @parity_64_trunc(i64 %x) {
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 6cc925f0ae91f..e664e73594923 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -41,8 +41,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; BE-NEXT:    rev64 v0.16b, v0.16b
 ; BE-NEXT:    cnt v0.16b, v0.16b
 ; BE-NEXT:    addv b0, v0.16b
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    mov w0, v0.s[3]
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    mov w0, v0.s[1]
 ; BE-NEXT:    ret
 ;
 ; GISEL-LABEL: popcount128:
@@ -138,10 +138,10 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; BE-NEXT:    cnt v1.16b, v1.16b
 ; BE-NEXT:    addv b0, v0.16b
 ; BE-NEXT:    addv b1, v1.16b
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    rev32 v1.16b, v1.16b
-; BE-NEXT:    mov w8, v0.s[3]
-; BE-NEXT:    mov w9, v1.s[3]
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    rev64 v1.4s, v1.4s
+; BE-NEXT:    mov w8, v0.s[1]
+; BE-NEXT:    mov w9, v1.s[1]
 ; BE-NEXT:    add w0, w9, w8
 ; BE-NEXT:    ret
 ;
@@ -227,22 +227,21 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK:       // %bb.0: // %Entry
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov v0.d[1], x1
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    mov x1, v0.d[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: popcount1x128:
 ; BE:       // %bb.0: // %Entry
 ; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    mov x0, xzr
 ; BE-NEXT:    mov v0.d[1], x1
 ; BE-NEXT:    rev64 v0.16b, v0.16b
 ; BE-NEXT:    cnt v0.16b, v0.16b
 ; BE-NEXT:    addv b0, v0.16b
-; BE-NEXT:    rev64 v0.16b, v0.16b
-; BE-NEXT:    mov x1, v0.d[1]
-; BE-NEXT:    fmov x0, d0
+; BE-NEXT:    fmov x1, d0
 ; BE-NEXT:    ret
 ;
 ; GISEL-LABEL: popcount1x128:



More information about the llvm-branch-commits mailing list