[llvm] 82a5f1c - [AArch64] use CNT for ISD::popcnt and ISD::parity if available

Ties Stuij via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 2 03:27:56 PST 2022


Author: Ties Stuij
Date: 2022-12-02T11:27:14Z
New Revision: 82a5f1c62b64830069139d5840675588577a8711

URL: https://github.com/llvm/llvm-project/commit/82a5f1c62b64830069139d5840675588577a8711
DIFF: https://github.com/llvm/llvm-project/commit/82a5f1c62b64830069139d5840675588577a8711.diff

LOG: [AArch64] use CNT for ISD::popcnt and ISD::parity if available

These are the two places where we explicitly want to use cnt in
SelectionDAG when feature CSSC is available: ISD::popcnt and ISD::parity.

For both, we need to make sure we're emitting optimized code for i32 (and
lower), i64 and i128. The optimal way is of course to use the GPR CNT
instruction. If we don't have CSSC, but we do have NEON, we'll use the floating
point CNT. If all else fails, we'll fall back on the general GPR popcnt and
parity implementations.

spec:
https://developer.arm.com/documentation/ddi0602/2022-09/Base-Instructions/CNT--Count-bits-

Reviewed By: lenary

Differential Revision: https://reviews.llvm.org/D138808

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/arm64-popcnt.ll
    llvm/test/CodeGen/AArch64/ctpop-nonean.ll
    llvm/test/CodeGen/AArch64/parity.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5162f5e867c37..184ea8e336d84 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,12 +541,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 
-  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+  if (Subtarget->hasCSSC()) {
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+    setOperationAction(ISD::CTPOP, MVT::i128, Expand);
+    setOperationAction(ISD::PARITY, MVT::i128, Expand);
+  } else {
+    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i128, Custom);
 
-  setOperationAction(ISD::PARITY, MVT::i64, Custom);
-  setOperationAction(ISD::PARITY, MVT::i128, Custom);
+    setOperationAction(ISD::PARITY, MVT::i64, Custom);
+    setOperationAction(ISD::PARITY, MVT::i128, Custom);
+  }
 
   setOperationAction(ISD::ABS, MVT::i32, Custom);
   setOperationAction(ISD::ABS, MVT::i64, Custom);
@@ -8413,8 +8420,16 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
     return SDValue();
 
   bool IsParity = Op.getOpcode() == ISD::PARITY;
+  SDValue Val = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
 
-  // While there is no integer popcount instruction, it can
+  // for i32, general parity function using EORs is more efficient compared to
+  // using floating point
+  if (VT == MVT::i32 && IsParity)
+    return SDValue();
+
+  // If there is no CNT instruction available, GPR popcount can
   // be more efficiently lowered to the following sequence that uses
   // AdvSIMD registers/instructions as long as the copies to/from
   // the AdvSIMD registers are cheap.
@@ -8422,10 +8437,6 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
-  SDValue Val = Op.getOperand(0);
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
   if (VT == MVT::i32 || VT == MVT::i64) {
     if (VT == MVT::i32)
       Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 427f5dc152a2d..9c0c37410a3e8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8529,7 +8529,7 @@ defm RCWSWP  : ReadCheckWriteOperation<0b010, "swp">;
 // General Data-Processing Instructions (FEAT_V94_DP)
 //===----------------------------------------------------------------------===//
 defm ABS : OneOperandData<0b001000, "abs">, Requires<[HasCSSC]>;
-defm CNT : OneOperandData<0b000111, "cnt">, Requires<[HasCSSC]>;
+defm CNT : OneOperandData<0b000111, "cnt", ctpop>, Requires<[HasCSSC]>;
 defm CTZ : OneOperandData<0b000110, "ctz">, Requires<[HasCSSC]>;
 
 defm SMAX : ComparisonOp<0, 0, "smax">, Requires<[HasCSSC]>;

diff  --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 872829e950dc7..2a4b30a9078d0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
+; RUN: llc < %s -mtriple=aarch64-eabi -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-LABEL: cnt32_advsimd:
@@ -27,6 +28,11 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-NONEON-NEXT:    mul w8, w9, w8
 ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32_advsimd:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w0, w0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -57,6 +63,13 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
 ; CHECK-NONEON-NEXT:    mul w8, w9, w8
 ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32_advsimd_2:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-CSSC-NEXT:    fmov w8, s0
+; CHECK-CSSC-NEXT:    cnt w0, w8
+; CHECK-CSSC-NEXT:    ret
   %1 = extractelement <2 x i32> %x, i64 0
   %2 = tail call i32 @llvm.ctpop.i32(i32 %1)
   ret i32 %2
@@ -86,6 +99,11 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK-NONEON-NEXT:    mul x8, x9, x8
 ; CHECK-NONEON-NEXT:    lsr x0, x8, #56
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt64_advsimd:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x0, x0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -125,6 +143,11 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
 ; CHECK-NONEON-NEXT:    mul w8, w9, w8
 ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w0, w0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -161,6 +184,11 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
 ; CHECK-NONEON-NEXT:    mul x8, x9, x8
 ; CHECK-NONEON-NEXT:    lsr x0, x8, #56
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt64:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x0, x0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -181,6 +209,13 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
 ; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
 ; CHECK-NONEON-NEXT:    cset w0, ne
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_eq_one:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    cmp x8, #1
+; CHECK-CSSC-NEXT:    cset w0, eq
+; CHECK-CSSC-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp eq i64 %count, 1
   %conv = zext i1 %cmp to i32
@@ -203,6 +238,13 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
 ; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
 ; CHECK-NONEON-NEXT:    cset w0, eq
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_ne_one:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    cmp x8, #1
+; CHECK-CSSC-NEXT:    cset w0, ne
+; CHECK-CSSC-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp ne i64 %count, 1
   %conv = zext i1 %cmp to i32

diff  --git a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
index 5de6a308ad28a..af5652a442ace 100644
--- a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon -mattr=+cssc < %s | FileCheck %s -check-prefix=CHECK-CSSC
 
 declare i128 @llvm.ctpop.i128(i128)
 
@@ -31,6 +32,14 @@ define i128 @ctpop_i128(i128 %i) {
 ; CHECK-NEXT:    lsr x9, x9, #56
 ; CHECK-NEXT:    add x0, x9, x8, lsr #56
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_i128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x1
+; CHECK-CSSC-NEXT:    cnt x9, x0
+; CHECK-CSSC-NEXT:    add x0, x9, x8
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    ret
   %c = call i128 @llvm.ctpop.i128(i128 %i)
   ret i128 %c
 }

diff  --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index 534892d0a24d1..19dd185a6cb78 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mattr=+cssc | FileCheck %s -check-prefix=CHECK-CSSC
 
 define i4 @parity_4(i4 %x) {
 ; CHECK-LABEL: parity_4:
@@ -9,6 +10,13 @@ define i4 @parity_4(i4 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_4:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xf
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
   %2 = and i4 %1, 1
   ret i4 %2
@@ -23,6 +31,13 @@ define i8 @parity_8(i8 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
   %2 = and i8 %1, 1
   ret i8 %2
@@ -38,6 +53,13 @@ define i16 @parity_16(i16 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_16:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xffff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
   %2 = and i16 %1, 1
   ret i16 %2
@@ -54,6 +76,13 @@ define i17 @parity_17(i17 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_17:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0x1ffff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
   %2 = and i17 %1, 1
   ret i17 %2
@@ -69,6 +98,12 @@ define i32 @parity_32(i32 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_32:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w8, w0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
   %2 = and i32 %1, 1
   ret i32 %2
@@ -83,6 +118,12 @@ define i64 @parity_64(i64 %x) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_64:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    and x0, x8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
   %2 = and i64 %1, 1
   ret i64 %2
@@ -99,6 +140,14 @@ define i128 @parity_128(i128 %x) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    eor x8, x0, x1
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    cnt x8, x8
+; CHECK-CSSC-NEXT:    and x0, x8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i128 @llvm.ctpop.i128(i128 %x)
   %2 = and i128 %1, 1
   ret i128 %2
@@ -113,6 +162,12 @@ define i32 @parity_64_trunc(i64 %x) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_64_trunc:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
   %2 = trunc i64 %1 to i32
   %3 = and i32 %2, 1
@@ -129,6 +184,12 @@ define i8 @parity_32_trunc(i32 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_32_trunc:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w8, w0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
   %2 = trunc i32 %1 to i8
   %3 = and i8 %2, 1
@@ -144,6 +205,13 @@ define i32 @parity_8_zext(i8 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8_zext:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %a = zext i8 %x to i32
   %b = tail call i32 @llvm.ctpop.i32(i32 %a)
   %c = and i32 %b, 1
@@ -159,6 +227,13 @@ define i32 @parity_8_mask(i32 %x) {
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8_mask:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %a = and i32 %x, 255
   %b = tail call i32 @llvm.ctpop.i32(i32 %a)
   %c = and i32 %b, 1


        


More information about the llvm-commits mailing list