[llvm] 79b5829 - [TargetLowering][AArch64] Teach DemandedBits about SVE count intrinsics

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 25 02:18:09 PST 2022


Author: Benjamin Maxwell
Date: 2022-11-25T10:15:14Z
New Revision: 79b5829a155f256001ead535c358a1193cd0cbe8

URL: https://github.com/llvm/llvm-project/commit/79b5829a155f256001ead535c358a1193cd0cbe8
DIFF: https://github.com/llvm/llvm-project/commit/79b5829a155f256001ead535c358a1193cd0cbe8.diff

LOG: [TargetLowering][AArch64] Teach DemandedBits about SVE count intrinsics

This allows DemandedBits to see that the SVE count intrinsics (CNTB,
CNTH, CNTW, CNTD) sans multiplier will only ever produce small
positive integers. The maximum value you could get here is 256, which
is CNTB on a machine with a 2048-bit vector size (the maximum for SVE).

Using this, various redundant operations (zexts, sexts, ands, ors, etc.)
can be eliminated.

Differential Revision: https://reviews.llvm.org/D138424

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4f514f79ea34b..466a2edbd3efd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2677,7 +2677,9 @@ bool TargetLowering::SimplifyDemandedBits(
     [[fallthrough]];
   }
   default:
-    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+    // We also ask the target about intrinsics (which could be specific to it).
+    if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+        Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
       if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, DemandedElts,
                                             Known, TLO, Depth))
         return true;

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c29df3563ee07..72a9117d1f3cf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -92,6 +92,7 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <optional>
 #include <tuple>
 #include <utility>
 #include <vector>
@@ -15006,17 +15007,20 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
   return CSNeg;
 }
 
-static bool IsSVECntIntrinsic(SDValue S) {
+static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
   switch(getIntrinsicID(S.getNode())) {
   default:
     break;
   case Intrinsic::aarch64_sve_cntb:
+    return 8;
   case Intrinsic::aarch64_sve_cnth:
+    return 16;
   case Intrinsic::aarch64_sve_cntw:
+    return 32;
   case Intrinsic::aarch64_sve_cntd:
-    return true;
+    return 64;
   }
-  return false;
+  return {};
 }
 
 /// Calculates what the pre-extend type is, based on the extension
@@ -23296,6 +23300,24 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
     // used - simplify to just Val.
     return TLO.CombineTo(Op, ShiftR->getOperand(0));
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
+      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
+      if (!MaxSVEVectorSizeInBits)
+        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
+      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
+      // The SVE count intrinsics don't support the multiplier immediate so we
+      // don't have to account for that here. The value returned may be slightly
+      // over the true required bits, as this is based on the "ALL" pattern. The
+      // other patterns are also exposed by these intrinsics, but they all
+      // return a value that's strictly less than "ALL".
+      unsigned RequiredBits = Log2_32(MaxElements) + 1;
+      unsigned BitWidth = Known.Zero.getBitWidth();
+      if (RequiredBits < BitWidth)
+        Known.Zero.setHighBits(BitWidth - RequiredBits);
+      return false;
+    }
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(

diff  --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index 8485d9975c95d..895f5da9a1e13 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -29,9 +29,8 @@ define i64 @cntb_and_elimination() {
 ; CHECK-LABEL: cntb_and_elimination:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntb x8
-; CHECK-NEXT:    and x9, x8, #0x1ff
-; CHECK-NEXT:    and x8, x8, #0x3fffffffc
-; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    and x9, x8, #0x1fc
+; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
   %cntb = call i64 @llvm.aarch64.sve.cntb(i32 31)
   %and_redundant = and i64 %cntb, 511
@@ -44,9 +43,8 @@ define i64 @cnth_and_elimination() {
 ; CHECK-LABEL: cnth_and_elimination:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    and x9, x8, #0x3ff
-; CHECK-NEXT:    and x8, x8, #0x3fffffffc
-; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    and x9, x8, #0xfc
+; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
   %cnth = call i64 @llvm.aarch64.sve.cnth(i32 31)
   %and_redundant = and i64 %cnth, 1023
@@ -59,9 +57,8 @@ define i64 @cntw_and_elimination() {
 ; CHECK-LABEL: cntw_and_elimination:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    and x9, x8, #0x7f
-; CHECK-NEXT:    and x8, x8, #0x3fffffffc
-; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    and x9, x8, #0x7c
+; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
   %cntw = call i64 @llvm.aarch64.sve.cntw(i32 31)
   %and_redundant = and i64 %cntw, 127
@@ -74,9 +71,8 @@ define i64 @cntd_and_elimination() {
 ; CHECK-LABEL: cntd_and_elimination:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    and x9, x8, #0x3f
-; CHECK-NEXT:    and x8, x8, #0x3fffffffc
-; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    and x9, x8, #0x3c
+; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
   %cntd = call i64 @llvm.aarch64.sve.cntd(i32 31)
   %and_redundant = and i64 %cntd, 63
@@ -112,8 +108,7 @@ define i64 @vscale_trunc_sext() vscale_range(1,16) {
 define i64 @count_bytes_trunc_zext() {
 ; CHECK-LABEL: count_bytes_trunc_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cntb x8
-; CHECK-NEXT:    and x0, x8, #0xffffffff
+; CHECK-NEXT:    cntb x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cntb(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -124,8 +119,7 @@ define i64 @count_bytes_trunc_zext() {
 define i64 @count_halfs_trunc_zext() {
 ; CHECK-LABEL: count_halfs_trunc_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    and x0, x8, #0xffffffff
+; CHECK-NEXT:    cnth x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cnth(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -136,8 +130,7 @@ define i64 @count_halfs_trunc_zext() {
 define i64 @count_words_trunc_zext() {
 ; CHECK-LABEL: count_words_trunc_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    and x0, x8, #0xffffffff
+; CHECK-NEXT:    cntw x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cntw(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -148,8 +141,7 @@ define i64 @count_words_trunc_zext() {
 define i64 @count_doubles_trunc_zext() {
 ; CHECK-LABEL: count_doubles_trunc_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    and x0, x8, #0xffffffff
+; CHECK-NEXT:    cntd x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cntd(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -160,8 +152,7 @@ define i64 @count_doubles_trunc_zext() {
 define i64 @count_bytes_trunc_sext() {
 ; CHECK-LABEL: count_bytes_trunc_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cntb x8
-; CHECK-NEXT:    sxtw x0, w8
+; CHECK-NEXT:    cntb x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cntb(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -172,8 +163,7 @@ define i64 @count_bytes_trunc_sext() {
 define i64 @count_halfs_trunc_sext() {
 ; CHECK-LABEL: count_halfs_trunc_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    sxtw x0, w8
+; CHECK-NEXT:    cnth x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cnth(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -184,8 +174,7 @@ define i64 @count_halfs_trunc_sext() {
 define i64 @count_words_trunc_sext() {
 ; CHECK-LABEL: count_words_trunc_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    sxtw x0, w8
+; CHECK-NEXT:    cntw x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cntw(i32 31)
   %trunc = trunc i64 %cnt to i32
@@ -196,8 +185,7 @@ define i64 @count_words_trunc_sext() {
 define i64 @count_doubles_trunc_sext() {
 ; CHECK-LABEL: count_doubles_trunc_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    sxtw x0, w8
+; CHECK-NEXT:    cntd x0
 ; CHECK-NEXT:    ret
   %cnt = call i64 @llvm.aarch64.sve.cntd(i32 31)
   %trunc = trunc i64 %cnt to i32


        


More information about the llvm-commits mailing list