[llvm] r374512 - [InstCombine] recognize popcount.

Chen Zheng via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 10 22:13:56 PDT 2019


Author: shchenz
Date: Thu Oct 10 22:13:56 2019
New Revision: 374512

URL: http://llvm.org/viewvc/llvm-project?rev=374512&view=rev
Log:
[InstCombine] recognize popcount.

  This patch recognizes the popcount intrinsic according to the algorithm from
  http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel

Differential Revision: https://reviews.llvm.org/D68189

Added:
    llvm/trunk/test/Transforms/AggressiveInstCombine/popcount.ll
Modified:
    llvm/trunk/include/llvm/IR/PatternMatch.h
    llvm/trunk/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Modified: llvm/trunk/include/llvm/IR/PatternMatch.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/PatternMatch.h?rev=374512&r1=374511&r2=374512&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/PatternMatch.h (original)
+++ llvm/trunk/include/llvm/IR/PatternMatch.h Thu Oct 10 22:13:56 2019
@@ -643,11 +643,11 @@ struct bind_const_intval_ty {
 };
 
 /// Match a specified integer value or vector of all elements of that
-// value.
+/// value.
 struct specific_intval {
-  uint64_t Val;
+  APInt Val;
 
-  specific_intval(uint64_t V) : Val(V) {}
+  specific_intval(APInt V) : Val(std::move(V)) {}
 
   template <typename ITy> bool match(ITy *V) {
     const auto *CI = dyn_cast<ConstantInt>(V);
@@ -655,13 +655,19 @@ struct specific_intval {
       if (const auto *C = dyn_cast<Constant>(V))
         CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue());
 
-    return CI && CI->getValue() == Val;
+    return CI && APInt::isSameValue(CI->getValue(), Val);
   }
 };
 
 /// Match a specific integer value or vector with all elements equal to
 /// the value.
-inline specific_intval m_SpecificInt(uint64_t V) { return specific_intval(V); }
+inline specific_intval m_SpecificInt(APInt V) {
+  return specific_intval(std::move(V));
+}
+
+/// Convenience overload taking a plain integer; wraps it in a 64-bit APInt.
+/// Width differences against the matched constant are reconciled via
+/// APInt::isSameValue in specific_intval::match.
+inline specific_intval m_SpecificInt(uint64_t V) {
+  return m_SpecificInt(APInt(64, V));
+}
 
 /// Match a ConstantInt and bind to its value.  This does not match
 /// ConstantInts wider than 64-bits.

Modified: llvm/trunk/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp?rev=374512&r1=374511&r2=374512&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp (original)
+++ llvm/trunk/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp Thu Oct 10 22:13:56 2019
@@ -250,6 +250,72 @@ static bool foldAnyOrAllBitsSet(Instruct
   return true;
 }
 
+// Try to recognize below function as popcount intrinsic.
+// This is the "best" algorithm from
+// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+// Also used in TargetLowering::expandCTPOP().
+//
+// int popcount(unsigned int i) {
+//   i = i - ((i >> 1) & 0x55555555);
+//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
+//   return (i * 0x01010101) >> 24;
+// }
+static bool tryToRecognizePopCount(Instruction &I) {
+  // The root of the match is the final lshr of "(i * 0x01...01) >> (Len - 8)".
+  if (I.getOpcode() != Instruction::LShr)
+    return false;
+
+  Type *Ty = I.getType();
+  if (!Ty->isIntOrIntVectorTy())
+    return false;
+
+  // Scalar width (or vector element width) in bits.
+  unsigned Len = Ty->getScalarSizeInBits();
+  // FIXME: fix Len == 8 and other irregular type lengths.
+  if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
+    return false;
+
+  // Byte-splatted magic constants (0x55..., 0x33..., 0x0F..., 0x01...)
+  // widened to the full scalar width.
+  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
+  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
+  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
+  // Shift amount that moves the accumulated count (top byte) to the bottom.
+  APInt MaskShift = APInt(Len, Len - 8);
+
+  Value *Op0 = I.getOperand(0);
+  Value *Op1 = I.getOperand(1);
+  Value *MulOp0;
+  // Matching "(i * 0x01010101...) >> 24".
+  if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
+       match(Op1, m_SpecificInt(MaskShift))) {
+    Value *ShiftOp0;
+    // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
+    if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
+                                    m_Deferred(ShiftOp0)),
+                            m_SpecificInt(Mask0F)))) {
+      Value *AndOp0;
+      // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
+      if (match(ShiftOp0,
+                m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
+                        m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
+                              m_SpecificInt(Mask33))))) {
+        Value *Root, *SubOp1;
+        // Matching "i - ((i >> 1) & 0x55555555...)".
+        if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
+            match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
+                                m_SpecificInt(Mask55)))) {
+          LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
+          IRBuilder<> Builder(&I);
+          Function *Func = Intrinsic::getDeclaration(
+              I.getModule(), Intrinsic::ctpop, I.getType());
+          // Only the uses are rewired; I itself is not erased here, so the
+          // now-dead expression tree is left for later cleanup passes.
+          I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -268,6 +334,7 @@ static bool foldUnusualPatterns(Function
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= tryToRecognizePopCount(I);
     }
   }
 

Added: llvm/trunk/test/Transforms/AggressiveInstCombine/popcount.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/AggressiveInstCombine/popcount.ll?rev=374512&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/AggressiveInstCombine/popcount.ll (added)
+++ llvm/trunk/test/Transforms/AggressiveInstCombine/popcount.ll Thu Oct 10 22:13:56 2019
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3 < %s -instcombine -S | FileCheck %s
+
+;int popcount8(unsigned char i) {
+;  i = i - ((i >> 1) & 0x55);
+;  i = (i & 0x33) + ((i >> 2) & 0x33);
+;  i = ((i + (i >> 4)) & 0x0F);
+; return (i * 0x01010101);
+;}
+define signext i32 @popcount8(i8 zeroext %0) {
+; Len == 8 is explicitly rejected by tryToRecognizePopCount (see its FIXME),
+; so the expanded bit-manipulation sequence is expected to survive unchanged.
+; CHECK-LABEL: @popcount8(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], 85
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], 51
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], 51
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i8 [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr i8 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i8 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], 15
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP11]] to i32 
+; CHECK-NEXT:    ret i32 [[TMP12]]
+;
+  %2 = lshr i8 %0, 1
+  %3 = and i8 %2, 85
+  %4 = sub i8 %0, %3
+  %5 = and i8 %4, 51
+  %6 = lshr i8 %4, 2
+  %7 = and i8 %6, 51
+  %8 = add nuw nsw i8 %7, %5
+  %9 = lshr i8 %8, 4
+  %10 = add nuw nsw i8 %9, %8
+  %11 = and i8 %10, 15
+  %12 = zext i8 %11 to i32 
+  ret i32 %12 
+}
+
+;int popcount32(unsigned i) {
+;  i = i - ((i >> 1) & 0x55555555);
+;  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+;  i = ((i + (i >> 4)) & 0x0F0F0F0F);
+; return (i * 0x01010101) >> 24;
+;}
+define signext i32 @popcount32(i32 zeroext %0) {
+; Len == 32 is recognized: the whole sequence collapses to @llvm.ctpop.i32.
+; CHECK-LABEL: @popcount32(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[TMP0:%.*]]), !range !0
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = lshr i32 %0, 1
+  %3 = and i32 %2, 1431655765
+  %4 = sub i32 %0, %3
+  %5 = and i32 %4, 858993459
+  %6 = lshr i32 %4, 2
+  %7 = and i32 %6, 858993459
+  %8 = add nuw nsw i32 %7, %5
+  %9 = lshr i32 %8, 4
+  %10 = add nuw nsw i32 %9, %8
+  %11 = and i32 %10, 252645135
+  %12 = mul i32 %11, 16843009
+  %13 = lshr i32 %12, 24
+  ret i32 %13
+}
+
+;int popcount64(unsigned long long i) {
+;  i = i - ((i >> 1) & 0x5555555555555555);
+;  i = (i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333);
+;  i = ((i + (i >> 4)) & 0x0F0F0F0F0F0F0F0F);
+; return (i * 0x0101010101010101) >> 56;
+;}
+define signext i32 @popcount64(i64 %0) {
+; Len == 64 is recognized; only the ctpop call and the original trunc remain.
+; CHECK-LABEL: @popcount64(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP0:%.*]]), !range !1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %2 = lshr i64 %0, 1
+  %3 = and i64 %2, 6148914691236517205
+  %4 = sub i64 %0, %3
+  %5 = and i64 %4, 3689348814741910323
+  %6 = lshr i64 %4, 2
+  %7 = and i64 %6, 3689348814741910323
+  %8 = add nuw nsw i64 %7, %5
+  %9 = lshr i64 %8, 4
+  %10 = add nuw nsw i64 %9, %8
+  %11 = and i64 %10, 1085102592571150095
+  %12 = mul i64 %11, 72340172838076673
+  %13 = lshr i64 %12, 56
+  %14 = trunc i64 %13 to i32
+  ret i32 %14
+}
+
+;int popcount128(__uint128_t i) {
+;  __uint128_t x = 0x5555555555555555;
+;  x <<= 64;
+;  x |= 0x5555555555555555;
+;  __uint128_t y = 0x3333333333333333;
+;  y <<= 64;
+;  y |= 0x3333333333333333;
+;  __uint128_t z = 0x0f0f0f0f0f0f0f0f;
+;  z <<= 64; 
+;  z |= 0x0f0f0f0f0f0f0f0f;
+;  __uint128_t a = 0x0101010101010101;
+;  a <<= 64;
+;  a |= 0x0101010101010101;
+;  unsigned mask = 120;
+;  i = i - ((i >> 1) & x);
+;  i = (i & y) + ((i >> 2) & y);
+;  i = ((i + (i >> 4)) & z);
+;  return (i * a) >> mask;
+;}
+define signext i32 @popcount128(i128 %0) {
+; Len == 128 is the widest size the recognizer accepts; still collapses.
+; CHECK-LABEL: @popcount128(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i128 @llvm.ctpop.i128(i128 [[TMP0:%.*]]), !range !2
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i128 [[TMP2]] to i32 
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %2 = lshr i128 %0, 1
+  %3 = and i128 %2, 113427455640312821154458202477256070485
+  %4 = sub i128 %0, %3
+  %5 = and i128 %4, 68056473384187692692674921486353642291
+  %6 = lshr i128 %4, 2
+  %7 = and i128 %6, 68056473384187692692674921486353642291
+  %8 = add nuw nsw i128 %7, %5
+  %9 = lshr i128 %8, 4
+  %10 = add nuw nsw i128 %9, %8
+  %11 = and i128 %10, 20016609818878733144904388672456953615
+  %12 = mul i128 %11, 1334440654591915542993625911497130241
+  %13 = lshr i128 %12, 120 
+  %14 = trunc i128 %13 to i32 
+  ret i32 %14 
+}
+
+;vector unsigned char popcount8vec(vector unsigned char i)
+;{
+;  i = i - ((i>> 1) & 0x55);
+;  i = (i & 0x33) + ((i >> 2) & 0x33);
+;  i = ((i + (i >> 4)) & 0x0F);
+;  return (i * 0x01);
+;}
+define <16 x i8> @popcount8vec(<16 x i8> %0) {
+; Element width is 8, which the recognizer rejects, so no ctpop is formed and
+; the expanded sequence is expected to remain.
+; CHECK-LABEL: @popcount8vec(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <16 x i8> [[TMP0:%.*]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <16 x i8> [[TMP2]], <i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <16 x i8> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i8> [[TMP4]], <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <16 x i8> [[TMP4]], <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i8> [[TMP6]], <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <16 x i8> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = lshr <16 x i8> [[TMP8]], <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw <16 x i8> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i8> [[TMP10]], <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP11]]
+;
+  %2 = lshr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = and <16 x i8> %2, <i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85, i8 85>
+  %4 = sub <16 x i8> %0, %3
+  %5 = and <16 x i8> %4, <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+  %6 = lshr <16 x i8> %4, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  %7 = and <16 x i8> %6, <i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51, i8 51>
+  %8 = add nuw nsw <16 x i8> %7, %5
+  %9 = lshr <16 x i8> %8, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+  %10 = add nuw nsw <16 x i8> %9, %8
+  %11 = and <16 x i8> %10, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+  ret <16 x i8> %11
+}
+
+;vector unsigned int popcount32vec(vector unsigned int i)
+;{
+;  i = i - ((i>> 1) & 0x55555555);
+;  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+;  i = ((i + (i >> 4)) & 0x0F0F0F0F);
+;  return (i * 0x01010101) >> 24;
+;}
+define <4 x i32> @popcount32vec(<4 x i32> %0) {
+; Vector with i32 elements is recognized, collapsing to @llvm.ctpop.v4i32.
+; CHECK-LABEL: @popcount32vec(
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP0:%.*]])
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %2 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  %3 = and <4 x i32> %2, <i32 1431655765, i32 1431655765, i32 1431655765, i32 1431655765>
+  %4 = sub <4 x i32> %0, %3
+  %5 = and <4 x i32> %4, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
+  %6 = lshr <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2>
+  %7 = and <4 x i32> %6, <i32 858993459, i32 858993459, i32 858993459, i32 858993459>
+  %8 = add nuw nsw <4 x i32> %7, %5
+  %9 = lshr <4 x i32> %8, <i32 4, i32 4, i32 4, i32 4>
+  %10 = add nuw nsw <4 x i32> %9, %8
+  %11 = and <4 x i32> %10, <i32 252645135, i32 252645135, i32 252645135, i32 252645135>
+  %12 = mul <4 x i32> %11, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
+  %13 = lshr <4 x i32> %12, <i32 24, i32 24, i32 24, i32 24>
+  ret <4 x i32> %13
+}




More information about the llvm-commits mailing list