[llvm] e9179a6 - [Support] improve known bits analysis for multiply by power-of-2 (1 set bit)

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 8 08:50:12 PST 2021


Author: Sanjay Patel
Date: 2021-12-08T11:50:05-05:00
New Revision: e9179a6a029a501524cf3f34434c9dc2be4d74cc

URL: https://github.com/llvm/llvm-project/commit/e9179a6a029a501524cf3f34434c9dc2be4d74cc
DIFF: https://github.com/llvm/llvm-project/commit/e9179a6a029a501524cf3f34434c9dc2be4d74cc.diff

LOG: [Support] improve known bits analysis for multiply by power-of-2 (1 set bit)

This can be viewed as recognizing that a multiply by a power-of-2 cannot
carry into the top bit of the (M+N)-bit product of an M-bit and an N-bit
number.

Enhancing the canonicalization of mul -> select might also handle some of
these, if we were willing to accept an increased instruction count from
the extra casts in some cases.

This doesn't help https://llvm.org/PR49055, but it's a simpler pattern
that we currently miss.
Note: "-sccp" already gets these examples using constant range analysis.
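
To make the claim concrete (a quick standalone check, not part of the
patch): if X fits in M bits and Y is a power-of-2 that fits in N bits,
then X * Y < 2^M * 2^(N-1) = 2^(M+N-1), so the product needs at most
M+N-1 bits. The brute-force C++ sketch below verifies this for a small
bit width; the names and bounds are illustrative only.

// Standalone sanity check: the product of an M-bit value and an N-bit
// power-of-2 never needs more than M+N-1 bits.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned M = 5, N = 5;
  for (uint64_t X = 0; X < (uint64_t(1) << M); ++X) {
    for (unsigned K = 0; K < N; ++K) {
      uint64_t Y = uint64_t(1) << K;   // power-of-2 that fits in N bits
      assert(X * Y < (uint64_t(1) << (M + N - 1)) &&
             "carry into the top bit of the (M+N)-bit product?");
    }
  }
  return 0;
}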

Differential Revision: https://reviews.llvm.org/D114962

Added: 
    

Modified: 
    llvm/lib/Support/KnownBits.cpp
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/Transforms/InstCombine/icmp-mul.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 554e3248524c6..fdc8fdb6b0fda 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -421,9 +421,16 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
          "Self multiplication knownbits mismatch");
 
   // Compute a conservative estimate for high known-0 bits.
+  // TODO: This could be generalized to number of sign bits (negative numbers).
   unsigned LHSLeadZ = LHS.countMinLeadingZeros();
   unsigned RHSLeadZ = RHS.countMinLeadingZeros();
-  unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth;
+
+  // If either operand is a power-of-2, the multiply is only shifting bits in
+  // the other operand (there can't be a carry into the M+N bit of the result).
+  // Note: if we know that a value is entirely 0, that should simplify below.
+  bool BonusLZ = LHS.countMaxPopulation() == 1 || RHS.countMaxPopulation() == 1;
+
+  unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth;
   assert(LeadZ <= BitWidth && "More zeros than bits?");
 
   // The result of the bottom bits of an integer multiply can be

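For reference, here is the new bound applied by hand to the first changed
InstCombine test below (@mul_of_bool). This is a plain C++ re-derivation
using ordinary integers rather than the LLVM KnownBits API; the variable
names mirror the patch but are otherwise illustrative.

// Mirror of: LeadZ = max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth
#include <algorithm>
#include <cstdio>

int main() {
  // @mul_of_bool: %b = and i32 %x, 1  -> >= 31 leading zeros, <= 1 set bit
  //               %z = zext i8 %y     -> >= 24 leading zeros
  unsigned BitWidth = 32, LHSLeadZ = 31, RHSLeadZ = 24;
  bool BonusLZ = true; // LHS has at most one set bit
  unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth;
  // Without the bonus, LeadZ would be 23 (product < 512); with it, LeadZ is
  // 24, so the product is < 256 and 'icmp ugt %m, 255' folds to false.
  printf("known leading zeros of the product: %u\n", LeadZ);
  return 0;
}
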
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 86cd6dee74d3d..dd65453e6fb3a 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1528,7 +1528,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1578,18 +1577,14 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 17, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v6, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 17, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0x8000, v5
 ; GCN-NEXT:    v_subb_u32_e64 v6, s[4:5], v6, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v7, s[4:5], v5, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v6, s[4:5], 0, v6, s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index db2d99f887d71..5e500f5065d66 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1701,7 +1701,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1751,18 +1750,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GCN-NEXT:    v_lshrrev_b32_e32 v4, 17, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0x8000, v2
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]

diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 785f285322d24..27886bc968521 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -684,11 +684,7 @@ define i1 @oss_fuzz_39934(i32 %arg) {
 
 define i1 @mul_of_bool(i32 %x, i8 %y) {
 ; CHECK-LABEL: @mul_of_bool(
-; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 255
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %b = and i32 %x, 1
   %z = zext i8 %y to i32
@@ -699,11 +695,7 @@ define i1 @mul_of_bool(i32 %x, i8 %y) {
 
 define i1 @mul_of_bool_commute(i32 %x, i32 %y) {
 ; CHECK-LABEL: @mul_of_bool_commute(
-; CHECK-NEXT:    [[X1:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[Y8:%.*]] = and i32 [[Y:%.*]], 255
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X1]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 255
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %x1 = and i32 %x, 1
   %y8 = and i32 %y, 255
@@ -714,11 +706,7 @@ define i1 @mul_of_bool_commute(i32 %x, i32 %y) {
 
 define i1 @mul_of_bools(i32 %x, i32 %y) {
 ; CHECK-LABEL: @mul_of_bools(
-; CHECK-NEXT:    [[X1:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[Y1:%.*]] = and i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[X1]], [[Y1]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i32 [[M]], 2
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %x1 = and i32 %x, 1
   %y1 = and i32 %y, 1
@@ -727,6 +715,8 @@ define i1 @mul_of_bools(i32 %x, i32 %y) {
   ret i1 %r
 }
 
+; negative test - not a mask of low bit
+
 define i1 @not_mul_of_bool(i32 %x, i8 %y) {
 ; CHECK-LABEL: @not_mul_of_bool(
 ; CHECK-NEXT:    [[Q:%.*]] = and i32 [[X:%.*]], 3
@@ -742,6 +732,8 @@ define i1 @not_mul_of_bool(i32 %x, i8 %y) {
   ret i1 %r
 }
 
+; negative test - not a single low bit
+
 define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
 ; CHECK-LABEL: @not_mul_of_bool_commute(
 ; CHECK-NEXT:    [[X30:%.*]] = lshr i32 [[X:%.*]], 30
@@ -757,6 +749,9 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
   ret i1 %r
 }
 
+; negative test - no leading zeros for 's'
+; TODO: If analysis was generalized for sign bits, we could reduce this to false.
+
 define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
 ; CHECK-LABEL: @mul_of_bool_no_lz_other_op(
 ; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 1
@@ -772,13 +767,11 @@ define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
   ret i1 %r
 }
 
+; high and low bits are known 0
+
 define i1 @mul_of_pow2(i32 %x, i8 %y) {
 ; CHECK-LABEL: @mul_of_pow2(
-; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 2
-; CHECK-NEXT:    [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 510
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %b = and i32 %x, 2
   %z = zext i8 %y to i32
@@ -787,13 +780,11 @@ define i1 @mul_of_pow2(i32 %x, i8 %y) {
   ret i1 %r
 }
 
+; high and low bits are known 0
+
 define i1 @mul_of_pow2_commute(i32 %x, i32 %y) {
 ; CHECK-LABEL: @mul_of_pow2_commute(
-; CHECK-NEXT:    [[X4:%.*]] = and i32 [[X:%.*]], 4
-; CHECK-NEXT:    [[Y8:%.*]] = and i32 [[Y:%.*]], 255
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X4]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ugt i32 [[M]], 1020
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %x4 = and i32 %x, 4
   %y8 = and i32 %y, 255
@@ -802,13 +793,11 @@ define i1 @mul_of_pow2_commute(i32 %x, i32 %y) {
   ret i1 %r
 }
 
+; only bit 7 can be set by the multiply
+
 define i32 @mul_of_pow2s(i32 %x, i32 %y) {
 ; CHECK-LABEL: @mul_of_pow2s(
-; CHECK-NEXT:    [[X8:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[Y16:%.*]] = and i32 [[Y:%.*]], 16
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i32 [[X8]], [[Y16]]
-; CHECK-NEXT:    [[BIT7:%.*]] = or i32 [[M]], 128
-; CHECK-NEXT:    ret i32 [[BIT7]]
+; CHECK-NEXT:    ret i32 128
 ;
   %x8 = and i32 %x, 8
   %y16 = and i32 %y, 16
@@ -817,6 +806,8 @@ define i32 @mul_of_pow2s(i32 %x, i32 %y) {
   ret i32 %bit7
 }
 
+; negative test - 6 * 255 = 1530 (but constant range analysis can get this)
+
 define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
 ; CHECK-LABEL: @not_mul_of_pow2(
 ; CHECK-NEXT:    [[Q:%.*]] = and i32 [[X:%.*]], 6
@@ -832,6 +823,8 @@ define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
   ret i1 %r
 }
 
+; negative test - 12 * 255 = 3060 (but constant range analysis can get this)
+
 define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
 ; CHECK-LABEL: @not_mul_of_pow2_commute(
 ; CHECK-NEXT:    [[X30:%.*]] = and i32 [[X:%.*]], 12
@@ -847,6 +840,9 @@ define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
   ret i1 %r
 }
 
+; negative test - no leading zeros for 's'
+; TODO: If analysis was generalized for sign bits, we could reduce this to false.
+
 define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
 ; CHECK-LABEL: @mul_of_pow2_no_lz_other_op(
 ; CHECK-NEXT:    [[B:%.*]] = and i32 [[X:%.*]], 2

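A quick sanity check of the @mul_of_pow2s fold above (worked out here, not
taken from the patch): %x8 is 0 or 8 and %y16 is 0 or 16, so the product is
0 or 128. In known-bits terms, the operands contribute at least 3 + 4 = 7
trailing zeros, and with the power-of-2 bonus at least 28 + 27 + 1 - 32 = 24
leading zeros, so only bit 7 of the 32-bit product can possibly be set;
or'ing with 128 therefore always yields 128.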
