[llvm] e9179a6 - [Support] improve known bits analysis for multiply by power-of-2 (1 set bit)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 8 08:50:12 PST 2021
Author: Sanjay Patel
Date: 2021-12-08T11:50:05-05:00
New Revision: e9179a6a029a501524cf3f34434c9dc2be4d74cc
URL: https://github.com/llvm/llvm-project/commit/e9179a6a029a501524cf3f34434c9dc2be4d74cc
DIFF: https://github.com/llvm/llvm-project/commit/e9179a6a029a501524cf3f34434c9dc2be4d74cc.diff
LOG: [Support] improve known bits analysis for multiply by power-of-2 (1 set bit)
This can be viewed as recognizing that a multiply by a power-of-2 can't
carry into the top bit of the (M+N)-bit product of an M-bit and an N-bit number.
Enhancing canonicalization of mul -> select might also handle some of
these if we were ok with increasing instruction count with casts in
some cases.
This doesn't help https://llvm.org/PR49055 , but it's a simpler
pattern that we miss.
Note: "-sccp" already gets these examples using a constant
range analysis.
Differential Revision: https://reviews.llvm.org/D114962
Added:
Modified:
llvm/lib/Support/KnownBits.cpp
llvm/test/CodeGen/AMDGPU/sdiv64.ll
llvm/test/CodeGen/AMDGPU/srem64.ll
llvm/test/Transforms/InstCombine/icmp-mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 554e3248524c6..fdc8fdb6b0fda 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -421,9 +421,16 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
"Self multiplication knownbits mismatch");
// Compute a conservative estimate for high known-0 bits.
+ // TODO: This could be generalized to number of sign bits (negative numbers).
unsigned LHSLeadZ = LHS.countMinLeadingZeros();
unsigned RHSLeadZ = RHS.countMinLeadingZeros();
- unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth;
+
+ // If either operand is a power-of-2, the multiply is only shifting bits in
+ // the other operand (there can't be a carry into the M+N bit of the result).
+ // Note: if we know that a value is entirely 0, that should simplify below.
+ bool BonusLZ = LHS.countMaxPopulation() == 1 || RHS.countMaxPopulation() == 1;
+
+ unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth;
assert(LeadZ <= BitWidth && "More zeros than bits?");
// The result of the bottom bits of an integer multiply can be
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 86cd6dee74d3d..dd65453e6fb3a 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1528,7 +1528,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; GCN-NEXT: v_rcp_f32_e32 v3, v3
; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: s_mov_b32 s4, 0x8000
; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GCN-NEXT: v_trunc_f32_e32 v4, v4
@@ -1578,18 +1577,14 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 17, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 15, v4
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v6, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v3, 17, v3
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_lo_u32 v4, v1, v3
; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_mul_lo_u32 v5, v0, v3
; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, s4, v5
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0x8000, v5
; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v1, vcc
; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v0
; GCN-NEXT: v_subbrev_u32_e64 v6, s[4:5], 0, v6, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index db2d99f887d71..5e500f5065d66 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1701,7 +1701,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; GCN-NEXT: v_rcp_f32_e32 v2, v2
; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: s_mov_b32 s4, 0x8000
; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GCN-NEXT: v_trunc_f32_e32 v3, v3
@@ -1751,18 +1750,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 17, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc
; GCN-NEXT: v_mul_lo_u32 v3, v1, v2
; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2
; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0
; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 785f285322d24..27886bc968521 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -684,11 +684,7 @@ define i1 @oss_fuzz_39934(i32 %arg) {
define i1 @mul_of_bool(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_bool(
-; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]]
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%b = and i32 %x, 1
%z = zext i8 %y to i32
@@ -699,11 +695,7 @@ define i1 @mul_of_bool(i32 %x, i8 %y) {
define i1 @mul_of_bool_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_bool_commute(
-; CHECK-NEXT: [[X1:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255
-; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X1]]
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%x1 = and i32 %x, 1
%y8 = and i32 %y, 255
@@ -714,11 +706,7 @@ define i1 @mul_of_bool_commute(i32 %x, i32 %y) {
define i1 @mul_of_bools(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_bools(
-; CHECK-NEXT: [[X1:%.*]] = and i32 [[X:%.*]], 1
-; CHECK-NEXT: [[Y1:%.*]] = and i32 [[Y:%.*]], 1
-; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X1]], [[Y1]]
-; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[M]], 2
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 true
;
%x1 = and i32 %x, 1
%y1 = and i32 %y, 1
@@ -727,6 +715,8 @@ define i1 @mul_of_bools(i32 %x, i32 %y) {
ret i1 %r
}
+; negative test - not a mask of low bit
+
define i1 @not_mul_of_bool(i32 %x, i8 %y) {
; CHECK-LABEL: @not_mul_of_bool(
; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 3
@@ -742,6 +732,8 @@ define i1 @not_mul_of_bool(i32 %x, i8 %y) {
ret i1 %r
}
+; negative test - not a single low bit
+
define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @not_mul_of_bool_commute(
; CHECK-NEXT: [[X30:%.*]] = lshr i32 [[X:%.*]], 30
@@ -757,6 +749,9 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) {
ret i1 %r
}
+; negative test - no leading zeros for 's'
+; TODO: If analysis was generalized for sign bits, we could reduce this to false.
+
define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_bool_no_lz_other_op(
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 1
@@ -772,13 +767,11 @@ define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) {
ret i1 %r
}
+; high and low bits are known 0
+
define i1 @mul_of_pow2(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_pow2(
-; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2
-; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32
-; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]]
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 510
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%b = and i32 %x, 2
%z = zext i8 %y to i32
@@ -787,13 +780,11 @@ define i1 @mul_of_pow2(i32 %x, i8 %y) {
ret i1 %r
}
+; high and low bits are known 0
+
define i1 @mul_of_pow2_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_pow2_commute(
-; CHECK-NEXT: [[X4:%.*]] = and i32 [[X:%.*]], 4
-; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255
-; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X4]]
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 1020
-; CHECK-NEXT: ret i1 [[R]]
+; CHECK-NEXT: ret i1 false
;
%x4 = and i32 %x, 4
%y8 = and i32 %y, 255
@@ -802,13 +793,11 @@ define i1 @mul_of_pow2_commute(i32 %x, i32 %y) {
ret i1 %r
}
+; only bit 7 can be set by the multiply
+
define i32 @mul_of_pow2s(i32 %x, i32 %y) {
; CHECK-LABEL: @mul_of_pow2s(
-; CHECK-NEXT: [[X8:%.*]] = and i32 [[X:%.*]], 8
-; CHECK-NEXT: [[Y16:%.*]] = and i32 [[Y:%.*]], 16
-; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X8]], [[Y16]]
-; CHECK-NEXT: [[BIT7:%.*]] = or i32 [[M]], 128
-; CHECK-NEXT: ret i32 [[BIT7]]
+; CHECK-NEXT: ret i32 128
;
%x8 = and i32 %x, 8
%y16 = and i32 %y, 16
@@ -817,6 +806,8 @@ define i32 @mul_of_pow2s(i32 %x, i32 %y) {
ret i32 %bit7
}
+; negative test - 6 * 255 = 1530 (but constant range analysis can get this)
+
define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
; CHECK-LABEL: @not_mul_of_pow2(
; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 6
@@ -832,6 +823,8 @@ define i1 @not_mul_of_pow2(i32 %x, i8 %y) {
ret i1 %r
}
+; negative test - 12 * 255 = 3060 (but constant range analysis can get this)
+
define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
; CHECK-LABEL: @not_mul_of_pow2_commute(
; CHECK-NEXT: [[X30:%.*]] = and i32 [[X:%.*]], 12
@@ -847,6 +840,9 @@ define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) {
ret i1 %r
}
+; negative test - no leading zeros for 's'
+; TODO: If analysis was generalized for sign bits, we could reduce this to false.
+
define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
; CHECK-LABEL: @mul_of_pow2_no_lz_other_op(
; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2
More information about the llvm-commits mailing list