[llvm] 892c731 - [Support] improve known bits analysis for leading zeros of multiply

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 20 06:12:27 PST 2021


Author: Sanjay Patel
Date: 2021-12-20T09:10:50-05:00
New Revision: 892c731681df962064828334531f59d29eb8473a

URL: https://github.com/llvm/llvm-project/commit/892c731681df962064828334531f59d29eb8473a
DIFF: https://github.com/llvm/llvm-project/commit/892c731681df962064828334531f59d29eb8473a.diff

LOG: [Support] improve known bits analysis for leading zeros of multiply

Instead of summing leading zeros on the input operands, multiply the
max possible values of those inputs and count the leading zeros of
the result. This can give us an extra zero bit (typically in cases
where one of the operands is a known constant).

This allows folding away the remaining 'add' ops in the motivating
bug (modeled in the PhaseOrdering IR test):
https://github.com/llvm/llvm-project/issues/48399

Fixes #48399

Differential Revision: https://reviews.llvm.org/D115969

Added: 
    

Modified: 
    llvm/lib/Support/KnownBits.cpp
    llvm/test/CodeGen/X86/mul128.ll
    llvm/test/Transforms/InstCombine/icmp-mul.ll
    llvm/test/Transforms/InstCombine/narrow-switch.ll
    llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index fdc8fdb6b0fda..8e154067abc0a 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -420,18 +420,19 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
   assert((!SelfMultiply || (LHS.One == RHS.One && LHS.Zero == RHS.Zero)) &&
          "Self multiplication knownbits mismatch");
 
-  // Compute a conservative estimate for high known-0 bits.
+  // Compute the high known-0 bits by multiplying the unsigned max of each side.
+  // Conservatively, M active bits * N active bits results in M + N bits in the
+  // result. But if we know a value is a power-of-2 for example, then this
+  // computes one more leading zero.
   // TODO: This could be generalized to number of sign bits (negative numbers).
-  unsigned LHSLeadZ = LHS.countMinLeadingZeros();
-  unsigned RHSLeadZ = RHS.countMinLeadingZeros();
-
-  // If either operand is a power-of-2, the multiply is only shifting bits in
-  // the other operand (there can't be a carry into the M+N bit of the result).
-  // Note: if we know that a value is entirely 0, that should simplify below.
-  bool BonusLZ = LHS.countMaxPopulation() == 1 || RHS.countMaxPopulation() == 1;
-
-  unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth;
-  assert(LeadZ <= BitWidth && "More zeros than bits?");
+  APInt UMaxLHS = LHS.getMaxValue();
+  APInt UMaxRHS = RHS.getMaxValue();
+
+  // For leading zeros in the result to be valid, the unsigned max product must
+  // fit in the bitwidth (it must not overflow).
+  bool HasOverflow;
+  APInt UMaxResult = UMaxLHS.umul_ov(UMaxRHS, HasOverflow);
+  unsigned LeadZ = HasOverflow ? 0 : UMaxResult.countLeadingZeros();
 
   // The result of the bottom bits of an integer multiply can be
   // inferred by looking at the bottom bits of both operands and

diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index 492438edbce37..dc226ff8f5698 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -107,15 +107,12 @@ define i128 @foo(i128 %t, i128 %u) {
 define void @PR13897() nounwind {
 ; X64-LABEL: PR13897:
 ; X64:       # %bb.0: # %"0x0"
-; X64-NEXT:    movl bbb(%rip), %ecx
-; X64-NEXT:    movabsq $4294967297, %rdx # imm = 0x100000001
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdx
-; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    movl bbb(%rip), %eax
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq %rax, aaa(%rip)
-; X64-NEXT:    movq %rdx, aaa+8(%rip)
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movq %rcx, aaa+8(%rip)
+; X64-NEXT:    movq %rcx, aaa(%rip)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: PR13897:

diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll
index 15d36ca8ff763..f351203f7820a 100644
--- a/llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -858,12 +858,11 @@ define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) {
   ret i1 %r
 }
 
+; The top 32-bits must be zero.
+
 define i1 @splat_mul_known_lz(i32 %x) {
 ; CHECK-LABEL: @splat_mul_known_lz(
-; CHECK-NEXT:    [[Z:%.*]] = zext i32 [[X:%.*]] to i128
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i128 [[Z]], 18446744078004518913
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i128 [[M]], 79228162514264337593543950336
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %z = zext i32 %x to i128
   %m = mul i128 %z, 18446744078004518913 ; 0x00000000_00000001_00000001_00000001
@@ -872,6 +871,8 @@ define i1 @splat_mul_known_lz(i32 %x) {
   ret i1 %r
 }
 
+; Negative test - the 33rd bit could be set.
+
 define i1 @splat_mul_unknown_lz(i32 %x) {
 ; CHECK-LABEL: @splat_mul_unknown_lz(
 ; CHECK-NEXT:    [[Z:%.*]] = zext i32 [[X:%.*]] to i128

diff --git a/llvm/test/Transforms/InstCombine/narrow-switch.ll b/llvm/test/Transforms/InstCombine/narrow-switch.ll
index a1bb442f1b8f2..d0b05e2742ff1 100644
--- a/llvm/test/Transforms/InstCombine/narrow-switch.ll
+++ b/llvm/test/Transforms/InstCombine/narrow-switch.ll
@@ -99,14 +99,14 @@ return:
 ; Make sure to avoid assertion crashes and use the type before
 ; truncation to generate the sub constant expressions that leads
 ; to the recomputed condition.
-; We allow to truncate from i64 to i59 if in 32-bit mode,
+; We allow truncate from i64 to i58 if in 32-bit mode,
 ; because both are illegal.
 
-define void @trunc64to59(i64 %a) {
-; ALL-LABEL: @trunc64to59(
-; CHECK32:         switch i59
-; CHECK32-NEXT:    i59 0, label %sw.bb1
-; CHECK32-NEXT:    i59 18717182647723699, label %sw.bb2
+define void @trunc64to58(i64 %a) {
+; ALL-LABEL: @trunc64to58(
+; CHECK32:         switch i58
+; CHECK32-NEXT:    i58 0, label %sw.bb1
+; CHECK32-NEXT:    i58 18717182647723699, label %sw.bb2
 ; CHECK32-NEXT:    ]
 ; CHECK64:         switch i64
 ; CHECK64-NEXT:    i64 0, label %sw.bb1

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
index c5953b621d055..7d7b5376ba7b3 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
@@ -40,21 +40,19 @@ define void @loop_or(i8* noalias %pIn, i32* noalias %pOut, i32 %s) {
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], <i32 65792, i32 65792, i32 65792, i32 65792>
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], <i32 65792, i32 65792, i32 65792, i32 65792>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP4]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
-; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP5]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[POUT:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], <i32 65793, i32 65793, i32 65793, i32 65793>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], <i32 65793, i32 65793, i32 65793, i32 65793>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP6]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP7]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[POUT:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i64 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER5]]
@@ -64,11 +62,10 @@ define void @loop_or(i8* noalias %pIn, i32* noalias %pOut, i32 %s) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER5]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PIN]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP17]] to i32
-; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul nuw nsw i32 [[CONV]], 65792
-; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[CONV]], -16777216
-; CHECK-NEXT:    [[OR3:%.*]] = add nsw i32 [[OR2]], [[REASS_MUL]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[OR2:%.*]] = mul nuw nsw i32 [[CONV]], 65793
+; CHECK-NEXT:    [[OR3:%.*]] = or i32 [[OR2]], -16777216
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[POUT]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i32 [[OR3]], i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1


        


More information about the llvm-commits mailing list