[PATCH] D64275: [InstCombine] Generalize InstCombiner::foldAndOrOfICmpsOfAndWithPow2().

Roman Lebedev via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 12 23:07:24 PDT 2019


lebedev.ri added inline comments.


================
Comment at: llvm/test/Transforms/InstCombine/onehot_merge.ll:340
 
 ; Should not fold
 define i1 @foo1_and_extra_use_cmp2(i32 %k, i32 %c1, i32 %c2, i1* %p) {
----------------
huihuiz wrote:
> lebedev.ri wrote:
> > huihuiz wrote:
> > > lebedev.ri wrote:
> > > > Could fold; no instruction count increase.
> > > We should not fold when the 'cmp' has more than one use, because that can increase the instruction count when one side is using the new form.
> > > 
> > > Take this example: we need to keep the extra 'cmp', which can't be reused, so the benefit of folding is offset by the added signbit shift instruction.
> > > 
> > > ```
> > > define i1 @foo(i32 %k, i32 %c1, i32 %c2, i1* %p) {
> > >   %t0 = shl i32 1, %c1
> > >   %t1 = and i32 %t0, %k
> > >   %t2 = icmp eq i32 %t1, 0
> > >   store i1 %t2, i1* %p  ; extra use of cmp
> > >   %t3 = shl i32 %k, %c2
> > >   %t4 = icmp sgt i32 %t3, -1
> > >   %or = or i1 %t2, %t4
> > >   ret i1 %or
> > > }
> > > 
> > > ```
> > I believe I'm missing the point in `@foo1_and_extra_use_cmp2`.
> > We were doing this fold previously (in LLVM trunk); now we don't, because of the use count.
> > But this restriction does not seem to improve anything?
> > We had 8 instructions (with the fold), and we still have 8 instructions without the fold?
> > 
> This restriction doesn't improve anything here, but it makes sure we don't increase the instruction count in certain cases.
> 
> If the 'cmp' has an extra use and one side is using the new form, e.g. @foo1_and_signbit_lshr_without_shifting_signbit_extra_use_cmp1/2 (please expand the test file to see them), then without this restriction the instruction count actually increases.
> 
> If the 'cmp' has an extra use and both sides are using the old form, e.g. @foo1_and_extra_use_cmp, we have 8 instructions both with and without the fold.
> 
> Here is the diff without this restriction: 
> ```
> --- a/llvm/test/Transforms/InstCombine/onehot_merge.ll
> +++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll
> @@ -274,10 +274,10 @@ define i1 @foo1_and_extra_use_cmp(i32 %k, i32 %c1, i32 %c2, i1* %p) {
>  ; CHECK-NEXT:    [[T2:%.*]] = and i32 [[T0]], [[K:%.*]]
>  ; CHECK-NEXT:    [[T3:%.*]] = icmp eq i32 [[T2]], 0
>  ; CHECK-NEXT:    store i1 [[T3]], i1* [[P:%.*]], align 1
> -; CHECK-NEXT:    [[T4:%.*]] = and i32 [[T1]], [[K]]
> -; CHECK-NEXT:    [[T5:%.*]] = icmp eq i32 [[T4]], 0
> -; CHECK-NEXT:    [[OR:%.*]] = or i1 [[T3]], [[T5]]
> -; CHECK-NEXT:    ret i1 [[OR]]
> +; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[T0]], [[T1]]
> +; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], [[K]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]]
> +; CHECK-NEXT:    ret i1 [[TMP3]]
>  ;
>    %t0 = shl i32 1, %c1
>    %t1 = shl i32 1, %c2
> @@ -340,13 +340,13 @@ define i1 @foo1_and_extra_use_cmp2(i32 %k, i32 %c1, i32 %c2, i1* %p) {
>  ; CHECK-LABEL: @foo1_and_extra_use_cmp2(
>  ; CHECK-NEXT:    [[T0:%.*]] = shl i32 1, [[C1:%.*]]
>  ; CHECK-NEXT:    [[T1:%.*]] = shl i32 1, [[C2:%.*]]
> -; CHECK-NEXT:    [[T2:%.*]] = and i32 [[T0]], [[K:%.*]]
> -; CHECK-NEXT:    [[T3:%.*]] = icmp eq i32 [[T2]], 0
> -; CHECK-NEXT:    [[T4:%.*]] = and i32 [[T1]], [[K]]
> +; CHECK-NEXT:    [[T4:%.*]] = and i32 [[T1]], [[K:%.*]]
>  ; CHECK-NEXT:    [[T5:%.*]] = icmp eq i32 [[T4]], 0
>  ; CHECK-NEXT:    store i1 [[T5]], i1* [[P:%.*]], align 1
> -; CHECK-NEXT:    [[OR:%.*]] = or i1 [[T3]], [[T5]]
> -; CHECK-NEXT:    ret i1 [[OR]]
> +; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[T0]], [[T1]]
> +; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], [[K]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], [[TMP1]]
> +; CHECK-NEXT:    ret i1 [[TMP3]]
>  ;
>    %t0 = shl i32 1, %c1
>    %t1 = shl i32 1, %c2
> @@ -410,10 +410,11 @@ define i1 @foo1_and_signbit_lshr_without_shifting_signbit_extra_use_cmp1(i32 %k,
>  ; CHECK-NEXT:    [[T1:%.*]] = and i32 [[T0]], [[K:%.*]]
>  ; CHECK-NEXT:    [[T2:%.*]] = icmp eq i32 [[T1]], 0
>  ; CHECK-NEXT:    store i1 [[T2]], i1* [[P:%.*]], align 1
> -; CHECK-NEXT:    [[T3:%.*]] = shl i32 [[K]], [[C2:%.*]]
> -; CHECK-NEXT:    [[T4:%.*]] = icmp sgt i32 [[T3]], -1
> -; CHECK-NEXT:    [[OR:%.*]] = or i1 [[T2]], [[T4]]
> -; CHECK-NEXT:    ret i1 [[OR]]
> +; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 -2147483648, [[C2:%.*]]
> +; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], [[K]]
> +; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]]
> +; CHECK-NEXT:    ret i1 [[TMP4]]
>  ;
>    %t0 = shl i32 1, %c1
>    %t1 = and i32 %t0, %k
> 
>  define i1 @foo1_and_signbit_lshr_without_shifting_signbit_extra_use_cmp2(i32 %k, i32 %c1, i32 %c2, i1* %p) {
>  ; CHECK-LABEL: @foo1_and_signbit_lshr_without_shifting_signbit_extra_use_cmp2(
>  ; CHECK-NEXT:    [[T0:%.*]] = shl i32 1, [[C1:%.*]]
> -; CHECK-NEXT:    [[T1:%.*]] = and i32 [[T0]], [[K:%.*]]
> -; CHECK-NEXT:    [[T2:%.*]] = icmp eq i32 [[T1]], 0
> -; CHECK-NEXT:    [[T3:%.*]] = shl i32 [[K]], [[C2:%.*]]
> +; CHECK-NEXT:    [[T3:%.*]] = shl i32 [[K:%.*]], [[C2:%.*]]
>  ; CHECK-NEXT:    [[T4:%.*]] = icmp sgt i32 [[T3]], -1
>  ; CHECK-NEXT:    store i1 [[T4]], i1* [[P:%.*]], align 1
> -; CHECK-NEXT:    [[OR:%.*]] = or i1 [[T2]], [[T4]]
> -; CHECK-NEXT:    ret i1 [[OR]]
> +; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 -2147483648, [[C2]]
> +; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[T0]], [[TMP1]]
> +; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], [[K]]
> +; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], [[TMP2]]
> +; CHECK-NEXT:    ret i1 [[TMP4]]
>  ;
>    %t0 = shl i32 1, %c1
>    %t1 = and i32 %t0, %k
> 
> ```
> 
> 
What I'm saying is: yes, sure, that one-use check is necessary for some of the patterns, but for this one
it causes regressions for no benefit. Can that check be tightened up so that it doesn't have *this* false positive?
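
To make the tradeoff concrete, here is a rough sketch of what a tightened bailout could look like: instead of a blanket one-use check on either icmp, estimate the net instruction count of the merged form. This is not code from this patch; the helper names and the cost constants are my own assumptions, and it assumes (like the tests above) that the shl/and feeding each icmp has no other uses.

```
// Illustrative only: estimate whether merging the two compares into the
//   (K & (P1 | P2)) != (P1 | P2)
// form can increase the instruction count, and bail out only in that case.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include <initializer_list>

using namespace llvm;
using namespace llvm::PatternMatch;

// The "shifted signbit" form that needs a fresh `lshr signbit, C` when
// folded:  icmp sgt (shl X, C), -1
static bool isSignBitShiftForm(ICmpInst *Cmp) {
  ICmpInst::Predicate Pred;
  return match(Cmp, m_ICmp(Pred, m_Shl(m_Value(), m_Value()), m_AllOnes())) &&
         Pred == ICmpInst::ICMP_SGT;
}

// Assumes the shl/and feeding each icmp has no uses outside that icmp.
static bool foldWouldNotGrowCode(ICmpInst *LHS, ICmpInst *RHS) {
  unsigned Created = 3; // the merged or + and + icmp
  unsigned Removed = 1; // the original i1 or/and of the two icmps
  for (ICmpInst *Cmp : {LHS, RHS}) {
    if (isSignBitShiftForm(Cmp))
      Created += 1; // a new `lshr signbit, C` must be materialized
    if (Cmp->hasOneUse())
      Removed += 2; // the icmp plus one feeding instruction become dead
  }
  return Created <= Removed;
}
```

With that accounting, @foo1_and_extra_use_cmp2 (both sides in the old form, one compare with an extra use) comes out neutral and would still fold, while the @foo1_and_signbit_lshr_without_shifting_signbit_extra_use_cmp1/2 cases come out as a net +1 and would still be rejected.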


Repository:
  rL LLVM

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D64275/new/

https://reviews.llvm.org/D64275




