[PATCH] D156029: [InstCombine] icmp udiv transform

Nikita Popov via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 27 03:59:47 PDT 2023


nikic added inline comments.


================
Comment at: llvm/test/Transforms/InstCombine/icmp-udiv.ll:99
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i128 [[TMP6]], [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and i1 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    ret i1 [[TMP8]]
----------------
kitaisreal wrote:
> goldstein.w.n wrote:
> > kitaisreal wrote:
> > > kitaisreal wrote:
> > > > nikic wrote:
> > > > > goldstein.w.n wrote:
> > > > > > It's not clear this is more canonical or universally better. It's true we occasionally add instructions to avoid div, but this is a lot. Maybe this transform belongs in the backend?
> > > > > Yeah, this does not seem like a good canonicalization from an IR perspective.
> > > > > 
> > > > > I'm also wondering whether it would be preferable to use MULO instead of extending to double-width. Though it seems like codegen ends up being very similar for most targets (https://llvm.godbolt.org/z/73Phv83hT). Apparently muls that set overflow flags without computing a wide multiply aren't really a thing.
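> > > > > For illustration only (made-up names, not what the patch emits), a MULO-based form of the `eq` case could look roughly like this, keeping the original width and using the existing `llvm.umul.with.overflow` intrinsic:
> > > > > ```
> > > > > declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32)
> > > > > 
> > > > > ; x /u y == z  iff  z*y does not overflow, z*y <= x, and x - z*y < y
> > > > > define i1 @udiv_eq_mulo(i32 %x, i32 %y, i32 %z) {
> > > > >   %mulo = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %z, i32 %y)
> > > > >   %mul = extractvalue { i32, i1 } %mulo, 0
> > > > >   %ov = extractvalue { i32, i1 } %mulo, 1
> > > > >   %no.ov = xor i1 %ov, true
> > > > >   %cmp1 = icmp ule i32 %mul, %x
> > > > >   %diff = sub i32 %x, %mul
> > > > >   %cmp2 = icmp ult i32 %diff, %y
> > > > >   %and1 = and i1 %cmp1, %cmp2
> > > > >   %and = and i1 %and1, %no.ov
> > > > >   ret i1 %and
> > > > > }
> > > > > ```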
> > > > It seems that for `less`, `greater`, `less or equal`, and `greater or equal` this optimization is universally better.
> > > > We trade the `udiv` for 3 `zext` and 1 `mul`, and for `greater` and `greater or equal` we also need 1 `add`. The final assembly also looks much better.
> > > > 
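> > > > As a rough sketch (i32 operands, made-up value names, not copied from the patch), the `less` case becomes something like:
> > > > ```
> > > > ; x /u y < z  iff  x < y*z, which cannot overflow in the widened type
> > > > define i1 @udiv_ult(i32 %x, i32 %y, i32 %z) {
> > > >   %x.ext = zext i32 %x to i64
> > > >   %y.ext = zext i32 %y to i64
> > > >   %z.ext = zext i32 %z to i64
> > > >   %mul = mul nuw i64 %y.ext, %z.ext
> > > >   %cmp = icmp ult i64 %x.ext, %mul
> > > >   ret i1 %cmp
> > > > }
> > > > ```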
> > > For `equals` and `not equals` it seems to produce slightly more instructions, but I still think it is worth it if we can avoid the division.
> > > I think it would be okay to move this optimization into the backend, although I am afraid that if we do not keep it in IR, such a construction will not be autovectorized.
> > > For such code:
> > > ```
> > > void test(uint32_t * __restrict x, uint32_t * __restrict y, uint8_t * result, uint32_t z, size_t size) {
> > >     for (size_t i = 0; i < size; ++i) {
> > >         result[i] = x[i] / y[i] == z;
> > >     }
> > > }
> > > ```
> > > The resulting vectorized loop after this patch:
> > > ```
> > > .LBB0_4:                                # %vector.body
> > >                                         # =>This Inner Loop Header: Depth=1
> > > 	movdqu	(%rdi,%r9,4), %xmm4
> > > 	movdqu	(%rsi,%r9,4), %xmm9
> > > 	movdqa	%xmm4, %xmm5
> > > 	punpckldq	%xmm1, %xmm5            # xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
> > > 	punpckhdq	%xmm1, %xmm4            # xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
> > > 	movdqa	%xmm9, %xmm10
> > > 	punpckhdq	%xmm1, %xmm10           # xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
> > > 	punpckldq	%xmm1, %xmm9            # xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
> > > 	movdqa	%xmm0, %xmm6
> > > 	pmuludq	%xmm9, %xmm6
> > > 	movdqa	%xmm0, %xmm8
> > > 	pmuludq	%xmm10, %xmm8
> > > 	pxor	%xmm2, %xmm4
> > > 	movdqa	%xmm8, %xmm11
> > > 	pxor	%xmm2, %xmm11
> > > 	movdqa	%xmm11, %xmm12
> > > 	pcmpgtd	%xmm4, %xmm12
> > > 	pxor	%xmm2, %xmm5
> > > 	movdqa	%xmm6, %xmm13
> > > 	pxor	%xmm2, %xmm13
> > > 	movdqa	%xmm13, %xmm7
> > > 	pcmpgtd	%xmm5, %xmm7
> > > 	movdqa	%xmm7, %xmm14
> > > 	shufps	$136, %xmm12, %xmm14            # xmm14 = xmm14[0,2],xmm12[0,2]
> > > 	pcmpeqd	%xmm4, %xmm11
> > > 	pcmpeqd	%xmm5, %xmm13
> > > 	shufps	$221, %xmm11, %xmm13            # xmm13 = xmm13[1,3],xmm11[1,3]
> > > 	andps	%xmm14, %xmm13
> > > 	shufps	$221, %xmm12, %xmm7             # xmm7 = xmm7[1,3],xmm12[1,3]
> > > 	orps	%xmm13, %xmm7
> > > 	paddq	%xmm9, %xmm6
> > > 	paddq	%xmm10, %xmm8
> > > 	pxor	%xmm2, %xmm8
> > > 	movdqa	%xmm8, %xmm9
> > > 	pcmpgtd	%xmm4, %xmm9
> > > 	pxor	%xmm2, %xmm6
> > > 	movdqa	%xmm6, %xmm10
> > > 	pcmpgtd	%xmm5, %xmm10
> > > 	movdqa	%xmm10, %xmm11
> > > 	shufps	$136, %xmm9, %xmm11             # xmm11 = xmm11[0,2],xmm9[0,2]
> > > 	pcmpeqd	%xmm4, %xmm8
> > > 	pcmpeqd	%xmm5, %xmm6
> > > 	shufps	$221, %xmm8, %xmm6              # xmm6 = xmm6[1,3],xmm8[1,3]
> > > 	andps	%xmm11, %xmm6
> > > 	shufps	$221, %xmm9, %xmm10             # xmm10 = xmm10[1,3],xmm9[1,3]
> > > 	orps	%xmm6, %xmm10
> > > 	andnps	%xmm10, %xmm7
> > > 	andps	%xmm3, %xmm7
> > > 	packuswb	%xmm7, %xmm7
> > > 	packuswb	%xmm7, %xmm7
> > > 	movd	%xmm7, (%rdx,%r9)
> > > 	addq	$4, %r9
> > > 	cmpq	%r9, %rcx
> > > 	jne	.LBB0_4
> > > ```
> > > The resulting loop with clang-16:
> > > ```
> > > .LBB0_7:                                # =>This Inner Loop Header: Depth=1
> > > 	movl	(%rdi,%r10,4), %eax
> > > 	xorl	%edx, %edx
> > > 	divl	(%rsi,%r10,4)
> > > 	cmpl	%ecx, %eax
> > > 	sete	(%r9,%r10)
> > > 	movl	4(%rdi,%r10,4), %eax
> > > 	xorl	%edx, %edx
> > > 	divl	4(%rsi,%r10,4)
> > > 	cmpl	%ecx, %eax
> > > 	sete	1(%r9,%r10)
> > > 	addq	$2, %r10
> > > 	cmpq	%r10, %r11
> > > 	jne	.LBB0_7
> > > ```
> > > 
> > 
> > The concern is not whether this can get better codegen; it's whether it belongs in InstCombine or in something like the DAGCombiner. If the argument for this is "it gets better codegen", it probably belongs in the DAGCombiner. If the argument is "it makes the IR simpler", it probably belongs in InstCombine.
> > 
> > Is there a reason not to do this in DAGCombiner instead?
> I am not very familiar with the CodeGen layer where the DAGCombiner lives, but I am afraid that if I move this optimization there, a construction like
> ```
> for (size_t i = 0; i < size; ++i) {
>     result[i] = x[i] / y[i] == z;
> }
> ```
> will not be autovectorized, because the optimization would then no longer happen at the IR level, before vectorization. Please correct me if I am wrong.
That is correct, but you lose other things in turn. For example, consider what happens if you apply this transform and then the divisor becomes a known constant after inlining. For a divisor like 4, which can be converted into a shift, we'll go from (the optimal)
```
define i1 @src(i8 %x, i8 %z) {
  %div1 = lshr i8 %x, 2
  %cmp = icmp eq i8 %div1, %z
  ret i1 %cmp
}
```
to something like
```
define i1 @tgt(i8 %x, i8 %z) {
  %x.ext = zext i8 %x to i16
  %z.ext = zext i8 %z to i16
  %mul = shl nuw nsw i16 %z.ext, 2
  %cmp1 = icmp ule i16 %mul, %x.ext
  %add = add nuw nsw i16 %mul, 4
  %cmp2 = icmp ugt i16 %add, %x.ext
  %and = and i1 %cmp1, %cmp2
  ret i1 %and
}
```
This pattern can in theory be reduced back to the efficient original form, but we certainly don't do that currently.

While udiv is a bit borderline, doing these kinds of expansion transforms in the middle-end is usually a bad idea.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D156029/new/

https://reviews.llvm.org/D156029


