[PATCH] D156029: [InstCombine] icmp udiv transform
Noah Goldstein via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 24 16:21:54 PDT 2023
goldstein.w.n added inline comments.
================
Comment at: llvm/test/Transforms/InstCombine/icmp-udiv.ll:99
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i128 [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i1 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: ret i1 [[TMP8]]
----------------
kitaisreal wrote:
> kitaisreal wrote:
> > nikic wrote:
> > > goldstein.w.n wrote:
> > > > It's not clear this is more canonical or universally better. It's true we occasionally add instructions to avoid div, but this is a lot. Maybe this transform belongs in the backend?
> > > Yeah, this does not seem like a good canonicalization from an IR perspective.
> > >
> > > I'm also wondering whether it would be preferable to use MULO instead of extending to double-width. Though it seems like codegen ends up being very similar for most targets (https://llvm.godbolt.org/z/73Phv83hT). Apparently muls that set overflow flags without computing a wide multiply aren't really a thing.
> > It seems that for `less`, `greater`, `less or equal`, and `greater or equal` this optimization is universally better.
> > We trade a `udiv` for 3 `zext` and 1 `mul`, and for `greater` and `greater or equal` we also need 1 `add`. The final assembly also looks much better.
> >
> For `equals` and `not equals` it seems that this produces slightly more instructions, but I still think that avoiding the division is worth it.
> I think it is okay to move this optimization to the backend, although I am afraid that if we do not keep it in the IR, such a construct will not be autovectorized.
> For example, for this code:
> ```
> #include <stddef.h>
> #include <stdint.h>
>
> void test(uint32_t * __restrict x, uint32_t * __restrict y, uint8_t * result, uint32_t z, size_t size) {
>     for (size_t i = 0; i < size; ++i) {
>         result[i] = x[i] / y[i] == z;
>     }
> }
> ```
> The resulting vectorized loop after this patch:
> ```
> .LBB0_4: # %vector.body
> # =>This Inner Loop Header: Depth=1
> movdqu (%rdi,%r9,4), %xmm4
> movdqu (%rsi,%r9,4), %xmm9
> movdqa %xmm4, %xmm5
> punpckldq %xmm1, %xmm5 # xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
> punpckhdq %xmm1, %xmm4 # xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
> movdqa %xmm9, %xmm10
> punpckhdq %xmm1, %xmm10 # xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
> punpckldq %xmm1, %xmm9 # xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
> movdqa %xmm0, %xmm6
> pmuludq %xmm9, %xmm6
> movdqa %xmm0, %xmm8
> pmuludq %xmm10, %xmm8
> pxor %xmm2, %xmm4
> movdqa %xmm8, %xmm11
> pxor %xmm2, %xmm11
> movdqa %xmm11, %xmm12
> pcmpgtd %xmm4, %xmm12
> pxor %xmm2, %xmm5
> movdqa %xmm6, %xmm13
> pxor %xmm2, %xmm13
> movdqa %xmm13, %xmm7
> pcmpgtd %xmm5, %xmm7
> movdqa %xmm7, %xmm14
> shufps $136, %xmm12, %xmm14 # xmm14 = xmm14[0,2],xmm12[0,2]
> pcmpeqd %xmm4, %xmm11
> pcmpeqd %xmm5, %xmm13
> shufps $221, %xmm11, %xmm13 # xmm13 = xmm13[1,3],xmm11[1,3]
> andps %xmm14, %xmm13
> shufps $221, %xmm12, %xmm7 # xmm7 = xmm7[1,3],xmm12[1,3]
> orps %xmm13, %xmm7
> paddq %xmm9, %xmm6
> paddq %xmm10, %xmm8
> pxor %xmm2, %xmm8
> movdqa %xmm8, %xmm9
> pcmpgtd %xmm4, %xmm9
> pxor %xmm2, %xmm6
> movdqa %xmm6, %xmm10
> pcmpgtd %xmm5, %xmm10
> movdqa %xmm10, %xmm11
> shufps $136, %xmm9, %xmm11 # xmm11 = xmm11[0,2],xmm9[0,2]
> pcmpeqd %xmm4, %xmm8
> pcmpeqd %xmm5, %xmm6
> shufps $221, %xmm8, %xmm6 # xmm6 = xmm6[1,3],xmm8[1,3]
> andps %xmm11, %xmm6
> shufps $221, %xmm9, %xmm10 # xmm10 = xmm10[1,3],xmm9[1,3]
> orps %xmm6, %xmm10
> andnps %xmm10, %xmm7
> andps %xmm3, %xmm7
> packuswb %xmm7, %xmm7
> packuswb %xmm7, %xmm7
> movd %xmm7, (%rdx,%r9)
> addq $4, %r9
> cmpq %r9, %rcx
> jne .LBB0_4
> ```
> The resulting loop with clang-16:
> ```
> .LBB0_7: # =>This Inner Loop Header: Depth=1
> movl (%rdi,%r10,4), %eax
> xorl %edx, %edx
> divl (%rsi,%r10,4)
> cmpl %ecx, %eax
> sete (%r9,%r10)
> movl 4(%rdi,%r10,4), %eax
> xorl %edx, %edx
> divl 4(%rsi,%r10,4)
> cmpl %ecx, %eax
> sete 1(%r9,%r10)
> addq $2, %r10
> cmpq %r10, %r11
> jne .LBB0_7
> ```
>
The concern is not whether this can get better codegen, it's whether it belongs in InstCombine or in something like the DAGCombiner. If the argument for this is "it gets better codegen", it probably belongs in DAGCombiner. If the argument is "it makes the IR simpler", it probably belongs in InstCombine.
Is there a reason not to do this in DAGCombiner instead?
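For readers following the thread, here is a rough IR-level sketch of the rewrite under discussion, inferred from the CHECK lines quoted above; the value names and exact predicates are illustrative rather than the patch's actual output. The idea is that for `y != 0`, `x / y == z` holds exactly when `z*y <= x < (z+1)*y`, and widening to double width keeps the products from wrapping:

```
; Before: compare the quotient directly.
%q = udiv i64 %x, %y
%c = icmp eq i64 %q, %z

; After (sketch): widen to i128 so that z*y and (z+1)*y cannot overflow,
; then range-check x against [z*y, (z+1)*y).
%x.wide = zext i64 %x to i128
%y.wide = zext i64 %y to i128
%z.wide = zext i64 %z to i128
%lo = mul i128 %z.wide, %y.wide     ; z*y
%hi = add i128 %lo, %y.wide         ; (z+1)*y
%ge.lo = icmp uge i128 %x.wide, %lo ; x >= z*y
%lt.hi = icmp ult i128 %x.wide, %hi ; x < (z+1)*y
%c = and i1 %ge.lo, %lt.hi
```

The relational predicates need only one side of this range check, which is where the "3 `zext`, 1 `mul`, plus 1 `add` for some predicates" count above comes from; nikic's MULO suggestion would express the same check with `llvm.umul.with.overflow` instead of widening to `i128`.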
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D156029/new/
https://reviews.llvm.org/D156029