[llvm] [InstCombine] Make `(binop ({s|u}itofp),({s|u}itofp))` transform more flexible to mismatched signs (PR #84389)

Fri Mar 8 10:35:15 PST 2024

goldsteinn wrote:

> > Could you please have a look at [dtcxzyw/llvm-opt-benchmark#336 (comment)](https://github.com/dtcxzyw/llvm-opt-benchmark/pull/336#discussion_r1517575694)?
> 
> So it seems this transform is enabling SLP vectorization in a case where it's not profitable:
> 
> ```
> ;; Before
> ; *** IR Dump After SimplifyCFGPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
>   %r0 = urem i16 %inp, 20
>   %ui0 = uitofp i16 %r0 to float
>   %fadd0 = fadd float %ui0, -1.000000e+01
>   %fdiv0 = fdiv float %fadd0, 0.000000e+00
>   %ui1 = uitofp i16 %inp to float
>   %fdiv1 = fdiv float %ui1, 0.000000e+00
>   %r = fmul float %fdiv1, %fdiv0
>   ret float %r
> }
> ; *** IR Dump After SLPVectorizerPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
>   %r0 = urem i16 %inp, 20
>   %ui0 = uitofp i16 %r0 to float
>   %fadd0 = fadd float %ui0, -1.000000e+01
>   %fdiv0 = fdiv float %fadd0, 0.000000e+00
>   %ui1 = uitofp i16 %inp to float
>   %fdiv1 = fdiv float %ui1, 0.000000e+00
>   %r = fmul float %fdiv1, %fdiv0
>   ret float %r
> }
> ;; After
> ; *** IR Dump After SimplifyCFGPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
>   %r0 = urem i16 %inp, 20
>   %1 = add nsw i16 %r0, -10
>   %fadd0 = sitofp i16 %1 to float
>   %fdiv0 = fdiv float %fadd0, 0.000000e+00
>   %ui1 = uitofp i16 %inp to float
>   %fdiv1 = fdiv float %ui1, 0.000000e+00
>   %r = fmul float %fdiv1, %fdiv0
>   ret float %r
> }
> ; *** IR Dump After SLPVectorizerPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
>   %r0 = urem i16 %inp, 20
>   %1 = add nsw i16 %r0, -10
>   %2 = insertelement <2 x i16> poison, i16 %inp, i32 0
>   %3 = insertelement <2 x i16> %2, i16 %1, i32 1
>   %4 = uitofp <2 x i16> %3 to <2 x float>
>   %5 = sitofp <2 x i16> %3 to <2 x float>
>   %6 = shufflevector <2 x float> %4, <2 x float> %5, <2 x i32> <i32 0, i32 3>
>   %7 = fdiv <2 x float> %6, zeroinitializer
>   %8 = extractelement <2 x float> %7, i32 0
>   %9 = extractelement <2 x float> %7, i32 1
>   %r = fmul float %8, %9
>   ret float %r
> }
> ```
> 
> looking into a fix.

Actually, I think it's profitable because it saves a division: the two scalar divisions (`vdivss`) are replaced by a single vectorized one (`vdivps`). For example, on x86:

```
0000000000000000 <before>:
       0: 0f b7 c7                     	movzwl	%di, %eax
       3: 69 c8 cd cc 00 00            	imull	$0xcccd, %eax, %ecx     # imm = 0xCCCD
       9: c1 e9 12                     	shrl	$0x12, %ecx
       c: 83 e1 fc                     	andl	$-0x4, %ecx
       f: 8d 0c 89                     	leal	(%rcx,%rcx,4), %ecx
      12: 29 cf                        	subl	%ecx, %edi
      14: 0f bf cf                     	movswl	%di, %ecx
      17: c5 fa 2a c1                  	vcvtsi2ss	%ecx, %xmm0, %xmm0
      1b: c5 fa 58 05 00 00 00 00      	vaddss	(%rip), %xmm0, %xmm0    # 0x23 <before+0x23>
      23: c5 f0 57 c9                  	vxorps	%xmm1, %xmm1, %xmm1
      27: c5 fa 5e c1                  	vdivss	%xmm1, %xmm0, %xmm0
      2b: c5 ea 2a d0                  	vcvtsi2ss	%eax, %xmm2, %xmm2
      2f: c5 ea 5e c9                  	vdivss	%xmm1, %xmm2, %xmm1
      33: c5 f2 59 c0                  	vmulss	%xmm0, %xmm1, %xmm0
      37: c3                           	retq
      38: 0f 1f 84 00 00 00 00 00      	nopl	(%rax,%rax)

0000000000000040 <after>:
      40: 0f b7 c7                     	movzwl	%di, %eax
      43: 69 c0 cd cc 00 00            	imull	$0xcccd, %eax, %eax     # imm = 0xCCCD
      49: c1 e8 12                     	shrl	$0x12, %eax
      4c: 83 e0 fc                     	andl	$-0x4, %eax
      4f: 8d 04 80                     	leal	(%rax,%rax,4), %eax
      52: f7 d8                        	negl	%eax
      54: 8d 44 07 f6                  	leal	-0xa(%rdi,%rax), %eax
      58: c5 f9 6e c7                  	vmovd	%edi, %xmm0
      5c: c5 f9 c4 c0 01               	vpinsrw	$0x1, %eax, %xmm0, %xmm0
      61: c4 e2 79 33 c8               	vpmovzxwd	%xmm0, %xmm1    # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
      66: c5 f8 5b c9                  	vcvtdq2ps	%xmm1, %xmm1
      6a: c4 e2 79 23 c0               	vpmovsxwd	%xmm0, %xmm0
      6f: c5 f8 5b c0                  	vcvtdq2ps	%xmm0, %xmm0
      73: c4 e3 71 0c c0 02            	vblendps	$0x2, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
      79: c5 f0 57 c9                  	vxorps	%xmm1, %xmm1, %xmm1
      7d: c5 f8 5e c1                  	vdivps	%xmm1, %xmm0, %xmm0
      81: c5 fa 16 c8                  	vmovshdup	%xmm0, %xmm1    # xmm1 = xmm0[1,1,3,3]
      85: c5 fa 59 c1                  	vmulss	%xmm1, %xmm0, %xmm0
      89: c3                           	retq
```

The `llvm-mca` summaries are:
```
BEFORE:
Iterations:        100
Instructions:      1500
Total Cycles:      2418
Total uOps:        2000

AFTER:
Iterations:        100
Instructions:      1900
Total Cycles:      590
Total uOps:        2200

```

https://github.com/llvm/llvm-project/pull/84389