[llvm] [InstCombine] Make `(binop ({s|u}itofp),({s|u}itofp))` transform more flexible to mismatched signs (PR #84389)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 8 10:35:15 PST 2024
goldsteinn wrote:
> > Could you please have a look at [dtcxzyw/llvm-opt-benchmark#336 (comment)](https://github.com/dtcxzyw/llvm-opt-benchmark/pull/336#discussion_r1517575694)?
>
> So it seems this transform is enabling SLP vectorization in a case where it's not profitable:
>
> ```
> ;; Before
> ; *** IR Dump After SimplifyCFGPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
> %r0 = urem i16 %inp, 20
> %ui0 = uitofp i16 %r0 to float
> %fadd0 = fadd float %ui0, -1.000000e+01
> %fdiv0 = fdiv float %fadd0, 0.000000e+00
> %ui1 = uitofp i16 %inp to float
> %fdiv1 = fdiv float %ui1, 0.000000e+00
> %r = fmul float %fdiv1, %fdiv0
> ret float %r
> }
> ; *** IR Dump After SLPVectorizerPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
> %r0 = urem i16 %inp, 20
> %ui0 = uitofp i16 %r0 to float
> %fadd0 = fadd float %ui0, -1.000000e+01
> %fdiv0 = fdiv float %fadd0, 0.000000e+00
> %ui1 = uitofp i16 %inp to float
> %fdiv1 = fdiv float %ui1, 0.000000e+00
> %r = fmul float %fdiv1, %fdiv0
> ret float %r
> }
> ;; After
> ; *** IR Dump After SimplifyCFGPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
> %r0 = urem i16 %inp, 20
> %1 = add nsw i16 %r0, -10
> %fadd0 = sitofp i16 %1 to float
> %fdiv0 = fdiv float %fadd0, 0.000000e+00
> %ui1 = uitofp i16 %inp to float
> %fdiv1 = fdiv float %ui1, 0.000000e+00
> %r = fmul float %fdiv1, %fdiv0
> ret float %r
> }
> ; *** IR Dump After SLPVectorizerPass on regress ***
> ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
> define float @regress(i16 %inp) local_unnamed_addr #0 {
> %r0 = urem i16 %inp, 20
> %1 = add nsw i16 %r0, -10
> %2 = insertelement <2 x i16> poison, i16 %inp, i32 0
> %3 = insertelement <2 x i16> %2, i16 %1, i32 1
> %4 = uitofp <2 x i16> %3 to <2 x float>
> %5 = sitofp <2 x i16> %3 to <2 x float>
> %6 = shufflevector <2 x float> %4, <2 x float> %5, <2 x i32> <i32 0, i32 3>
> %7 = fdiv <2 x float> %6, zeroinitializer
> %8 = extractelement <2 x float> %7, i32 0
> %9 = extractelement <2 x float> %7, i32 1
> %r = fmul float %8, %9
> ret float %r
> }
> ```
>
> Looking into a fix.
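(For context, the relevant change in the dumps above is that the `fadd (uitofp ...), -10.0` gets folded into a `sitofp (add nsw ...)`. A minimal sketch of that fold, using the values from the example; it is only valid here because `%r0 = urem i16 %inp, 20` is known to lie in [0, 19], so the signed add cannot overflow:)
```
%ui0 = uitofp i16 %r0 to float
%fadd0 = fadd float %ui0, -1.000000e+01
; =>
%1 = add nsw i16 %r0, -10
%fadd0 = sitofp i16 %1 to float
```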
Actually, I think it's profitable because it saves a division (the two scalar divides become a single vectorized divide). For example, on x86:
```
0000000000000000 <before>:
0: 0f b7 c7 movzwl %di, %eax
3: 69 c8 cd cc 00 00 imull $0xcccd, %eax, %ecx # imm = 0xCCCD
9: c1 e9 12 shrl $0x12, %ecx
c: 83 e1 fc andl $-0x4, %ecx
f: 8d 0c 89 leal (%rcx,%rcx,4), %ecx
12: 29 cf subl %ecx, %edi
14: 0f bf cf movswl %di, %ecx
17: c5 fa 2a c1 vcvtsi2ss %ecx, %xmm0, %xmm0
1b: c5 fa 58 05 00 00 00 00 vaddss (%rip), %xmm0, %xmm0 # 0x23 <before+0x23>
23: c5 f0 57 c9 vxorps %xmm1, %xmm1, %xmm1
27: c5 fa 5e c1 vdivss %xmm1, %xmm0, %xmm0
2b: c5 ea 2a d0 vcvtsi2ss %eax, %xmm2, %xmm2
2f: c5 ea 5e c9 vdivss %xmm1, %xmm2, %xmm1
33: c5 f2 59 c0 vmulss %xmm0, %xmm1, %xmm0
37: c3 retq
38: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax)
0000000000000040 <after>:
40: 0f b7 c7 movzwl %di, %eax
43: 69 c0 cd cc 00 00 imull $0xcccd, %eax, %eax # imm = 0xCCCD
49: c1 e8 12 shrl $0x12, %eax
4c: 83 e0 fc andl $-0x4, %eax
4f: 8d 04 80 leal (%rax,%rax,4), %eax
52: f7 d8 negl %eax
54: 8d 44 07 f6 leal -0xa(%rdi,%rax), %eax
58: c5 f9 6e c7 vmovd %edi, %xmm0
5c: c5 f9 c4 c0 01 vpinsrw $0x1, %eax, %xmm0, %xmm0
61: c4 e2 79 33 c8 vpmovzxwd %xmm0, %xmm1 # xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
66: c5 f8 5b c9 vcvtdq2ps %xmm1, %xmm1
6a: c4 e2 79 23 c0 vpmovsxwd %xmm0, %xmm0
6f: c5 f8 5b c0 vcvtdq2ps %xmm0, %xmm0
73: c4 e3 71 0c c0 02 vblendps $0x2, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
79: c5 f0 57 c9 vxorps %xmm1, %xmm1, %xmm1
7d: c5 f8 5e c1 vdivps %xmm1, %xmm0, %xmm0
81: c5 fa 16 c8 vmovshdup %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
85: c5 fa 59 c1 vmulss %xmm1, %xmm0, %xmm0
89: c3 retq
```
The LLVM MCA summaries are:
```
BEFORE:
Iterations: 100
Instructions: 1500
Total Cycles: 2418
Total uOps: 2000
AFTER:
Iterations: 100
Instructions: 1900
Total Cycles: 590
Total uOps: 2200
```
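(The summaries above were gathered with llvm-mca; something like the following reproduces the runs, assuming the two snippets are saved as assembly files. The file names and the -mcpu value are placeholders, not taken from the original run:)
```
llvm-mca -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -iterations=100 before.s
llvm-mca -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -iterations=100 after.s
```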
https://github.com/llvm/llvm-project/pull/84389