[llvm] r201275 - Remove a very old instcombine where we would turn sequences of selects into
Nadav Rotem
nrotem at apple.com
Wed Feb 12 16:36:36 PST 2014
Sounds good. Thanks!
On Feb 12, 2014, at 4:34 PM, Owen Anderson <resistor at mac.com> wrote:
> On Feb 12, 2014, at 4:10 PM, Nadav Rotem <nrotem at apple.com> wrote:
>
>>
>> On Feb 12, 2014, at 4:08 PM, Chandler Carruth <chandlerc at google.com> wrote:
>>
>>>
>>> On Wed, Feb 12, 2014 at 4:05 PM, Nadav Rotem <nrotem at apple.com> wrote:
>>> On Feb 12, 2014, at 3:54 PM, Owen Anderson <resistor at mac.com> wrote:
>>>
>>>> Author: resistor
>>>> Date: Wed Feb 12 17:54:07 2014
>>>> New Revision: 201275
>>>>
>>>> URL: http://llvm.org/viewvc/llvm-project?rev=201275&view=rev
>>>> Log:
>>>> Remove a very old instcombine where
>>>
>>> For vectors this optimization is always a win. Can you please bring it back and add a check that the comparison type is a vector?
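>>>
>>> A minimal sketch of the kind of vector-only guard being suggested (assuming the fold sits in an InstCombine visitor and I is the select instruction being visited; the fold body itself is elided):
>>>
>>>   // Hypothetical: re-enable the sequence-of-selects fold, but only when
>>>   // the condition (the fcmp result) is a vector of i1.
>>>   if (auto *Sel = dyn_cast<SelectInst>(&I)) {
>>>     Value *Cond = Sel->getCondition();
>>>     if (!Cond->getType()->isVectorTy())
>>>       return nullptr;   // leave scalar selects to the existing heuristics
>>>     // ... perform the old sequence-of-selects fold here ...
>>>   }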
>>>
>>> Do you have test cases which regressed? I find it odd that I saw no regressions from any of the heavily auto-vectorized benchmarks on x86-64.
>>
>> I don’t have a specific testcase that regressed; this is purely theoretical.
>
> I don’t observe this optimization actually being profitable for vectors in practice. I haven’t found a single target where applying that instcombine resulted in better code.
>
> —Owen
>
> Pre-instcombine IR:
> define <4 x float> @foo4(<4 x float> %a) #0 {
> %b = fcmp ogt <4 x float> %a, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
> %c = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
> %d = fcmp olt <4 x float> %c, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
> %e = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
> %f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
> ret <4 x float> %f
> }
>
> Post-instcombine IR:
> define <4 x float> @foo4(<4 x float> %a) #0 {
> %b = fcmp ogt <4 x float> %a, zeroinitializer
> %d1 = fcmp olt <4 x float> %a, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
> %d = select <4 x i1> %b, <4 x i1> %d1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
> %e = select <4 x i1> %b, <4 x float> %a, <4 x float> zeroinitializer
> %f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
> ret <4 x float> %f
> }
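>
> A minimal per-lane sketch of what foo4 computes in both versions above (a clamp to [0.0, 1.0]); the function name is illustrative only:
>
>   // %c/%e = (a > 0) ? a : 0   ->  max(a, 0)
>   // %f    = (e < 1) ? e : 1   ->  min(max(a, 0), 1)
>   float clampLane(float a) {
>     float e = (a > 0.0f) ? a : 0.0f;   // corresponds to the maxps in the pre-instcombine X86 below
>     return (e < 1.0f) ? e : 1.0f;      // corresponds to the minps below
>   }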
>
> Pre-instcombine X86:
> _foo4: ## @foo4
> .cfi_startproc
> ## BB#0:
> xorps %xmm1, %xmm1
> maxps %xmm1, %xmm0
> minps LCPI0_0(%rip), %xmm0
> retq
> .cfi_endproc
>
> Post-instcombine X86:
> _foo4: ## @foo4
> .cfi_startproc
> ## BB#0:
> vxorps %xmm1, %xmm1, %xmm1
> vcmpltps %xmm0, %xmm1, %xmm2
> vmovaps LCPI0_0(%rip), %xmm3
> vcmpltps %xmm3, %xmm0, %xmm4
> vmovaps LCPI0_1(%rip), %xmm5
> vblendvps %xmm2, %xmm4, %xmm5, %xmm2
> vpslld $31, %xmm2, %xmm2
> vmaxps %xmm1, %xmm0, %xmm0
> vblendvps %xmm2, %xmm0, %xmm3, %xmm0
> retq
> .cfi_endproc
>
> Pre-instcombine ARMv7:
> _foo4: @ @foo4
> @ BB#0:
> vmov d17, r2, r3
> vmov d16, r0, r1
> vcgt.f32 q10, q8, #0
> vmov.f32 q9, #1.000000e+00
> vand q8, q8, q10
> vcgt.f32 q10, q9, q8
> vbsl q10, q8, q9
> vmov r0, r1, d20
> vmov r2, r3, d21
> bx lr
>
> Post-instcombine ARMv7:
> _foo4: @ @foo4
> @ BB#0:
> vmov d17, r2, r3
> vmov.f32 q9, #1.000000e+00
> vmov d16, r0, r1
> vmov.i16 d20, #0xf
> vcgt.f32 q12, q8, #0
> vmovn.i32 d26, q12
> vneg.s16 d20, d20
> vcgt.f32 q11, q9, q8
> vshl.i16 d26, d26, #15
> vand q8, q8, q12
> vshl.s16 d20, d26, d20
> vmov.f64 d26, #5.000000e-01
> vmovn.i32 d22, q11
> vmov.i16 d26, #0x1
> vbsl d20, d22, d26
> vmov.i32 q11, #0x1f
> vmovl.u16 q10, d20
> vneg.s32 q11, q11
> vshl.i32 q10, q10, #31
> vshl.s32 q10, q10, q11
> vbsl q10, q8, q9
> vmov r0, r1, d20
> vmov r2, r3, d21
> bx lr
>
> Pre-instcombine PPC64 w/ Altivec:
> _foo4: ; @foo4
> .cfi_startproc
> ; BB#0:
> li r12, lo16(LCPI0_0)
> lis r4, ha16(LCPI0_0)
> vxor v3, v3, v3
> lvx v4, r4, r12
> vcmpgtfp v3, v2, v3
> vand v2, v2, v3
> mfspr r2, 256
> oris r3, r2, 6144
> vcmpgtfp v3, v4, v2
> vandc v4, v4, v3
> vand v2, v2, v3
> vor v2, v2, v4
> mtspr 256, r3
> mtspr 256, r2
> blr
> .cfi_endproc
>
> Post-instcombine PPC64 w/ Altivec:
> _foo4: ; @foo4
> .cfi_startproc
> ; BB#0:
> li r12, lo16(LCPI0_0)
> lis r4, ha16(LCPI0_0)
> vxor v3, v3, v3
> vspltisw v5, 1
> lvx v4, r4, r12
> vcmpgtfp v3, v2, v3
> vspltisw v1, -16
> vspltisw v6, 15
> vandc v5, v5, v3
> vsubuwm v1, v6, v1
> vand v18, v2, v3
> mfspr r2, 256
> vcmpgtfp v0, v4, v2
> oris r3, r2, 56832
> vand v0, v0, v3
> vor v5, v0, v5
> vslw v5, v5, v1
> vsraw v5, v5, v1
> vandc v19, v4, v5
> vand v2, v18, v5
> vor v2, v2, v19
> mtspr 256, r3
> mtspr 256, r2
> blr
> .cfi_endproc
>
> Pre-instcombine MIPS64:
> foo4: # @foo4
> .cfi_startproc
> .frame $sp,0,$ra
> .mask 0x00000000,0
> .fmask 0x00000000,0
> .set noreorder
> .set nomacro
> .set noat
> # BB#0:
> lui $1, %hi(%neg(%gp_rel(foo4)))
> daddu $1, $1, $25
> daddiu $1, $1, %lo(%neg(%gp_rel(foo4)))
> mtc1 $zero, $f0
> c.ule.s $f15, $f0
> mov.s $f1, $f0
> movf.s $f1, $f15, $fcc0
> c.ule.s $f14, $f0
> mov.s $f2, $f0
> movf.s $f2, $f14, $fcc0
> ld $1, %got_page($CPI0_0)($1)
> lwc1 $f3, %got_ofst($CPI0_0)($1)
> c.ule.s $f13, $f0
> mov.s $f4, $f0
> movf.s $f4, $f13, $fcc0
> c.olt.s $f2, $f3
> mov.s $f5, $f3
> movt.s $f5, $f2, $fcc0
> c.olt.s $f1, $f3
> mov.s $f2, $f3
> movt.s $f2, $f1, $fcc0
> c.ule.s $f16, $f0
> movf.s $f0, $f16, $fcc0
> c.olt.s $f0, $f3
> mov.s $f1, $f3
> movt.s $f1, $f0, $fcc0
> swc1 $f1, 12($4)
> swc1 $f2, 8($4)
> swc1 $f5, 4($4)
> c.olt.s $f4, $f3
> movt.s $f3, $f4, $fcc0
> jr $ra
> swc1 $f3, 0($4)
> .set at
> .set macro
> .set reorder
> .end foo4
> $tmp0:
> .size foo4, ($tmp0)-foo4
> .cfi_endproc
>
> Post-instcombine MIPS64:
> foo4: # @foo4
> .cfi_startproc
> .frame $sp,0,$ra
> .mask 0x00000000,0
> .fmask 0x00000000,0
> .set noreorder
> .set nomacro
> .set noat
> # BB#0:
> lui $1, %hi(%neg(%gp_rel(foo4)))
> daddu $1, $1, $25
> daddiu $1, $1, %lo(%neg(%gp_rel(foo4)))
> ld $1, %got_page($CPI0_0)($1)
> lwc1 $f0, %got_ofst($CPI0_0)($1)
> addiu $1, $zero, 0
> addiu $2, $zero, 1
> c.olt.s $f15, $f0
> addiu $3, $zero, 0
> movt $3, $2, $fcc0
> mtc1 $zero, $f1
> c.olt.s $f16, $f0
> addiu $5, $zero, 0
> movt $5, $2, $fcc0
> c.ule.s $f15, $f1
> addiu $6, $zero, 1
> movf $6, $3, $fcc0
> c.olt.s $f14, $f0
> addiu $3, $zero, 0
> movt $3, $2, $fcc0
> c.olt.s $f13, $f0
> movt $1, $2, $fcc0
> c.ule.s $f16, $f1
> mov.s $f2, $f1
> movf.s $f2, $f16, $fcc0
> c.ule.s $f15, $f1
> mov.s $f3, $f1
> movf.s $f3, $f15, $fcc0
> c.ule.s $f14, $f1
> mov.s $f4, $f1
> movf.s $f4, $f14, $fcc0
> c.ule.s $f13, $f1
> mov.s $f5, $f1
> movf.s $f5, $f13, $fcc0
> c.ule.s $f14, $f1
> addiu $7, $zero, 1
> movf $7, $3, $fcc0
> mov.s $f6, $f0
> movn.s $f6, $f4, $7
> mov.s $f4, $f0
> movn.s $f4, $f3, $6
> c.ule.s $f16, $f1
> addiu $3, $zero, 1
> movf $3, $5, $fcc0
> mov.s $f3, $f0
> movn.s $f3, $f2, $3
> swc1 $f3, 12($4)
> swc1 $f4, 8($4)
> swc1 $f6, 4($4)
> c.ule.s $f13, $f1
> movf $2, $1, $fcc0
> movn.s $f0, $f5, $2
> jr $ra
> swc1 $f0, 0($4)
> .set at
> .set macro
> .set reorder
> .end foo4
> $tmp0:
> .size foo4, ($tmp0)-foo4
> .cfi_endproc
>