[llvm] r201275 - Remove a very old instcombine where we would turn sequences of selects into

Nadav Rotem nrotem at apple.com
Wed Feb 12 16:36:36 PST 2014


Sounds good. Thanks!

On Feb 12, 2014, at 4:34 PM, Owen Anderson <resistor at mac.com> wrote:

> On Feb 12, 2014, at 4:10 PM, Nadav Rotem <nrotem at apple.com> wrote:
> 
>> 
>> On Feb 12, 2014, at 4:08 PM, Chandler Carruth <chandlerc at google.com> wrote:
>> 
>>> 
>>> On Wed, Feb 12, 2014 at 4:05 PM, Nadav Rotem <nrotem at apple.com> wrote:
>>> On Feb 12, 2014, at 3:54 PM, Owen Anderson <resistor at mac.com> wrote:
>>> 
>>>> Author: resistor
>>>> Date: Wed Feb 12 17:54:07 2014
>>>> New Revision: 201275
>>>> 
>>>> URL: http://llvm.org/viewvc/llvm-project?rev=201275&view=rev
>>>> Log:
>>>> Remove a very old instcombine where we would turn sequences of selects into [...]
>>> 
>>> For vectors this optimization is always a win.  Can you please bring it back and add a check that the comparison type is a vector?
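>>> 
>>> A minimal sketch of the kind of guard being suggested, assuming the fold sits behind a small predicate (illustrative only; the helper name is invented and this is not the actual InstCombine code):
>>> 
>>> #include "llvm/IR/Instructions.h"
>>> 
>>> // Hypothetical predicate: only allow the select-chain fold when the
>>> // condition is a vector of i1, where the resulting select lowers to
>>> // mask operations rather than branches.
>>> static bool shouldFoldSelectChain(const llvm::SelectInst &SI) {
>>>   return SI.getCondition()->getType()->isVectorTy();
>>> }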
>>> 
>>> Do you have test cases which regressed? I find it odd that I saw no regressions from any of the heavily auto-vectorized benchmarks on x86-64.
>> 
>> I don’t have a specific testcase that regressed; this is purely theoretical.
> 
> I don’t observe this optimization actually being profitable for vectors in practice.  I haven’t found a single target where applying that instcombine resulted in better code.
> 
> —Owen
> 
> Pre-instcombine IR:
> define <4 x float> @foo4(<4 x float> %a) #0 {
>  %b = fcmp ogt <4 x float> %a, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
>  %c = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
>  %d = fcmp olt <4 x float> %c, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
>  %e = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
>  %f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
>  ret <4 x float> %f
> }
> 
> Post-instcombine IR:
> define <4 x float> @foo4(<4 x float> %a) #0 {
>  %b = fcmp ogt <4 x float> %a, zeroinitializer
>  %d1 = fcmp olt <4 x float> %a, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
>  %d = select <4 x i1> %b, <4 x i1> %d1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
>  %e = select <4 x i1> %b, <4 x float> %a, <4 x float> zeroinitializer
>  %f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
>  ret <4 x float> %f
> }
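> 
> For reference, both versions compute the same thing: an elementwise clamp of %a to [0.0, 1.0]. A rough scalar C++ equivalent (illustrative only; the function and parameter names are assumed, not taken from any original source) is:
> 
> // Illustrative scalar model of foo4: clamp each lane to [0.0, 1.0].
> static void clamp4(const float in[4], float out[4]) {
>   for (int i = 0; i < 4; ++i) {
>     float t = (in[i] > 0.0f) ? in[i] : 0.0f; // %b/%c: max with 0
>     out[i] = (t < 1.0f) ? t : 1.0f;          // %d/%f: min with 1
>   }
> }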
> 
> Pre-instcombine X86:
> _foo4:                                  ## @foo4
> 	.cfi_startproc
> ## BB#0:
> 	xorps	%xmm1, %xmm1
> 	maxps	%xmm1, %xmm0
> 	minps	LCPI0_0(%rip), %xmm0
> 	retq
> 	.cfi_endproc
> 
> Post-instcombine X86:
> _foo4:                                  ## @foo4
> 	.cfi_startproc
> ## BB#0:
> 	vxorps	%xmm1, %xmm1, %xmm1
> 	vcmpltps	%xmm0, %xmm1, %xmm2
> 	vmovaps	LCPI0_0(%rip), %xmm3
> 	vcmpltps	%xmm3, %xmm0, %xmm4
> 	vmovaps	LCPI0_1(%rip), %xmm5
> 	vblendvps	%xmm2, %xmm4, %xmm5, %xmm2
> 	vpslld	$31, %xmm2, %xmm2
> 	vmaxps	%xmm1, %xmm0, %xmm0
> 	vblendvps	%xmm2, %xmm0, %xmm3, %xmm0
> 	retq
> 	.cfi_endproc
> 
> Pre-instcombine ARMv7:
> _foo4:                                  @ @foo4
> @ BB#0:
> 	vmov	d17, r2, r3
> 	vmov	d16, r0, r1
> 	vcgt.f32	q10, q8, #0
> 	vmov.f32	q9, #1.000000e+00
> 	vand	q8, q8, q10
> 	vcgt.f32	q10, q9, q8
> 	vbsl	q10, q8, q9
> 	vmov	r0, r1, d20
> 	vmov	r2, r3, d21
> 	bx	lr
> 
> Post-instcombine ARMv7:
> _foo4:                                  @ @foo4
> @ BB#0:
> 	vmov	d17, r2, r3
> 	vmov.f32	q9, #1.000000e+00
> 	vmov	d16, r0, r1
> 	vmov.i16	d20, #0xf
> 	vcgt.f32	q12, q8, #0
> 	vmovn.i32	d26, q12
> 	vneg.s16	d20, d20
> 	vcgt.f32	q11, q9, q8
> 	vshl.i16	d26, d26, #15
> 	vand	q8, q8, q12
> 	vshl.s16	d20, d26, d20
> 	vmov.f64	d26, #5.000000e-01
> 	vmovn.i32	d22, q11
> 	vmov.i16	d26, #0x1
> 	vbsl	d20, d22, d26
> 	vmov.i32	q11, #0x1f
> 	vmovl.u16	q10, d20
> 	vneg.s32	q11, q11
> 	vshl.i32	q10, q10, #31
> 	vshl.s32	q10, q10, q11
> 	vbsl	q10, q8, q9
> 	vmov	r0, r1, d20
> 	vmov	r2, r3, d21
> 	bx	lr
> 
> Pre-instcombine PPC64 w/ Altivec:
> _foo4:                                  ; @foo4
> 	.cfi_startproc
> ; BB#0:
> 	li r12, lo16(LCPI0_0)
> 	lis r4, ha16(LCPI0_0)
> 	vxor v3, v3, v3
> 	lvx v4, r4, r12
> 	vcmpgtfp v3, v2, v3
> 	vand v2, v2, v3
> 	mfspr r2, 256
> 	oris r3, r2, 6144
> 	vcmpgtfp v3, v4, v2
> 	vandc v4, v4, v3
> 	vand v2, v2, v3
> 	vor v2, v2, v4
> 	mtspr 256, r3
> 	mtspr 256, r2
> 	blr
> 	.cfi_endproc
> 
> Post-instcombine PPC64 w/ Altivec:
> _foo4:                                  ; @foo4
> 	.cfi_startproc
> ; BB#0:
> 	li r12, lo16(LCPI0_0)
> 	lis r4, ha16(LCPI0_0)
> 	vxor v3, v3, v3
> 	vspltisw v5, 1
> 	lvx v4, r4, r12
> 	vcmpgtfp v3, v2, v3
> 	vspltisw v1, -16
> 	vspltisw v6, 15
> 	vandc v5, v5, v3
> 	vsubuwm v1, v6, v1
> 	vand v18, v2, v3
> 	mfspr r2, 256
> 	vcmpgtfp v0, v4, v2
> 	oris r3, r2, 56832
> 	vand v0, v0, v3
> 	vor v5, v0, v5
> 	vslw v5, v5, v1
> 	vsraw v5, v5, v1
> 	vandc v19, v4, v5
> 	vand v2, v18, v5
> 	vor v2, v2, v19
> 	mtspr 256, r3
> 	mtspr 256, r2
> 	blr
> 	.cfi_endproc
> 
> Pre-instcombine MIPS64:
> foo4:                                   # @foo4
> 	.cfi_startproc
> 	.frame	$sp,0,$ra
> 	.mask 	0x00000000,0
> 	.fmask	0x00000000,0
> 	.set	noreorder
> 	.set	nomacro
> 	.set	noat
> # BB#0:
> 	lui	$1, %hi(%neg(%gp_rel(foo4)))
> 	daddu	$1, $1, $25
> 	daddiu	$1, $1, %lo(%neg(%gp_rel(foo4)))
> 	mtc1	$zero, $f0
> 	c.ule.s	$f15, $f0
> 	mov.s	$f1, $f0
> 	movf.s	$f1, $f15, $fcc0
> 	c.ule.s	$f14, $f0
> 	mov.s	$f2, $f0
> 	movf.s	$f2, $f14, $fcc0
> 	ld	$1, %got_page($CPI0_0)($1)
> 	lwc1	$f3, %got_ofst($CPI0_0)($1)
> 	c.ule.s	$f13, $f0
> 	mov.s	$f4, $f0
> 	movf.s	$f4, $f13, $fcc0
> 	c.olt.s	$f2, $f3
> 	mov.s	$f5, $f3
> 	movt.s	$f5, $f2, $fcc0
> 	c.olt.s	$f1, $f3
> 	mov.s	$f2, $f3
> 	movt.s	$f2, $f1, $fcc0
> 	c.ule.s	$f16, $f0
> 	movf.s	$f0, $f16, $fcc0
> 	c.olt.s	$f0, $f3
> 	mov.s	$f1, $f3
> 	movt.s	$f1, $f0, $fcc0
> 	swc1	$f1, 12($4)
> 	swc1	$f2, 8($4)
> 	swc1	$f5, 4($4)
> 	c.olt.s	$f4, $f3
> 	movt.s	$f3, $f4, $fcc0
> 	jr	$ra
> 	swc1	$f3, 0($4)
> 	.set	at
> 	.set	macro
> 	.set	reorder
> 	.end	foo4
> $tmp0:
> 	.size	foo4, ($tmp0)-foo4
> 	.cfi_endproc
> 
> Post-instcombine MIPS64:
> foo4:                                   # @foo4
> 	.cfi_startproc
> 	.frame	$sp,0,$ra
> 	.mask 	0x00000000,0
> 	.fmask	0x00000000,0
> 	.set	noreorder
> 	.set	nomacro
> 	.set	noat
> # BB#0:
> 	lui	$1, %hi(%neg(%gp_rel(foo4)))
> 	daddu	$1, $1, $25
> 	daddiu	$1, $1, %lo(%neg(%gp_rel(foo4)))
> 	ld	$1, %got_page($CPI0_0)($1)
> 	lwc1	$f0, %got_ofst($CPI0_0)($1)
> 	addiu	$1, $zero, 0
> 	addiu	$2, $zero, 1
> 	c.olt.s	$f15, $f0
> 	addiu	$3, $zero, 0
> 	movt	$3, $2, $fcc0
> 	mtc1	$zero, $f1
> 	c.olt.s	$f16, $f0
> 	addiu	$5, $zero, 0
> 	movt	$5, $2, $fcc0
> 	c.ule.s	$f15, $f1
> 	addiu	$6, $zero, 1
> 	movf	$6, $3, $fcc0
> 	c.olt.s	$f14, $f0
> 	addiu	$3, $zero, 0
> 	movt	$3, $2, $fcc0
> 	c.olt.s	$f13, $f0
> 	movt	$1, $2, $fcc0
> 	c.ule.s	$f16, $f1
> 	mov.s	$f2, $f1
> 	movf.s	$f2, $f16, $fcc0
> 	c.ule.s	$f15, $f1
> 	mov.s	$f3, $f1
> 	movf.s	$f3, $f15, $fcc0
> 	c.ule.s	$f14, $f1
> 	mov.s	$f4, $f1
> 	movf.s	$f4, $f14, $fcc0
> 	c.ule.s	$f13, $f1
> 	mov.s	$f5, $f1
> 	movf.s	$f5, $f13, $fcc0
> 	c.ule.s	$f14, $f1
> 	addiu	$7, $zero, 1
> 	movf	$7, $3, $fcc0
> 	mov.s	$f6, $f0
> 	movn.s	$f6, $f4, $7
> 	mov.s	$f4, $f0
> 	movn.s	$f4, $f3, $6
> 	c.ule.s	$f16, $f1
> 	addiu	$3, $zero, 1
> 	movf	$3, $5, $fcc0
> 	mov.s	$f3, $f0
> 	movn.s	$f3, $f2, $3
> 	swc1	$f3, 12($4)
> 	swc1	$f4, 8($4)
> 	swc1	$f6, 4($4)
> 	c.ule.s	$f13, $f1
> 	movf	$2, $1, $fcc0
> 	movn.s	$f0, $f5, $2
> 	jr	$ra
> 	swc1	$f0, 0($4)
> 	.set	at
> 	.set	macro
> 	.set	reorder
> 	.end	foo4
> $tmp0:
> 	.size	foo4, ($tmp0)-foo4
> 	.cfi_endproc
> 