[llvm] r201275 - Remove a very old instcombine where we would turn sequences of selects into

Owen Anderson <resistor at mac.com>
Wed Feb 12 16:34:51 PST 2014


On Feb 12, 2014, at 4:10 PM, Nadav Rotem <nrotem at apple.com> wrote:

> 
> On Feb 12, 2014, at 4:08 PM, Chandler Carruth <chandlerc at google.com> wrote:
> 
>> 
>> On Wed, Feb 12, 2014 at 4:05 PM, Nadav Rotem <nrotem at apple.com> wrote:
>> On Feb 12, 2014, at 3:54 PM, Owen Anderson <resistor at mac.com> wrote:
>> 
>> > Author: resistor
>> > Date: Wed Feb 12 17:54:07 2014
>> > New Revision: 201275
>> >
>> > URL: http://llvm.org/viewvc/llvm-project?rev=201275&view=rev
>> > Log:
>> > Remove a very old instcombine where
>> 
>> For vectors this optimization is always a win.  Can you please bring it back and add a check that the comparison type is a vector?
>> 
>> Do you have test cases which regressed? I find it odd that I saw no regressions from any of the heavily auto-vectorized benchmarks on x86-64.
> 
> I don’t have a specific testcase that regressed; this is purely theoretical.

I don’t observe this optimization actually being profitable for vectors in practice.  I haven’t found a single target where applying that instcombine resulted in better code.

—Owen

Pre-instcombine IR:
define <4 x float> @foo4(<4 x float> %a) #0 {
  %b = fcmp ogt <4 x float> %a, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
  %c = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
  %d = fcmp olt <4 x float> %c, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %e = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
  %f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  ret <4 x float> %f
}

Post-instcombine IR:
define <4 x float> @foo4(<4 x float> %a) #0 {
  %b = fcmp ogt <4 x float> %a, zeroinitializer
  %d1 = fcmp olt <4 x float> %a, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %d = select <4 x i1> %b, <4 x i1> %d1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
  %e = select <4 x i1> %b, <4 x float> %a, <4 x float> zeroinitializer
  %f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  ret <4 x float> %f
}
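
For reference, both versions compute a per-lane clamp of %a to the range [0.0, 1.0]. A scalar C sketch of the same computation (the name foo4_scalar and the loop form are illustrative only; the lane-wise ogt/olt selects mirror the IR above):

/* Clamp each element of a[] to [0.0, 1.0]; lane-for-lane, this is what @foo4 does. */
void foo4_scalar(float *out, const float *a, int n) {
  for (int i = 0; i < n; ++i) {
    float x = (a[i] > 0.0f) ? a[i] : 0.0f;  /* %b and %c/%e */
    out[i] = (x < 1.0f) ? x : 1.0f;         /* %d and %f    */
  }
}

The pre-instcombine x86 code below lowers this directly to a maxps/minps pair.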

Pre-instcombine X86:
_foo4:                                  ## @foo4
	.cfi_startproc
## BB#0:
	xorps	%xmm1, %xmm1
	maxps	%xmm1, %xmm0
	minps	LCPI0_0(%rip), %xmm0
	retq
	.cfi_endproc

Post-instcombine X86:
_foo4:                                  ## @foo4
	.cfi_startproc
## BB#0:
	vxorps	%xmm1, %xmm1, %xmm1
	vcmpltps	%xmm0, %xmm1, %xmm2
	vmovaps	LCPI0_0(%rip), %xmm3
	vcmpltps	%xmm3, %xmm0, %xmm4
	vmovaps	LCPI0_1(%rip), %xmm5
	vblendvps	%xmm2, %xmm4, %xmm5, %xmm2
	vpslld	$31, %xmm2, %xmm2
	vmaxps	%xmm1, %xmm0, %xmm0
	vblendvps	%xmm2, %xmm0, %xmm3, %xmm0
	retq
	.cfi_endproc

Pre-instcombine ARMv7:
_foo4:                                  @ @foo4
@ BB#0:
	vmov	d17, r2, r3
	vmov	d16, r0, r1
	vcgt.f32	q10, q8, #0
	vmov.f32	q9, #1.000000e+00
	vand	q8, q8, q10
	vcgt.f32	q10, q9, q8
	vbsl	q10, q8, q9
	vmov	r0, r1, d20
	vmov	r2, r3, d21
	bx	lr

Post-instcombine ARMv7:
_foo4:                                  @ @foo4
@ BB#0:
	vmov	d17, r2, r3
	vmov.f32	q9, #1.000000e+00
	vmov	d16, r0, r1
	vmov.i16	d20, #0xf
	vcgt.f32	q12, q8, #0
	vmovn.i32	d26, q12
	vneg.s16	d20, d20
	vcgt.f32	q11, q9, q8
	vshl.i16	d26, d26, #15
	vand	q8, q8, q12
	vshl.s16	d20, d26, d20
	vmov.f64	d26, #5.000000e-01
	vmovn.i32	d22, q11
	vmov.i16	d26, #0x1
	vbsl	d20, d22, d26
	vmov.i32	q11, #0x1f
	vmovl.u16	q10, d20
	vneg.s32	q11, q11
	vshl.i32	q10, q10, #31
	vshl.s32	q10, q10, q11
	vbsl	q10, q8, q9
	vmov	r0, r1, d20
	vmov	r2, r3, d21
	bx	lr

Pre-instcombine PPC64 w/ Altivec:
_foo4:                                  ; @foo4
	.cfi_startproc
; BB#0:
	li r12, lo16(LCPI0_0)
	lis r4, ha16(LCPI0_0)
	vxor v3, v3, v3
	lvx v4, r4, r12
	vcmpgtfp v3, v2, v3
	vand v2, v2, v3
	mfspr r2, 256
	oris r3, r2, 6144
	vcmpgtfp v3, v4, v2
	vandc v4, v4, v3
	vand v2, v2, v3
	vor v2, v2, v4
	mtspr 256, r3
	mtspr 256, r2
	blr
	.cfi_endproc

Post-instcombine PPC64 w/ Altivec:
_foo4:                                  ; @foo4
	.cfi_startproc
; BB#0:
	li r12, lo16(LCPI0_0)
	lis r4, ha16(LCPI0_0)
	vxor v3, v3, v3
	vspltisw v5, 1
	lvx v4, r4, r12
	vcmpgtfp v3, v2, v3
	vspltisw v1, -16
	vspltisw v6, 15
	vandc v5, v5, v3
	vsubuwm v1, v6, v1
	vand v18, v2, v3
	mfspr r2, 256
	vcmpgtfp v0, v4, v2
	oris r3, r2, 56832
	vand v0, v0, v3
	vor v5, v0, v5
	vslw v5, v5, v1
	vsraw v5, v5, v1
	vandc v19, v4, v5
	vand v2, v18, v5
	vor v2, v2, v19
	mtspr 256, r3
	mtspr 256, r2
	blr
	.cfi_endproc

Pre-instcombine MIPS64:
foo4:                                   # @foo4
	.cfi_startproc
	.frame	$sp,0,$ra
	.mask 	0x00000000,0
	.fmask	0x00000000,0
	.set	noreorder
	.set	nomacro
	.set	noat
# BB#0:
	lui	$1, %hi(%neg(%gp_rel(foo4)))
	daddu	$1, $1, $25
	daddiu	$1, $1, %lo(%neg(%gp_rel(foo4)))
	mtc1	$zero, $f0
	c.ule.s	$f15, $f0
	mov.s	$f1, $f0
	movf.s	$f1, $f15, $fcc0
	c.ule.s	$f14, $f0
	mov.s	$f2, $f0
	movf.s	$f2, $f14, $fcc0
	ld	$1, %got_page($CPI0_0)($1)
	lwc1	$f3, %got_ofst($CPI0_0)($1)
	c.ule.s	$f13, $f0
	mov.s	$f4, $f0
	movf.s	$f4, $f13, $fcc0
	c.olt.s	$f2, $f3
	mov.s	$f5, $f3
	movt.s	$f5, $f2, $fcc0
	c.olt.s	$f1, $f3
	mov.s	$f2, $f3
	movt.s	$f2, $f1, $fcc0
	c.ule.s	$f16, $f0
	movf.s	$f0, $f16, $fcc0
	c.olt.s	$f0, $f3
	mov.s	$f1, $f3
	movt.s	$f1, $f0, $fcc0
	swc1	$f1, 12($4)
	swc1	$f2, 8($4)
	swc1	$f5, 4($4)
	c.olt.s	$f4, $f3
	movt.s	$f3, $f4, $fcc0
	jr	$ra
	swc1	$f3, 0($4)
	.set	at
	.set	macro
	.set	reorder
	.end	foo4
$tmp0:
	.size	foo4, ($tmp0)-foo4
	.cfi_endproc

Post-instcombine MIPS64:
foo4:                                   # @foo4
	.cfi_startproc
	.frame	$sp,0,$ra
	.mask 	0x00000000,0
	.fmask	0x00000000,0
	.set	noreorder
	.set	nomacro
	.set	noat
# BB#0:
	lui	$1, %hi(%neg(%gp_rel(foo4)))
	daddu	$1, $1, $25
	daddiu	$1, $1, %lo(%neg(%gp_rel(foo4)))
	ld	$1, %got_page($CPI0_0)($1)
	lwc1	$f0, %got_ofst($CPI0_0)($1)
	addiu	$1, $zero, 0
	addiu	$2, $zero, 1
	c.olt.s	$f15, $f0
	addiu	$3, $zero, 0
	movt	$3, $2, $fcc0
	mtc1	$zero, $f1
	c.olt.s	$f16, $f0
	addiu	$5, $zero, 0
	movt	$5, $2, $fcc0
	c.ule.s	$f15, $f1
	addiu	$6, $zero, 1
	movf	$6, $3, $fcc0
	c.olt.s	$f14, $f0
	addiu	$3, $zero, 0
	movt	$3, $2, $fcc0
	c.olt.s	$f13, $f0
	movt	$1, $2, $fcc0
	c.ule.s	$f16, $f1
	mov.s	$f2, $f1
	movf.s	$f2, $f16, $fcc0
	c.ule.s	$f15, $f1
	mov.s	$f3, $f1
	movf.s	$f3, $f15, $fcc0
	c.ule.s	$f14, $f1
	mov.s	$f4, $f1
	movf.s	$f4, $f14, $fcc0
	c.ule.s	$f13, $f1
	mov.s	$f5, $f1
	movf.s	$f5, $f13, $fcc0
	c.ule.s	$f14, $f1
	addiu	$7, $zero, 1
	movf	$7, $3, $fcc0
	mov.s	$f6, $f0
	movn.s	$f6, $f4, $7
	mov.s	$f4, $f0
	movn.s	$f4, $f3, $6
	c.ule.s	$f16, $f1
	addiu	$3, $zero, 1
	movf	$3, $5, $fcc0
	mov.s	$f3, $f0
	movn.s	$f3, $f2, $3
	swc1	$f3, 12($4)
	swc1	$f4, 8($4)
	swc1	$f6, 4($4)
	c.ule.s	$f13, $f1
	movf	$2, $1, $fcc0
	movn.s	$f0, $f5, $2
	jr	$ra
	swc1	$f0, 0($4)
	.set	at
	.set	macro
	.set	reorder
	.end	foo4
$tmp0:
	.size	foo4, ($tmp0)-foo4
	.cfi_endproc
