[llvm] r201275 - Remove a very old instcombine where we would turn sequences of selects into
Owen Anderson
resistor at mac.com
Wed Feb 12 16:34:51 PST 2014
On Feb 12, 2014, at 4:10 PM, Nadav Rotem <nrotem at apple.com> wrote:
>
> On Feb 12, 2014, at 4:08 PM, Chandler Carruth <chandlerc at google.com> wrote:
>
>>
>> On Wed, Feb 12, 2014 at 4:05 PM, Nadav Rotem <nrotem at apple.com> wrote:
>> On Feb 12, 2014, at 3:54 PM, Owen Anderson <resistor at mac.com> wrote:
>>
>> > Author: resistor
>> > Date: Wed Feb 12 17:54:07 2014
>> > New Revision: 201275
>> >
>> > URL: http://llvm.org/viewvc/llvm-project?rev=201275&view=rev
>> > Log:
>> > Remove a very old instcombine where
>>
>> For vectors this optimization is always a win. Can you please bring it back and add a check that the comparison type is a vector?
>>
>> Do you have test cases which regressed? I find it odd that I saw no regressions from any of the heavily auto-vectorized benchmarks on x86-64.
>
> I don’t have a specific testcase that regressed; this is purely theoretical.
I don’t observe this optimization actually being profitable for vectors in practice. I haven’t found a single target where applying that instcombine resulted in better code. Below is the IR before and after the combine, along with the code that several backends generate for each version.
—Owen
Pre-instcombine IR:
define <4 x float> @foo4(<4 x float> %a) #0 {
%b = fcmp ogt <4 x float> %a, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
%c = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
%d = fcmp olt <4 x float> %c, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%e = select <4 x i1> %b, <4 x float> %a, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
%f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
ret <4 x float> %f
}
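
The pre-instcombine IR is just a per-lane clamp of %a to [0, 1]. In rough C terms (a hypothetical scalar rendering of one lane, reconstructed from the IR above, not the original test source):

/* One lane of @foo4: clamp a to the range [0, 1]. */
static inline float clamp01(float a) {
    float c = (a > 0.0f) ? a : 0.0f;   /* %b and %c: max(a, 0) */
    return (c < 1.0f) ? c : 1.0f;      /* %d and %f: min(c, 1) */
}

That shape is why the pre-instcombine x86 output further down is just a maxps followed by a minps.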
Post-instcombine IR:
define <4 x float> @foo4(<4 x float> %a) #0 {
%b = fcmp ogt <4 x float> %a, zeroinitializer
%d1 = fcmp olt <4 x float> %a, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%d = select <4 x i1> %b, <4 x i1> %d1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
%e = select <4 x i1> %b, <4 x float> %a, <4 x float> zeroinitializer
%f = select <4 x i1> %d, <4 x float> %e, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
ret <4 x float> %f
}
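
Comparing the two, the combine rewrote %d, the compare on the inner select's result, into a compare on %a plus a select over the <4 x i1> conditions. Per lane that change looks roughly like this (my C paraphrase of the IR above; the names are made up):

#include <stdbool.h>

/* %d before the combine: compare the result of the inner select. */
static inline bool d_before(float a, bool b) {
    float c = b ? a : 0.0f;    /* %c = select %b, %a, 0.0 */
    return c < 1.0f;           /* %d = fcmp olt %c, 1.0 */
}

/* %d after the combine: compare %a directly and select between the two
 * possible compare results; the 0.0 < 1.0 arm folds to true. */
static inline bool d_after(float a, bool b) {
    bool d1 = a < 1.0f;        /* %d1 = fcmp olt %a, 1.0 */
    return b ? d1 : true;      /* %d = select %b, %d1, true */
}

The vector backends below then have to materialize that <4 x i1> select with mask arithmetic, which appears to be where most of the extra instructions come from.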
Pre-instcombine X86:
_foo4: ## @foo4
.cfi_startproc
## BB#0:
xorps %xmm1, %xmm1
maxps %xmm1, %xmm0
minps LCPI0_0(%rip), %xmm0
retq
.cfi_endproc
Post-instcombine X86:
_foo4: ## @foo4
.cfi_startproc
## BB#0:
vxorps %xmm1, %xmm1, %xmm1
vcmpltps %xmm0, %xmm1, %xmm2
vmovaps LCPI0_0(%rip), %xmm3
vcmpltps %xmm3, %xmm0, %xmm4
vmovaps LCPI0_1(%rip), %xmm5
vblendvps %xmm2, %xmm4, %xmm5, %xmm2
vpslld $31, %xmm2, %xmm2
vmaxps %xmm1, %xmm0, %xmm0
vblendvps %xmm2, %xmm0, %xmm3, %xmm0
retq
.cfi_endproc
Pre-instcombine ARMv7:
_foo4: @ @foo4
@ BB#0:
vmov d17, r2, r3
vmov d16, r0, r1
vcgt.f32 q10, q8, #0
vmov.f32 q9, #1.000000e+00
vand q8, q8, q10
vcgt.f32 q10, q9, q8
vbsl q10, q8, q9
vmov r0, r1, d20
vmov r2, r3, d21
bx lr
Post-instcombine ARMv7:
_foo4: @ @foo4
@ BB#0:
vmov d17, r2, r3
vmov.f32 q9, #1.000000e+00
vmov d16, r0, r1
vmov.i16 d20, #0xf
vcgt.f32 q12, q8, #0
vmovn.i32 d26, q12
vneg.s16 d20, d20
vcgt.f32 q11, q9, q8
vshl.i16 d26, d26, #15
vand q8, q8, q12
vshl.s16 d20, d26, d20
vmov.f64 d26, #5.000000e-01
vmovn.i32 d22, q11
vmov.i16 d26, #0x1
vbsl d20, d22, d26
vmov.i32 q11, #0x1f
vmovl.u16 q10, d20
vneg.s32 q11, q11
vshl.i32 q10, q10, #31
vshl.s32 q10, q10, q11
vbsl q10, q8, q9
vmov r0, r1, d20
vmov r2, r3, d21
bx lr
Pre-instcombine PPC64 w/ Altivec:
_foo4: ; @foo4
.cfi_startproc
; BB#0:
li r12, lo16(LCPI0_0)
lis r4, ha16(LCPI0_0)
vxor v3, v3, v3
lvx v4, r4, r12
vcmpgtfp v3, v2, v3
vand v2, v2, v3
mfspr r2, 256
oris r3, r2, 6144
vcmpgtfp v3, v4, v2
vandc v4, v4, v3
vand v2, v2, v3
vor v2, v2, v4
mtspr 256, r3
mtspr 256, r2
blr
.cfi_endproc
Post-instcombine PPC64 w/ Altivec:
_foo4: ; @foo4
.cfi_startproc
; BB#0:
li r12, lo16(LCPI0_0)
lis r4, ha16(LCPI0_0)
vxor v3, v3, v3
vspltisw v5, 1
lvx v4, r4, r12
vcmpgtfp v3, v2, v3
vspltisw v1, -16
vspltisw v6, 15
vandc v5, v5, v3
vsubuwm v1, v6, v1
vand v18, v2, v3
mfspr r2, 256
vcmpgtfp v0, v4, v2
oris r3, r2, 56832
vand v0, v0, v3
vor v5, v0, v5
vslw v5, v5, v1
vsraw v5, v5, v1
vandc v19, v4, v5
vand v2, v18, v5
vor v2, v2, v19
mtspr 256, r3
mtspr 256, r2
blr
.cfi_endproc
Pre-instcombine MIPS64:
foo4: # @foo4
.cfi_startproc
.frame $sp,0,$ra
.mask 0x00000000,0
.fmask 0x00000000,0
.set noreorder
.set nomacro
.set noat
# BB#0:
lui $1, %hi(%neg(%gp_rel(foo4)))
daddu $1, $1, $25
daddiu $1, $1, %lo(%neg(%gp_rel(foo4)))
mtc1 $zero, $f0
c.ule.s $f15, $f0
mov.s $f1, $f0
movf.s $f1, $f15, $fcc0
c.ule.s $f14, $f0
mov.s $f2, $f0
movf.s $f2, $f14, $fcc0
ld $1, %got_page($CPI0_0)($1)
lwc1 $f3, %got_ofst($CPI0_0)($1)
c.ule.s $f13, $f0
mov.s $f4, $f0
movf.s $f4, $f13, $fcc0
c.olt.s $f2, $f3
mov.s $f5, $f3
movt.s $f5, $f2, $fcc0
c.olt.s $f1, $f3
mov.s $f2, $f3
movt.s $f2, $f1, $fcc0
c.ule.s $f16, $f0
movf.s $f0, $f16, $fcc0
c.olt.s $f0, $f3
mov.s $f1, $f3
movt.s $f1, $f0, $fcc0
swc1 $f1, 12($4)
swc1 $f2, 8($4)
swc1 $f5, 4($4)
c.olt.s $f4, $f3
movt.s $f3, $f4, $fcc0
jr $ra
swc1 $f3, 0($4)
.set at
.set macro
.set reorder
.end foo4
$tmp0:
.size foo4, ($tmp0)-foo4
.cfi_endproc
Post-instcombine MIPS64:
foo4: # @foo4
.cfi_startproc
.frame $sp,0,$ra
.mask 0x00000000,0
.fmask 0x00000000,0
.set noreorder
.set nomacro
.set noat
# BB#0:
lui $1, %hi(%neg(%gp_rel(foo4)))
daddu $1, $1, $25
daddiu $1, $1, %lo(%neg(%gp_rel(foo4)))
ld $1, %got_page($CPI0_0)($1)
lwc1 $f0, %got_ofst($CPI0_0)($1)
addiu $1, $zero, 0
addiu $2, $zero, 1
c.olt.s $f15, $f0
addiu $3, $zero, 0
movt $3, $2, $fcc0
mtc1 $zero, $f1
c.olt.s $f16, $f0
addiu $5, $zero, 0
movt $5, $2, $fcc0
c.ule.s $f15, $f1
addiu $6, $zero, 1
movf $6, $3, $fcc0
c.olt.s $f14, $f0
addiu $3, $zero, 0
movt $3, $2, $fcc0
c.olt.s $f13, $f0
movt $1, $2, $fcc0
c.ule.s $f16, $f1
mov.s $f2, $f1
movf.s $f2, $f16, $fcc0
c.ule.s $f15, $f1
mov.s $f3, $f1
movf.s $f3, $f15, $fcc0
c.ule.s $f14, $f1
mov.s $f4, $f1
movf.s $f4, $f14, $fcc0
c.ule.s $f13, $f1
mov.s $f5, $f1
movf.s $f5, $f13, $fcc0
c.ule.s $f14, $f1
addiu $7, $zero, 1
movf $7, $3, $fcc0
mov.s $f6, $f0
movn.s $f6, $f4, $7
mov.s $f4, $f0
movn.s $f4, $f3, $6
c.ule.s $f16, $f1
addiu $3, $zero, 1
movf $3, $5, $fcc0
mov.s $f3, $f0
movn.s $f3, $f2, $3
swc1 $f3, 12($4)
swc1 $f4, 8($4)
swc1 $f6, 4($4)
c.ule.s $f13, $f1
movf $2, $1, $fcc0
movn.s $f0, $f5, $2
jr $ra
swc1 $f0, 0($4)
.set at
.set macro
.set reorder
.end foo4
$tmp0:
.size foo4, ($tmp0)-foo4
.cfi_endproc