[PATCH] D13417: [MachineCombiner] make slack optional in critical path cost calculation (PR25016)

Gerolf Hoflehner via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 3 19:39:40 PST 2015


I think we need your patch. It would also be nice to have a test case that shows an actual increase in critical path length and a measurable performance difference. In the test cases so far, the vadd sequences take longer w/o actually increasing the critical path. So, just by looking at a reassociation case in isolation, I wouldn’t worry about modifying the heuristic. However, in a scenario, e.g., with reassoc first and then a mul+add -> madd, the bigger depth of the re-associated instruction could result in not performing the mul+add combine later. It could be that the extra depth may have eaten the slack the madd needs to fit within the critical path. 

Also, since reassoc attempts to reduce depth, shouldn’t the equation ignore latencies, and the code finally look something like: if (MustReduceDepth) return NewDepth < OldDepth; else if (MustReduceCPL) return NewCycleCount < OldCycleCount; else return NewCycleCount <= OldCycleCount?

Thanks for the effort and for being persistent.

Cheers
Gerolf
> On Oct 23, 2015, at 9:04 AM, Sanjay Patel <spatel at rotateright.com> wrote:
> 
> spatel added a comment.
> 
> And here's the MTM debug output showing why reassociation is happening: in the original sequence, the depth of the last fadd is '6' as expected from 2 dependent ops, but the height of this sequence increases the critical path to '10'. Machine combiner currently uses that info to trigger a reassociation, but this increases the depth of the fadds to '9'.
> 
>  Depths for BB#0:
>      0 Instructions
>  0	%vreg4<def> = COPY %RDI; GR64:%vreg4
>  0	%vreg3<def> = COPY %XMM3; FR64:%vreg3
>  0	%vreg2<def> = COPY %XMM2; FR64:%vreg2
>  0	%vreg1<def> = COPY %XMM1; FR64:%vreg1
>  0	%vreg0<def> = COPY %XMM0; FR64:%vreg0
>  0	%vreg5<def> = MOV32rm %vreg4, 1, %noreg, 0, %noreg; mem:LD4[%b01] GR32:%vreg5 GR64:%vreg4
>  0	%vreg6<def> = MOV32rm %vreg4, 1, %noreg, 24, %noreg; mem:LD4[%b3] GR32:%vreg6 GR64:%vreg4
>  0	%vreg7<def,tied1> = ADD32rm %vreg5<tied0>, %vreg4, 1, %noreg, 8, %noreg, %EFLAGS<imp-def,dead>; mem:LD4[%b1] GR32:%vreg7,%vreg5 GR64:%vreg4
>  0	%vreg8<def,tied1> = ADD32rm %vreg6<tied0>, %vreg4, 1, %noreg, 16, %noreg, %EFLAGS<imp-def,dead>; mem:LD4[%b2] GR32:%vreg8,%vreg6 GR64:%vreg4
>  5	%vreg9<def,tied1> = ADD32rr %vreg7<tied0>, %vreg8<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg9,%vreg7,%vreg8
>  6	%vreg10<def,tied1> = ADD32rr %vreg9<tied0>, %vreg6, %EFLAGS<imp-def,dead>; GR32:%vreg10,%vreg9,%vreg6
>  7	%vreg11<def,tied1> = ADD32rr %vreg9<tied0>, %vreg10<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg11,%vreg9,%vreg10
>  8	%vreg12<def,tied1> = ADD32rr %vreg11<tied0>, %vreg11, %EFLAGS<imp-def,dead>; GR32:%vreg12,%vreg11,%vreg11
>  9	%vreg13<def,tied1> = ADD32rr %vreg11<tied0>, %vreg12<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg13,%vreg11,%vreg12
>  10	MOV32mr %vreg4, 1, %noreg, 24, %noreg, %vreg13<kill>; mem:ST4[%b3] GR64:%vreg4 GR32:%vreg13
>  0	%vreg14<def> = VADDSDrr %vreg0, %vreg1; FR64:%vreg14,%vreg0,%vreg1
>  0	%vreg15<def> = VADDSDrr %vreg2, %vreg3; FR64:%vreg15,%vreg2,%vreg3
>  3	%vreg16<def> = VADDSDrr %vreg14<kill>, %vreg15<kill>; FR64:%vreg16,%vreg14,%vreg15
>  6	%XMM0<def> = COPY %vreg16; FR64:%vreg16
>  6	RETQ %XMM0
>  Heights for BB#0:
>     14 Instructions
>     3c @ SBPort1 (3 ops x12)
>     1c @ SBPort4 (1 ops x12)
>     1c @ SBPort5 (1 ops x12)
>     1c @ SBPort05 (1 ops x6)
>     2c @ SBPort15 (4 ops x6)
>     3c @ SBPort23 (6 ops x6)
>     4c @ SBPort015 (11 ops x4)
>     3c @ SBPortAny (18 ops x2)
>  6	0	RETQ %XMM0
>  6	0	%XMM0<def> = COPY %vreg16; FR64:%vreg16
>  6	3	%vreg16<def> = VADDSDrr %vreg14<kill>, %vreg15<kill>; FR64:%vreg16,%vreg14,%vreg15
>  6	6	%vreg15<def> = VADDSDrr %vreg2, %vreg3; FR64:%vreg15,%vreg2,%vreg3
>  6	6	%vreg14<def> = VADDSDrr %vreg0, %vreg1; FR64:%vreg14,%vreg0,%vreg1
>  10	0	MOV32mr %vreg4, 1, %noreg, 24, %noreg, %vreg13<kill>; mem:ST4[%b3] GR64:%vreg4 GR32:%vreg13
>  10	1	%vreg13<def,tied1> = ADD32rr %vreg11<tied0>, %vreg12<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg13,%vreg11,%vreg12
>  10	2	%vreg12<def,tied1> = ADD32rr %vreg11<tied0>, %vreg11, %EFLAGS<imp-def,dead>; GR32:%vreg12,%vreg11,%vreg11
>  10	3	%vreg11<def,tied1> = ADD32rr %vreg9<tied0>, %vreg10<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg11,%vreg9,%vreg10
>  10	4	%vreg10<def,tied1> = ADD32rr %vreg9<tied0>, %vreg6, %EFLAGS<imp-def,dead>; GR32:%vreg10,%vreg9,%vreg6
>  10	5	%vreg9<def,tied1> = ADD32rr %vreg7<tied0>, %vreg8<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg9,%vreg7,%vreg8
>  10	10	%vreg8<def,tied1> = ADD32rm %vreg6<tied0>, %vreg4, 1, %noreg, 16, %noreg, %EFLAGS<imp-def,dead>; mem:LD4[%b2] GR32:%vreg8,%vreg6 GR64:%vreg4
>  10	10	%vreg7<def,tied1> = ADD32rm %vreg5<tied0>, %vreg4, 1, %noreg, 8, %noreg, %EFLAGS<imp-def,dead>; mem:LD4[%b1] GR32:%vreg7,%vreg5 GR64:%vreg4
>  10	10	%vreg6<def> = MOV32rm %vreg4, 1, %noreg, 24, %noreg; mem:LD4[%b3] GR32:%vreg6 GR64:%vreg4
>  10	10	%vreg5<def> = MOV32rm %vreg4, 1, %noreg, 0, %noreg; mem:LD4[%b01] GR32:%vreg5 GR64:%vreg4
>  10	6	%vreg0<def> = COPY %XMM0; FR64:%vreg0
>  10	6	%vreg1<def> = COPY %XMM1; FR64:%vreg1
>  10	6	%vreg2<def> = COPY %XMM2; FR64:%vreg2
>  10	6	%vreg3<def> = COPY %XMM3; FR64:%vreg3
>  10	10	%vreg4<def> = COPY %RDI; GR64:%vreg4
>  BB#0 Live-ins: XMM0 at 6 XMM1 at 6 XMM2 at 6 XMM3 at 6 DIL at 10
>  Critical path: 10
>  NEW INSTR   %vreg17<def> = UNKNOWN %vreg1, %vreg15<kill>
> 
>  NEW INSTR   %vreg16<def> = UNKNOWN %vreg0, %vreg17<kill>
> 
>  DEPENDENCE DATA FOR 0x10884d880
>   NewRootDepth: 6 NewRootLatency: 3
>   RootDepth: 3 RootLatency: 3 RootSlack: 4
>   NewRootDepth + NewRootLatency = 9
>   RootDepth + RootLatency + RootSlack = 10
> 
> Depths after reassociation:
> 
>  Depths for BB#0:
>      0 Instructions
>  0	%vreg4<def> = COPY %RDI; GR64:%vreg4
>  0	%vreg3<def> = COPY %XMM3; FR64:%vreg3
>  0	%vreg2<def> = COPY %XMM2; FR64:%vreg2
>  0	%vreg1<def> = COPY %XMM1; FR64:%vreg1
>  0	%vreg0<def> = COPY %XMM0; FR64:%vreg0
>  0	%vreg5<def> = MOV32rm %vreg4, 1, %noreg, 0, %noreg; mem:LD4[%b01] GR32:%vreg5 GR64:%vreg4
>  0	%vreg6<def> = MOV32rm %vreg4, 1, %noreg, 24, %noreg; mem:LD4[%b3] GR32:%vreg6 GR64:%vreg4
>  0	%vreg7<def,tied1> = ADD32rm %vreg5<tied0>, %vreg4, 1, %noreg, 8, %noreg, %EFLAGS<imp-def,dead>; mem:LD4[%b1] GR32:%vreg7,%vreg5 GR64:%vreg4
>  0	%vreg8<def,tied1> = ADD32rm %vreg6<tied0>, %vreg4, 1, %noreg, 16, %noreg, %EFLAGS<imp-def,dead>; mem:LD4[%b2] GR32:%vreg8,%vreg6 GR64:%vreg4
>  5	%vreg9<def,tied1> = ADD32rr %vreg7<tied0>, %vreg8<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg9,%vreg7,%vreg8
>  6	%vreg10<def,tied1> = ADD32rr %vreg9<tied0>, %vreg6, %EFLAGS<imp-def,dead>; GR32:%vreg10,%vreg9,%vreg6
>  7	%vreg11<def,tied1> = ADD32rr %vreg9<tied0>, %vreg10<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg11,%vreg9,%vreg10
>  8	%vreg12<def,tied1> = ADD32rr %vreg11<tied0>, %vreg11, %EFLAGS<imp-def,dead>; GR32:%vreg12,%vreg11,%vreg11
>  9	%vreg13<def,tied1> = ADD32rr %vreg11<tied0>, %vreg12<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg13,%vreg11,%vreg12
>  10	MOV32mr %vreg4, 1, %noreg, 24, %noreg, %vreg13<kill>; mem:ST4[%b3] GR64:%vreg4 GR32:%vreg13
>  0	%vreg15<def> = VADDSDrr %vreg2, %vreg3; FR64:%vreg15,%vreg2,%vreg3
>  3	%vreg17<def> = VADDSDrr %vreg1, %vreg15<kill>; FR64:%vreg17,%vreg1,%vreg15
>  6	%vreg16<def> = VADDSDrr %vreg0, %vreg17<kill>; FR64:%vreg16,%vreg0,%vreg17
>  9	%XMM0<def> = COPY %vreg16; FR64:%vreg16
>  9	RETQ %XMM0
> 
> 
> http://reviews.llvm.org/D13417
> 
> 
> 



More information about the llvm-commits mailing list