[PATCH] D48102: Improve handling of COPY instructions with identical value numbers

Mon Jun 18 08:29:18 PDT 2018

tpr added a comment.

  ********** INTERVALS **********
  %67 [3184r,3216r:0)  0 at 3184r weight:0.000000e+00
  %72 [384B,544r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000004 [384B,544r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000002 [384B,528r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000001 [384B,528r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000008 [2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3184r:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r weight:0.000000e+00
  %86 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L00000004 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L0000000B [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi weight:0.000000e+00

  ********** MACHINEINSTRS **********
  # Machine code for function _amdgpu_cs_main: NoPHIs, TracksLiveness

  2864B	bb.66.Flow:
  	; predecessors: %bb.63, %bb.67
  	  successors: %bb.68(0x80000000); %bb.68(100.00%)

  2896B	  S_BRANCH %bb.68

  2912B	bb.67 (%ir-block.152):
  	; predecessors: %bb.64, %bb.65
  	  successors: %bb.66(0x80000000); %bb.66(100.00%)

  2928B	  $exec = S_OR_B64 $exec, %5:sreg_64, implicit-def $scc
  2944B	  %8:vreg_1 = COPY %89:vreg_1
  2960B	  %40:sreg_64_xexec = V_CMP_NE_U32_e64 0, %8:vreg_1, implicit $exec
  2976B	  %39:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %40:sreg_64_xexec, implicit $exec
  2992B	  %82:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
  3008B	  undef %81.sub0:vreg_128 = COPY %82:vgpr_32
  3024B	  %81.sub1:vreg_128 = COPY %82:vgpr_32
  3040B	  %81.sub2:vreg_128 = COPY %39:vgpr_32
  3056B	  %73:vreg_128 = COPY %81:vreg_128
  3072B	  %72:vreg_128 = COPY %73:vreg_128
  3088B	  S_BRANCH %bb.66

  3104B	bb.68 (%ir-block.156):
  	; predecessors: %bb.66
  	  successors: %bb.4(0x04000000), %bb.1(0x7c000000); %bb.4(3.12%), %bb.1(96.88%)

  3120B	  %64:vgpr_32 = V_ADD_I32_e32 32, %85:vgpr_32, implicit-def dead $vcc, implicit $exec
  3136B	  V_CMP_EQ_U32_e32 0, %64:vgpr_32, implicit-def $vcc, implicit $exec
  3152B	  %63:vgpr_32 = COPY %64:vgpr_32
  3168B	  $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
  3184B	  %67:vreg_128 = COPY %72:vreg_128
  3200B	  %85:vgpr_32 = COPY %63:vgpr_32
  3216B	  %86:vreg_128 = COPY %67:vreg_128
  3232B	  S_CBRANCH_VCCNZ %bb.4, implicit killed $vcc
  3248B	  S_BRANCH %bb.1

  # End machine code for function _amdgpu_cs_main.

  2576B	%72:vreg_128 = COPY %86:vreg_128
  	Considering merging to VReg_128 with %72 in %86
  		RHS = %72 [384B,544r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000004 [384B,544r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000002 [384B,528r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000001 [384B,528r:0)[2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3264B:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r L00000008 [2576r,2608B:2)[2864B,2912B:0)[3072r,3104B:1)[3104B,3184r:0)  0 at 2864B-phi 1 at 3072r 2 at 2576r weight:0.000000e+00
  		LHS = %86 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L00000004 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L0000000B [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi weight:0.000000e+00
  		merge %86:0 at 3216r into %72:0 at 2864B --> @2864B
  		merge %72:2 at 2576r into %86:4 at 1920B --> @1920B
  		LHST = %86 %86 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L00000004 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L0000000B [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi weight:0.000000e+00
  		merge %86:0 at 3216r into %72:0 at 2864B --> @2864B
  		merge %72:2 at 2576r into %86:4 at 1920B --> @1920B
  		joined lanes: [224r,240B:1)[240B,384B:2)[384B,544r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2624r:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r
  		merge %86:0 at 3216r into %72:0 at 2864B --> @2864B
  		merge %72:2 at 2576r into %86:4 at 1920B --> @1920B
  		joined lanes: [224r,240B:1)[240B,384B:2)[384B,528r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r
  		merge %86:0 at 3216r into %72:0 at 2864B --> @2864B
  		merge %72:2 at 2576r into %86:4 at 1920B --> @1920B
  		joined lanes: [224r,240B:1)[240B,384B:2)[384B,528r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r
  		merge %72:2 at 2576r into %86:4 at 1920B --> @1920B
  		joined lanes: [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:5)[3072r,3104B:6)[3104B,3184r:5)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 2864B-phi 6 at 3072r
  	Joined SubRanges %86 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2576r:4)[2608B,2624r:4)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi L00000001 [224r,240B:1)[240B,384B:2)[384B,528r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r L00000002 [224r,240B:1)[240B,384B:2)[384B,528r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r L00000004 [224r,240B:1)[240B,384B:2)[384B,544r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2624r:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r L00000008 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:5)[3072r,3104B:6)[3104B,3184r:5)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 2864B-phi 6 at 3072r weight:0.000000e+00
  		Expecting instruction removal at 3216r
  		 checking:  L00000001 [224r,240B:1)[240B,384B:2)[384B,528r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r
  		 checking:  L00000002 [224r,240B:1)[240B,384B:2)[384B,528r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r
  		 checking:  L00000004 [224r,240B:1)[240B,384B:2)[384B,544r:0)[640B,1808B:2)[1904r,1920B:3)[1920B,2624r:4)[2864B,2912B:0)[3072r,3104B:5)[3104B,3264B:0)  0 at 2864B-phi 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 3072r
  		 checking:  L00000008 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,2912B:5)[3072r,3104B:6)[3104B,3184r:5)[3216r,3264B:0)  0 at 3216r 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 2864B-phi 6 at 3072r
  		Prune sublane 00000008 at 3216r
  Assertion failed: (I->end <= std::next(I)->start), function verify, file ../lib/CodeGen/LiveInterval.cpp, line 1022.

  (lldb) up 5
  frame #5: 0x00000001012cf283 llc`(anonymous namespace)::JoinVals::pruneSubRegValues(this=0x00007fff5fbfa9e8, LI=0x0000000106b0a9e0, ShrinkMask=0x0000000106d00be0) + 1619 at RegisterCoalescer.cpp:2872
     2869	        }
     2870	        // Mark value number as unused.
     2871	        ValueOut->markUnused();
  -> 2872	        S.verify();
     2873	        continue;
     2874	      }
     2875	      // If a subrange ends at the copy, then a value was copied but only
  (lldb) p S.dump()
   L00000008 [224r,240B:1)[240B,384B:2)[640B,1808B:2)[1904r,1920B:3)[1920B,2608B:4)[2864B,3264B:5)[3072r,3104B:6)[3104B,3184r:5)  0 at x 1 at 224r 2 at 240B-phi 3 at 1904r 4 at 1920B-phi 5 at 2864B-phi 6 at 3072r

The coalescer was trying to join %72 and %86, and it applied the optimization where it notices that a value of %86 is copied (at 3216) from %67, which is itself copied (at 3184) from %72. %72's value at that point is 5 at 2864B-phi, which reaches 3184 by branching over [2912B,3104B), a range in which another value, 6 at 3072r, is defined.

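In lane L00000008, it wants to remove the segment [3216r,3264B:0) and extend [2864B,2912B:5) to 3264B, but it ends up with the single segment [2864B,3264B:5), whereas it should have [2864B,2912B:5)[3104B,3264B:5). The over-long segment overlaps the following segment [3072r,3104B:6), which is what trips the (I->end <= std::next(I)->start) assertion in verify().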

Repository:
  rL LLVM

https://reviews.llvm.org/D48102