[PATCH] D16836: [CodeGenPrepare] Don't transform select instructions into branches when both of operands are cheap

Junmo Park via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 2 23:07:23 PST 2016

flyingforyou added a comment.

  for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
    %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
    %j.126 = phi i32 [ -1, %for.cond1.preheader ], [ %j.2, %for.body4 ]
    %scevgep = getelementptr float, float* getelementptr inbounds (%struct.GlobalData, %struct.GlobalData* @global_data, i32 0, i32 0, i32 0), i64 %indvars.iv
    %3 = load float, float* %scevgep, align 4, !tbaa !5
    %cmp5 = fcmp olt float %3, 0.000000e+00
    %tmp = trunc i64 %indvars.iv to i32
    %j.2 = select i1 %cmp5, i32 %tmp, i32 %j.126
    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
    %exitcond = icmp eq i64 32000, %indvars.iv.next
    br i1 %exitcond, label %for.cond.cleanup3, label %for.body4

In this case, %tmp and %j.126 are the targets for sinking.

This code is translated to arm64 as shown below.
After the patch:

  40b684:	8b080e69 	add	x9, x19, x8, lsl #3
  40b688:	6d400520 	ldp	d0, d1, [x9]
  40b68c:	1e602008 	fcmp	d0, #0.0
  40b690:	1a94410a 	csel	w10, w8, w20, mi
  40b694:	1e602028 	fcmp	d1, #0.0
  40b698:	6d410520 	ldp	d0, d1, [x9,#16]
  40b69c:	91000508 	add	x8, x8, #0x1
  40b6a0:	1a8a410a 	csel	w10, w8, w10, mi
  40b6a4:	1e602008 	fcmp	d0, #0.0
  40b6a8:	fd401120 	ldr	d0, [x9,#32]
  40b6ac:	1a88554a 	csinc	w10, w10, w8, pl
  40b6b0:	1e602028 	fcmp	d1, #0.0
  40b6b4:	91000908 	add	x8, x8, #0x2
  40b6b8:	1a8a410a 	csel	w10, w8, w10, mi
  40b6bc:	1e602008 	fcmp	d0, #0.0
  40b6c0:	1a885554 	csinc	w20, w10, w8, pl
  40b6c4:	91000908 	add	x8, x8, #0x2
  40b6c8:	eb1b011f 	cmp	x8, x27
  40b6cc:	54fffdc1 	b.ne	40b684 <s331+0xb4>

Before the patch:

  40b684:	fc697a60 	ldr	d0, [x19,x9,lsl #3]
  40b688:	2a0903e0 	mov	w0, w9
  40b68c:	1e602008 	fcmp	d0, #0.0
  40b690:	54000044 	b.mi	40b698 <s331+0xc8>
  40b694:	2a1403e0 	mov	w0, w20
  40b698:	8b090e68 	add	x8, x19, x9, lsl #3
  40b69c:	91000529 	add	x9, x9, #0x1
  40b6a0:	2a0903e1 	mov	w1, w9
  40b6a4:	fd400500 	ldr	d0, [x8,#8]
  40b6a8:	1e602008 	fcmp	d0, #0.0
  40b6ac:	54000044 	b.mi	40b6b4 <s331+0xe4>
  40b6b0:	2a0003e1 	mov	w1, w0
  40b6b4:	fd400900 	ldr	d0, [x8,#16]
  40b6b8:	91000529 	add	x9, x9, #0x1
  40b6bc:	2a0903e0 	mov	w0, w9
  40b6c0:	1e602008 	fcmp	d0, #0.0
  40b6c4:	54000044 	b.mi	40b6cc <s331+0xfc>
  40b6c8:	2a0103e0 	mov	w0, w1
  40b6cc:	fd400d00 	ldr	d0, [x8,#24]
  40b6d0:	91000529 	add	x9, x9, #0x1
  40b6d4:	2a0903e1 	mov	w1, w9
  40b6d8:	1e602008 	fcmp	d0, #0.0
  40b6dc:	54000044 	b.mi	40b6e4 <s331+0x114>
  40b6e0:	2a0003e1 	mov	w1, w0
  40b6e4:	fd401100 	ldr	d0, [x8,#32]
  40b6e8:	91000528 	add	x8, x9, #0x1
  40b6ec:	2a0803f4 	mov	w20, w8
  40b6f0:	1e602008 	fcmp	d0, #0.0
  40b6f4:	54000044 	b.mi	40b6fc <s331+0x12c>
  40b6f8:	2a0103f4 	mov	w20, w1
  40b6fc:	91000509 	add	x9, x8, #0x1
  40b700:	eb1b013f 	cmp	x9, x27
  40b704:	54fffc01 	b.ne	40b684 <s331+0xb4>

We transform the select into a branch just to hide a mov instruction. I don't think that is a proper transformation with regard to performance.

However, I will consider adding some limitation, as you said, Hal. If you have any suggestions, please let me know.


