[PATCH] D36104: [AArch64] Coalesce Copy Zero during instruction selection

Wed Jun 20 13:44:30 PDT 2018

SirishP reopened this revision.
SirishP added a comment.
This revision is now accepted and ready to land.
Herald added a reviewer: javed.absar.
Herald added a subscriber: dmgreen.

I am planning to revert this change. This works with tests like test/CodeGen/AArch64/copy-zero-reg.ll. However, if there are multiple branches, this patch degrades in performance due to large number of mov instructions in each fall through. Here's an example of test case, where it degrades in performance.

target triple = "aarch64-none-linux-gnu"

%struct.foo = type {i8* }

; Function Attrs: nounwind
define void @func(i32* nocapture readonly %initval, i32* nocapture readonly %compptr, i16* nocapture readonly %coef_block) local_unnamed_addr {
entry:

  br label %for.body

for.body:                                         ; preds = %for.inc, %entry

  %inptr.0106 = phi i16* [ %coef_block, %entry ], [ %inptr.1, %for.inc ]
  %ctr.0105 = phi i32 [ 8, %entry ], [ %dec, %for.inc ]
  %qptr.0104 = phi i32* [ %compptr, %entry ], [ %qptr.1, %for.inc ]
  %wsptr.0103 = phi i32* [ %initval, %entry ], [ %wsptr.1, %for.inc ]
  %arrayidx = getelementptr inbounds i16, i16* %inptr.0106, i64 8
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx3 = getelementptr inbounds i16, i16* %inptr.0106, i64 16
  %1 = load i16, i16* %arrayidx3, align 2
  %2 = or i16 %1, %0
  %3 = icmp eq i16 %2, 0
  br i1 %3, label %land.lhs.true7, label %if.end

land.lhs.true7:                                   ; preds = %for.body

  %arrayidx8 = getelementptr inbounds i16, i16* %inptr.0106, i64 24
  %true7 = load i16, i16* %arrayidx8, align 2
  %cmp10 = icmp eq i16 %true7, 0
  br i1 %cmp10, label %land.lhs.true12, label %if.end

land.lhs.true12:                                  ; preds = %land.lhs.true7

  %arrayidx13 = getelementptr inbounds i16, i16* %inptr.0106, i64 32
  %true12 = load i16, i16* %arrayidx13, align 2
  %cmp15 = icmp eq i16 %true12, 0
  br i1 %cmp15, label %land.lhs.true17, label %if.end

land.lhs.true17:                                  ; preds = %land.lhs.true12

  %arrayidx18 = getelementptr inbounds i16, i16* %inptr.0106, i64 40
  %true17 = load i16, i16* %arrayidx18, align 2
  %cmp20 = icmp eq i16 %true17, 0
  br i1 %cmp20, label %land.lhs.true22, label %if.end

land.lhs.true22:                                  ; preds = %land.lhs.true17

  %arrayidx23 = getelementptr inbounds i16, i16* %inptr.0106, i64 48
  %true22 = load i16, i16* %arrayidx23, align 2
  %cmp25 = icmp eq i16 %true22, 0
  br i1 %cmp25, label %land.lhs.true27, label %if.end

land.lhs.true27:                                  ; preds = %land.lhs.true22

  %arrayidx28 = getelementptr inbounds i16, i16* %inptr.0106, i64 56
  %true27 = load i16, i16* %arrayidx28, align 2
  %cmp30 = icmp eq i16 %true27, 0
  br i1 %cmp30, label %if.then, label %if.end

if.then:                                          ; preds = %land.lhs.true27

  %4 = load i16, i16* %inptr.0106, align 2
  %conv33 = sext i16 %4 to i32
  %5 = load i32, i32* %qptr.0104, align 4
  %mul = shl nsw i32 %conv33, 2
  %shl = mul i32 %mul, %5
  store i32 %shl, i32* %wsptr.0103, align 4
  br label %for.inc

if.end:                                           ; preds = %land.lhs.true27, %land.lhs.true22, %land.lhs.true17, %land.lhs.true12, %land.lhs.true7, %for.body

  %6 = phi i16 [ 0, %land.lhs.true27 ], [ 0, %land.lhs.true22 ], [ 0, %land.lhs.true17 ], [ 0, %land.lhs.true12 ], [ 0, %land.lhs.true7 ], [ %1, %for.body ]
  %conv40 = sext i16 %6 to i32
  %arrayidx41 = getelementptr inbounds i32, i32* %qptr.0104, i64 16
  %7 = load i32, i32* %arrayidx41, align 4
  %mul42 = mul nsw i32 %7, %conv40
  store i32 %mul42, i32* %wsptr.0103, align 4
  br label %for.inc

for.inc:                                          ; preds = %if.end, %if.then

  %conv54.sink = phi i32 [ %mul42, %if.end ], [ %shl, %if.then ]
  %arrayidx55 = getelementptr inbounds i32, i32* %wsptr.0103, i64 56
  store i32 %conv54.sink, i32* %arrayidx55, align 4
  %wsptr.1 = getelementptr inbounds i32, i32* %wsptr.0103, i64 1
  %qptr.1 = getelementptr inbounds i32, i32* %qptr.0104, i64 1
  %inptr.1 = getelementptr inbounds i16, i16* %inptr.0106, i64 1
  %dec = add nsw i32 %ctr.0105, -1
  %cmp = icmp ugt i32 %ctr.0105, 1
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.inc

  ret void

}

This is slightly smaller version of libjpeg code.

The intention of this patch is to remove a BB that has one mov instruction. In order to do that, this patch pessmizes MachineSinking by introducing a copy, such that mov instruction is NOT moved to the BB. Optimization downstream gets rid of the BB with only mov instruction. This works well
if we have only one fall through branch as there is only one "extra" mov instruction.

If we have multiple fall throughs, we will have a lot of redundant movs. In such a case, it's better to have this BB which has one mov instruction.

This is causing degradation in jpeg, fft and other codebases. I believe if we want to remove a BB with only one branch instruction, we should not pessimize Machine Sinking at all, and find some other solution.

Repository:
  rL LLVM

https://reviews.llvm.org/D36104