[LLVMbugs] [Bug 15057] New: sign-extension folding for ARM (and other backends) depends too much on block-placement
bugzilla-daemon at llvm.org
Thu Jan 24 03:30:11 PST 2013
http://llvm.org/bugs/show_bug.cgi?id=15057
Bug #: 15057
Summary: sign-extension folding for ARM (and other backends) depends too much on block-placement
Product: new-bugs
Version: trunk
Platform: PC
OS/Version: All
Status: NEW
Severity: enhancement
Priority: P
Component: new bugs
AssignedTo: unassignedbugs at nondot.org
ReportedBy: bjorn.desutter at elis.ugent.be
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
When compiling a file test.c consisting only of

void foo(short * in, int * out, int nr) __attribute__((noinline));
void foo(short * in, int * out, int nr) {
    int i, value;
    for (i = 0; i < nr; i++) {
        value = in[i];
        if (value > 2047)
            value = 2047;
        else if (value < -2048)
            value = -2048;
        out[i] = value;
    }
    return;
}
the quality of the generated code depends too heavily on the use of -block-placement. With
clang -g -m32 -std=c89 -c -o test.bc -emit-llvm test.c
opt -O3 -o A.bc test.bc
llc -O3 A.bc -march=arm
the code is compiled into
foo:
        push    {r4, r5, r6, lr}
        cmp     r2, #0
        ble     .LBB0_5
        mov     r12, #255
        mov     lr, #255
        mov     r3, #0
        orr     r12, r12, #65280
        orr     lr, lr, #1792
.LBB0_2:
        ldrsh   r4, [r0]                @ load i16 and sign-extend
        mov     r5, lr
        cmp     r4, #2048
        bge     .LBB0_4
        and     r5, r4, r12             @ i32 to i16
        lsl     r5, r5, #16             @ and sign-extend
        asr     r6, r5, #16             @ again
        ldr     r5, .LCPI0_0
        cmn     r6, #2048
        movge   r5, r4
.LBB0_4:
        str     r5, [r1, r3, lsl #2]
        add     r3, r3, #1
        add     r0, r0, #2
        cmp     r3, r2
        blt     .LBB0_2
.LBB0_5:
        pop     {r4, r5, r6, lr}
        mov     pc, lr
Note the completely superfluous conversions commented above: r4 already holds the sign-extended value produced by the ldrsh, yet it is truncated and sign-extended again before the second comparison.
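
In C terms, the redundant and/lsl/asr sequence just recomputes a sign extension that the ldrsh already performed. A minimal sketch of the equivalence (hypothetical values, assuming the usual two's-complement int-to-short conversion):

    #include <assert.h>

    int main(void) {
        short raw = -3000;
        int v = raw;                    /* what ldrsh produces: load + sign-extend */
        int redo = (short)(v & 0xffff); /* what and + lsl #16 + asr #16 recompute */
        assert(redo == v);              /* identical, so those instructions are dead */
        return 0;
    }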
When compiling with
clang -g -m32 -std=c89 -c -o test.bc -emit-llvm test.c
opt -O3 -block-placement -o B.bc test.bc
llc -O3 B.bc -march=arm
the following much better code is generated, without superfluous conversions:
foo:
        push    {r4, lr}
        cmp     r2, #0
        ble     .LBB0_5
        mov     r12, #255
        mov     r3, #0
        orr     r12, r12, #1792
.LBB0_2:
        ldrsh   r4, [r0]
        mov     lr, r12
        cmp     r4, #2048
        bge     .LBB0_4
        ldr     lr, .LCPI0_0
        cmn     r4, #2048
        movge   lr, r4
.LBB0_4:
        str     lr, [r1, r3, lsl #2]
        add     r3, r3, #1
        add     r0, r0, #2
        cmp     r3, r2
        blt     .LBB0_2
.LBB0_5:
        pop     {r4, lr}
        mov     pc, lr
Apparently, in this case the sign extension in the bitcode gets moved and folded much better.
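
The ARM backend can fold a load and an adjacent sign extension into a single ldrsh, but roughly only when both are visible to instruction selection together. A minimal sketch (hypothetical function, for illustration only):

    /* A short loaded and immediately widened to int should select
       to a single ldrsh on ARM. */
    int widen(short *p) {
        return *p;
    }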
The good code can also be obtained by
clang -g -m32 -std=c89 -c -o test.bc -emit-llvm test.c
opt -O3 -block-placement -o C1.bc test.bc
opt -O3 -o C2.bc C1.bc
llc -O3 C2.bc -march=arm
and the bad code with
clang -g -m32 -std=c89 -c -o test.bc -emit-llvm test.c
opt -O3 -o D1.bc test.bc
opt -O3 -block-placement -o D2.bc D1.bc
llc -O3 D2.bc -march=arm
Note that between these two compilations, only the order of the two opt runs has changed. So it is not enough that -block-placement is enabled; it must also be enabled early enough in the optimization pipeline, or the chance to optimize the code is gone.
To see what is going wrong, here are the relevant disassembled bitcode fragments of the two versions:
C1 and C2 (which lead to the good code) contain:

  %0 = load i16* %arrayidx, align 2, !dbg !15
  %conv = sext i16 %0 to i32, !dbg !15
  %cmp1 = icmp sgt i16 %0, 2047, !dbg !25
  br i1 %cmp1, label %if.end6, label %if.else, !dbg !25

if.else:                                          ; preds = %for.body
  %cmp3 = icmp slt i32 %conv, -2048, !dbg !26
  %.conv = select i1 %cmp3, i32 -2048, i32 %conv, !dbg !26
  br label %if.end6, !dbg !26
whereas both D1 and D2 contain:

  %0 = load i16* %arrayidx, align 2, !dbg !21
  tail call void @llvm.dbg.value(metadata !{i32 %conv}, i64 0, metadata !23), !dbg !21
  %cmp1 = icmp sgt i16 %0, 2047, !dbg !24
  br i1 %cmp1, label %if.end6, label %if.else, !dbg !24

if.else:                                          ; preds = %for.body
  %conv = sext i16 %0 to i32, !dbg !21
  %cmp3 = icmp slt i16 %0, -2048, !dbg !25
  %.conv = select i1 %cmp3, i32 -2048, i32 %conv, !dbg !25
  br label %if.end6, !dbg !25
This is the version that leads to the bad code: the sext has been sunk into the if.else block, away from the load, so instruction selection can no longer fold it into the ldrsh and instead re-extends a value that is already sign-extended.
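
A C-level rendering of the two shapes (hypothetical function names; a compiler may well canonicalize both to the same IR, so this only mirrors the structure of the fragments above):

    /* C1/C2 shape: the widening sits next to the load and folds
       into the ldrsh. */
    int clamp_good(short *in, int i) {
        int conv = in[i];
        if (conv > 2047)
            return 2047;
        return (conv < -2048) ? -2048 : conv;
    }

    /* D1/D2 shape: the widening happens only on the else path, in a
       different block from the load, forcing a re-extension there. */
    int clamp_bad(short *in, int i) {
        short raw = in[i];
        if (raw > 2047)
            return 2047;
        return (raw < -2048) ? -2048 : (int)raw;
    }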
Bjorn