[LLVMbugs] [Bug 7137] New: Dead store elimination removes only one store per pass in simple case

Fri May 14 00:28:06 PDT 2010

http://llvm.org/bugs/show_bug.cgi?id=7137

           Summary: Dead store elimination removes only one store per pass
                    in simple case
           Product: libraries
           Version: trunk
          Platform: PC
        OS/Version: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Scalar Optimizations
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: matti.niemenmaa+llvmbugs at iki.fi
                CC: llvmbugs at cs.uiuc.edu

The dead store elimination pass removes only one store every time it is run on
the IR given by clang for the following C.

#include <stdio.h>
#include <stdlib.h>
int main() {
    long *x = malloc(512);
    x[0] = 100;
    x[1] = 200;
    x[2] = 300;
    x[3] = 400;
    putchar((int)x[0]);
    free(x);
    putchar(10);
    return 0;
}

The IR generated by 'clang -emit-llvm -c' looks like this to start with:

target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
define i32 @main() nounwind {
  %1 = alloca i32, align 4                        ; <i32*> [#uses=3]
  %x = alloca i64*, align 8                       ; <i64**> [#uses=7]
  store i32 0, i32* %1
  %2 = call i8* @malloc(i64 512)                  ; <i8*> [#uses=1]
  %3 = bitcast i8* %2 to i64*                     ; <i64*> [#uses=1]
  store i64* %3, i64** %x
  %4 = load i64** %x                              ; <i64*> [#uses=1]
  %5 = getelementptr inbounds i64* %4, i64 0      ; <i64*> [#uses=1]
  store i64 100, i64* %5
  %6 = load i64** %x                              ; <i64*> [#uses=1]
  %7 = getelementptr inbounds i64* %6, i64 1      ; <i64*> [#uses=1]
  store i64 200, i64* %7
  %8 = load i64** %x                              ; <i64*> [#uses=1]
  %9 = getelementptr inbounds i64* %8, i64 2      ; <i64*> [#uses=1]
  store i64 300, i64* %9
  %10 = load i64** %x                             ; <i64*> [#uses=1]
  %11 = getelementptr inbounds i64* %10, i64 3    ; <i64*> [#uses=1]
  store i64 400, i64* %11
  %12 = load i64** %x                             ; <i64*> [#uses=1]
  %13 = getelementptr inbounds i64* %12, i64 0    ; <i64*> [#uses=1]
  %14 = load i64* %13                             ; <i64> [#uses=1]
  %15 = trunc i64 %14 to i32                      ; <i32> [#uses=1]
  %16 = call i32 @putchar(i32 %15)                ; <i32> [#uses=0]
  %17 = load i64** %x                             ; <i64*> [#uses=1]
  %18 = bitcast i64* %17 to i8*                   ; <i8*> [#uses=1]
  call void @free(i8* %18) nounwind
  %19 = call i32 @putchar(i32 10)                 ; <i32> [#uses=0]
  store i32 0, i32* %1
  %20 = load i32* %1                              ; <i32> [#uses=1]
  ret i32 %20
}
declare noalias i8* @malloc(i64) nounwind
declare i32 @putchar(i32)
declare void @free(i8*) nounwind

One round of 'opt -O3' (using opt from SVN trunk, r103765) later main looks
like:

define i32 @main() nounwind {
  %1 = tail call i8* @malloc(i64 512)             ; <i8*> [#uses=4]
  %2 = bitcast i8* %1 to i64*                     ; <i64*> [#uses=1]
  store i64 100, i64* %2
  %3 = getelementptr inbounds i8* %1, i64 8       ; <i8*> [#uses=1]
  %4 = bitcast i8* %3 to i64*                     ; <i64*> [#uses=1]
  store i64 200, i64* %4
  %5 = getelementptr inbounds i8* %1, i64 16      ; <i8*> [#uses=1]
  %6 = bitcast i8* %5 to i64*                     ; <i64*> [#uses=1]
  store i64 300, i64* %6
  %7 = tail call i32 @putchar(i32 100) nounwind   ; <i32> [#uses=0]
  tail call void @free(i8* %1) nounwind
  %8 = tail call i32 @putchar(i32 10) nounwind    ; <i32> [#uses=0]
  ret i32 0
}

It may be enough of a bug that it's not already been optimized to just the two
putchar calls, given that the malloc/free and stores are all redundant.

If we run 'opt -dse' on the above, we're left with:

define i32 @main() nounwind {
  %1 = tail call i8* @malloc(i64 512)             ; <i8*> [#uses=3]
  %2 = bitcast i8* %1 to i64*                     ; <i64*> [#uses=1]
  store i64 100, i64* %2
  %3 = getelementptr inbounds i8* %1, i64 8       ; <i8*> [#uses=1]
  %4 = bitcast i8* %3 to i64*                     ; <i64*> [#uses=1]
  store i64 200, i64* %4
  %5 = tail call i32 @putchar(i32 100) nounwind   ; <i32> [#uses=0]
  tail call void @free(i8* %1) nounwind
  %6 = tail call i32 @putchar(i32 10) nounwind    ; <i32> [#uses=0]
  ret i32 0
}

Only one store was removed! This means that in order to get rid of all the
stores, we need to run DSE three more times after the original -O3 passes. And
indeed, 'opt -O3 -dse -dse -dse' is sufficient to get rid of all the stores; we
then only need another '-instcombine' to get rid of the malloc/free.

This seems terribly suboptimal: can't it be done in a single pass? Ideally, the
original 'opt -O3' would've reduced the whole thing.

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.