[llvm-bugs] [Bug 26642] New: Miscompilation caused by stack adjustment code clobbering used registers

Tue Feb 16 13:27:44 PST 2016

https://llvm.org/bugs/show_bug.cgi?id=26642

            Bug ID: 26642
           Summary: Miscompilation caused by stack adjustment code
                    clobbering used registers
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: AArch64
          Assignee: unassignedbugs at nondot.org
          Reporter: andrew.b.adams at gmail.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Some time on Friday one of the Halide tests started returning wrong values on
the arm64 buildbot:

http://buildbot.halide-lang.org:8010/builders/arm64-linux-64-trunk/builds/69

(Halide is a language built on LLVM. Our buildbots pull and test against trunk
llvm every four hours or so.)

The cause seems to be stack adjustment code that clobbers an in-use register.
Below is .ll from llvm 3.7, and the asm it produces, and .ll from trunk llvm,
and the asm it produces. Pay attention to x9. It's the address of one of the
outputs (the one that's coming out as wrong).

In the working code, x9 is first used as a temporary to adjust the stack
downwards, is then loaded from the second argument (ldp x11, x9, [x1]), and is
finally used as the address for the output store near the end of the function
(str w11, [x9]).

In the broken code, x9 is loaded from the argument first and is *then* used as a
temporary to adjust the stack downward (sub x9, sp, #832), which clobbers the
loaded address. The same store instruction (str w11, [x9]) therefore writes to
the stack instead of to the output buffer, so the output value is never stored.

Good ll and asm:

; ModuleID = 'halide_module_f10'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabihf"

%struct.halide_filter_argument_t = type { i8*, i32, i32, i32, i32,
%struct.halide_scalar_value_t*, %struct.halide_scalar_value_t*,
%struct.halide_scalar_value_t* }
%struct.halide_scalar_value_t = type { %union.anon }
%union.anon = type { double }
%struct.halide_filter_metadata_t = type { i32, i32,
%struct.halide_filter_argument_t*, i8*, i8* }
%struct.buffer_t = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8,
i8, [2 x i8] }

@str = private constant [6 x i8] c"f10.0\00", align 32
@str.2 = private constant [6 x i8] c"f10.1\00", align 32
@0 = private constant [2 x %struct.halide_filter_argument_t]
[%struct.halide_filter_argument_t { i8* getelementptr inbounds ([6 x i8], [6 x
i8]* @str, i32 0, i32 0), i32 2, i32 0, i32 0, i32 32,
%struct.halide_scalar_value_t* null, %struct.halide_scalar_value_t* null,
%struct.halide_scalar_value_t* null }, %struct.halide_filter_argument_t { i8*
getelementptr inbounds ([6 x i8], [6 x i8]* @str.2, i32 0, i32 0), i32 2, i32
0, i32 0, i32 32, %struct.halide_scalar_value_t* null,
%struct.halide_scalar_value_t* null, %struct.halide_scalar_value_t* null }]
@str.3 = private constant [35 x i8] c"arm-64-linux-no_asserts-no_runtime\00",
align 32
@str.4 = private constant [4 x i8] c"f10\00", align 32
@f10_metadata = constant %struct.halide_filter_metadata_t { i32 0, i32 2,
%struct.halide_filter_argument_t* getelementptr inbounds ([2 x
%struct.halide_filter_argument_t], [2 x %struct.halide_filter_argument_t]* @0,
i32 0, i32 0), i8* getelementptr inbounds ([35 x i8], [35 x i8]* @str.3, i32 0,
i32 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @str.4, i32 0, i32 0) }

; Function Attrs: nounwind
define i32 @__f10(%struct.buffer_t* noalias nocapture %f10.0.buffer,
%struct.buffer_t* noalias nocapture %f10.1.buffer) #0 {
entry:
  %f8.0.host59 = alloca [13 x <8 x i32>], align 32
  %f8.1.host60 = alloca [13 x <8 x i32>], align 32
  %buf_host = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 1
  %f10.0.host = load i8*, i8** %buf_host, align 8
  %buf_dev = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 0
  %f10.0.dev = load i64, i64* %buf_dev, align 8
  %0 = icmp eq i64 %f10.0.dev, 0
  %1 = icmp eq i8* %f10.0.host, null
  %f10.0.host_and_dev_are_null = and i1 %1, %0
  %buf_host10 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 1
  %f10.1.host = load i8*, i8** %buf_host10, align 8
  %buf_dev11 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 0
  %f10.1.dev = load i64, i64* %buf_dev11, align 8
  %2 = icmp eq i64 %f10.1.dev, 0
  %3 = icmp eq i8* %f10.1.host, null
  %f10.1.host_and_dev_are_null = and i1 %3, %2
  br i1 %f10.0.host_and_dev_are_null, label %true_bb, label %after_bb

true_bb:                                          ; preds = %entry
  %buf_elem_size27 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 5
  store i32 4, i32* %buf_elem_size27, align 4
  %buf_extent29 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 2, i64 0
  %4 = bitcast i32* %buf_extent29 to i8*
  call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 48, i32 4, i1 false)
  br label %after_bb

after_bb:                                         ; preds = %entry, %true_bb
  br i1 %f10.1.host_and_dev_are_null, label %after_bb42.thread, label
%after_bb42

after_bb42.thread:                                ; preds = %after_bb
  %buf_elem_size43 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 5
  store i32 4, i32* %buf_elem_size43, align 4
  %buf_extent45 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 2, i64 0
  %5 = bitcast i32* %buf_extent45 to i8*
  call void @llvm.memset.p0i8.i64(i8* %5, i8 0, i64 48, i32 4, i1 false)
  br label %destructor_block

after_bb42:                                       ; preds = %after_bb
  br i1 %f10.0.host_and_dev_are_null, label %destructor_block, label %"for
f8.s0.v0"

"for f8.s0.v0":                                   ; preds = %after_bb42, %"for
f8.s0.v0"
  %indvars.iv61 = phi i64 [ %9, %"for f8.s0.v0" ], [ -1, %after_bb42 ]
  %f8.s0.v0 = phi i32 [ %14, %"for f8.s0.v0" ], [ -1, %after_bb42 ]
  %6 = sub nsw i32 100, %f8.s0.v0
  %7 = sext i32 %6 to i64
  %8 = mul nsw i64 %7, %indvars.iv61
  %9 = add nsw i64 %indvars.iv61, 1
  %10 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.0.host59, i64 0, i64 0, i64 %9
  %11 = trunc i64 %8 to i32
  store i32 %11, i32* %10, align 4, !tbaa !4
  %12 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.1.host60, i64 0, i64 0, i64 %9
  %13 = trunc i64 %indvars.iv61 to i32
  store i32 %13, i32* %12, align 4, !tbaa !6
  %14 = add nsw i32 %f8.s0.v0, 1
  %exitcond63 = icmp eq i64 %9, 100
  br i1 %exitcond63, label %"for f8.s1.r30.x$r.preheader", label %"for
f8.s0.v0"

"for f8.s1.r30.x$r.preheader":                    ; preds = %"for f8.s0.v0"
  %15 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.0.host59, i64 0, i64 0, i64 0
  %16 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.1.host60, i64 0, i64 0, i64 0
  %t24.pre = load i32, i32* %15, align 32, !tbaa !7
  %.pre = load i32, i32* %16, align 32, !tbaa !18
  br label %"for f8.s1.r30.x$r"

"for f8.s1.r30.x$r":                              ; preds = %"for
f8.s1.r30.x$r.preheader", %"for f8.s1.r30.x$r"
  %17 = phi i32 [ %.pre, %"for f8.s1.r30.x$r.preheader" ], [ %f8.1.value, %"for
f8.s1.r30.x$r" ]
  %t24 = phi i32 [ %t24.pre, %"for f8.s1.r30.x$r.preheader" ], [ %f8.0.value,
%"for f8.s1.r30.x$r" ]
  %indvars.iv = phi i64 [ 0, %"for f8.s1.r30.x$r.preheader" ], [ %18, %"for
f8.s1.r30.x$r" ]
  %18 = add nuw nsw i64 %indvars.iv, 1
  %19 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.0.host59, i64 0, i64 0, i64 %18
  %t25 = load i32, i32* %19, align 4, !tbaa !4
  %20 = icmp slt i32 %t24, %t25
  %f8.0.value = select i1 %20, i32 %t25, i32 %t24
  %21 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.1.host60, i64 0, i64 0, i64 %18
  %22 = load i32, i32* %21, align 4, !tbaa !6
  %f8.1.value = select i1 %20, i32 %22, i32 %17
  store i32 %f8.0.value, i32* %15, align 32, !tbaa !7
  store i32 %f8.1.value, i32* %16, align 32, !tbaa !18
  %exitcond = icmp eq i64 %18, 100
  br i1 %exitcond, label %"consume f8", label %"for f8.s1.r30.x$r"

"consume f8":                                     ; preds = %"for
f8.s1.r30.x$r"
  %23 = bitcast i8* %f10.0.host to i32*
  store i32 %f8.0.value, i32* %23, align 4, !tbaa !29
  %24 = bitcast i8* %f10.1.host to i32*
  store i32 %f8.1.value, i32* %24, align 4, !tbaa !41
  br label %destructor_block

destructor_block:                                 ; preds = %after_bb42.thread,
%"consume f8", %after_bb42
  ret i32 0
}

; Function Attrs: nounwind
define i32 @f10(%struct.buffer_t* noalias nocapture %f10.0.buffer,
%struct.buffer_t* noalias nocapture %f10.1.buffer) #0 {
entry:
  %__f10_result = tail call i32 @__f10(%struct.buffer_t* %f10.0.buffer,
%struct.buffer_t* %f10.1.buffer) #0
  ret i32 0
}

; Function Attrs: nounwind
define i32 @f10_argv(i8** nocapture readonly) #0 {
entry:
  %1 = bitcast i8** %0 to %struct.buffer_t**
  %2 = load %struct.buffer_t*, %struct.buffer_t** %1, align 8
  %3 = getelementptr i8*, i8** %0, i64 1
  %4 = bitcast i8** %3 to %struct.buffer_t**
  %5 = load %struct.buffer_t*, %struct.buffer_t** %4, align 8
  %6 = tail call i32 @f10(%struct.buffer_t* %2, %struct.buffer_t* %5)
  ret i32 0
}

; Function Attrs: nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #0

attributes #0 = { nounwind }

!llvm.ident = !{!0, !0}
!llvm.module.flags = !{!1, !2, !3}

!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final 251413)"}
!1 = !{i32 2, !"halide_use_soft_float_abi", i32 0}
!2 = !{i32 2, !"halide_mcpu", !"generic"}
!3 = !{i32 2, !"halide_mattrs", !""}
!4 = !{!"f8.0", !5}
!5 = !{!"Halide buffer"}
!6 = !{!"f8.1", !5}
!7 = !{!"f8.0.width1.base0", !8}
!8 = !{!"f8.0.width2.base0", !9}
!9 = !{!"f8.0.width4.base0", !10}
!10 = !{!"f8.0.width8.base0", !11}
!11 = !{!"f8.0.width16.base0", !12}
!12 = !{!"f8.0.width32.base0", !13}
!13 = !{!"f8.0.width64.base0", !14}
!14 = !{!"f8.0.width128.base0", !15}
!15 = !{!"f8.0.width256.base0", !16}
!16 = !{!"f8.0.width512.base0", !17}
!17 = !{!"f8.0.width1024.base0", !4}
!18 = !{!"f8.1.width1.base0", !19}
!19 = !{!"f8.1.width2.base0", !20}
!20 = !{!"f8.1.width4.base0", !21}
!21 = !{!"f8.1.width8.base0", !22}
!22 = !{!"f8.1.width16.base0", !23}
!23 = !{!"f8.1.width32.base0", !24}
!24 = !{!"f8.1.width64.base0", !25}
!25 = !{!"f8.1.width128.base0", !26}
!26 = !{!"f8.1.width256.base0", !27}
!27 = !{!"f8.1.width512.base0", !28}
!28 = !{!"f8.1.width1024.base0", !6}
!29 = !{!"f10.0.width1.base0", !30}
!30 = !{!"f10.0.width2.base0", !31}
!31 = !{!"f10.0.width4.base0", !32}
!32 = !{!"f10.0.width8.base0", !33}
!33 = !{!"f10.0.width16.base0", !34}
!34 = !{!"f10.0.width32.base0", !35}
!35 = !{!"f10.0.width64.base0", !36}
!36 = !{!"f10.0.width128.base0", !37}
!37 = !{!"f10.0.width256.base0", !38}
!38 = !{!"f10.0.width512.base0", !39}
!39 = !{!"f10.0.width1024.base0", !40}
!40 = !{!"f10.0", !5}
!41 = !{!"f10.1.width1.base0", !42}
!42 = !{!"f10.1.width2.base0", !43}
!43 = !{!"f10.1.width4.base0", !44}
!44 = !{!"f10.1.width8.base0", !45}
!45 = !{!"f10.1.width16.base0", !46}
!46 = !{!"f10.1.width32.base0", !47}
!47 = !{!"f10.1.width64.base0", !48}
!48 = !{!"f10.1.width128.base0", !49}
!49 = !{!"f10.1.width256.base0", !50}
!50 = !{!"f10.1.width512.base0", !51}
!51 = !{!"f10.1.width1024.base0", !52}
!52 = !{!"f10.1", !5}
    .text
    .file    "halide_module_f10"
    .section    .text.__f10,"ax",@progbits
    .globl    __f10
    .align    2
    .type    __f10,@function
__f10:                                  // @__f10
// BB#0:                                // %entry
    stp    x28, x27, [sp, #-32]!
    stp    x29, x30, [sp, #16]
    add    x29, sp, #16            // =16
    sub    x9, sp, #832            // =832
    and    sp, x9, #0xffffffffffffffe0
    ldp     x10, x8, [x0]
    ldp     x11, x9, [x1]
    orr     x12, x8, x10
    orr     x10, x9, x11
    cmp     x12, #0                // =0
    cset     w11, eq
    cmp     x10, #0                // =0
    cset     w10, eq
    cbnz    x12, .LBB0_2
// BB#1:                                // %true_bb
    orr    w12, wzr, #0x4
    stp    xzr, xzr, [x0, #48]
    stp    xzr, xzr, [x0, #32]
    stp    xzr, xzr, [x0, #16]
    str    w12, [x0, #64]
.LBB0_2:                                // %after_bb
    cbz    w10, .LBB0_4
// BB#3:                                // %after_bb42.thread
    orr    w8, wzr, #0x4
    stp    xzr, xzr, [x1, #48]
    stp    xzr, xzr, [x1, #32]
    stp    xzr, xzr, [x1, #16]
    str    w8, [x1, #64]
    b    .LBB0_10
.LBB0_4:                                // %after_bb42
    movz    w10, #0x65
    tbnz    w11, #0, .LBB0_10
// BB#5:
    movn    w11, #0
    movn    x12, #0
    mov     x13, sp
    add    x14, sp, #416           // =416
.LBB0_6:                                // %for f8.s0.v0
                                        // =>This Inner Loop Header: Depth=1
    mul     w15, w10, w12
    add    x12, x12, #1            // =1
    str    w11, [x13], #4
    add    w11, w11, #1            // =1
    str    w15, [x14], #4
    sub    x10, x10, #1            // =1
    cbnz    x10, .LBB0_6
// BB#7:                                // %for f8.s1.r30.x$r.preheader
    ldr    w10, [sp, #416]
    ldr     w11, [sp]
    mov     x12, sp
    orr    x12, x12, #0x4
    add    x13, sp, #416           // =416
    orr    x13, x13, #0x4
    movz    w14, #0x64
.LBB0_8:                                // %for f8.s1.r30.x$r
                                        // =>This Inner Loop Header: Depth=1
    ldr    w15, [x13], #4
    ldr    w16, [x12], #4
    cmp     w10, w15
    csel    w10, w15, w10, lt
    csel    w11, w16, w11, lt
    str    w10, [sp, #416]
    str     w11, [sp]
    sub    x14, x14, #1            // =1
    cbnz    x14, .LBB0_8
// BB#9:                                // %consume f8
    str     w10, [x8]
    str     w11, [x9]
.LBB0_10:                               // %destructor_block
    mov     w0, wzr
    sub    sp, x29, #16            // =16
    ldp    x29, x30, [sp, #16]
    ldp    x28, x27, [sp], #32
    ret
.Lfunc_end0:
    .size    __f10, .Lfunc_end0-__f10

    .section    .text.f10,"ax",@progbits
    .globl    f10
    .align    2
    .type    f10,@function
f10:                                    // @f10
// BB#0:                                // %entry
    stp    x29, x30, [sp, #-16]!
    mov     x29, sp
    bl    __f10
    mov     w0, wzr
    ldp    x29, x30, [sp], #16
    ret
.Lfunc_end1:
    .size    f10, .Lfunc_end1-f10

    .section    .text.f10_argv,"ax",@progbits
    .globl    f10_argv
    .align    2
    .type    f10_argv,@function
f10_argv:                               // @f10_argv
// BB#0:                                // %entry
    stp    x29, x30, [sp, #-16]!
    mov     x29, sp
    ldp     x8, x1, [x0]
    mov     x0, x8
    bl    f10
    mov     w0, wzr
    ldp    x29, x30, [sp], #16
    ret
.Lfunc_end2:
    .size    f10_argv, .Lfunc_end2-f10_argv

    .type    .Lstr,@object              // @str
    .section    .rodata,"a",@progbits
    .align    5
.Lstr:
    .asciz    "f10.0"
    .size    .Lstr, 6

    .type    .Lstr.2,@object            // @str.2
    .align    5
.Lstr.2:
    .asciz    "f10.1"
    .size    .Lstr.2, 6

    .type    .L__unnamed_1,@object      // @0
    .section    .data.rel.ro.local,"aw",@progbits
    .align    4
.L__unnamed_1:
    .xword    .Lstr
    .word    2                       // 0x2
    .word    0                       // 0x0
    .word    0                       // 0x0
    .word    32                      // 0x20
    .xword    0
    .xword    0
    .xword    0
    .xword    .Lstr.2
    .word    2                       // 0x2
    .word    0                       // 0x0
    .word    0                       // 0x0
    .word    32                      // 0x20
    .xword    0
    .xword    0
    .xword    0
    .size    .L__unnamed_1, 96

    .type    .Lstr.3,@object            // @str.3
    .section    .rodata,"a",@progbits
    .align    5
.Lstr.3:
    .asciz    "arm-64-linux-no_asserts-no_runtime"
    .size    .Lstr.3, 35

    .type    .Lstr.4,@object            // @str.4
    .align    5
.Lstr.4:
    .asciz    "f10"
    .size    .Lstr.4, 4

    .type    f10_metadata,@object       // @f10_metadata
    .section    .data.rel.ro.local,"aw",@progbits
    .globl    f10_metadata
    .align    4
f10_metadata:
    .word    0                       // 0x0
    .word    2                       // 0x2
    .xword    .L__unnamed_1
    .xword    .Lstr.3
    .xword    .Lstr.4
    .size    f10_metadata, 32

    .ident    "clang version 3.7.0 (tags/RELEASE_370/final 251413)"
    .ident    "clang version 3.7.0 (tags/RELEASE_370/final 251413)"
    .section    ".note.GNU-stack","",@progbits

Bad ll and asm:

; ModuleID = 'halide_module_f10'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnueabihf"

%struct.halide_filter_argument_t = type { i8*, i32, i32, i32, i32,
%struct.halide_scalar_value_t*, %struct.halide_scalar_value_t*,
%struct.halide_scalar_value_t* }
%struct.halide_scalar_value_t = type { %union.anon }
%union.anon = type { double }
%struct.halide_filter_metadata_t = type { i32, i32,
%struct.halide_filter_argument_t*, i8*, i8* }
%struct.buffer_t = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8,
i8, [2 x i8] }

@str = private constant [6 x i8] c"f10.0\00", align 32
@str.2 = private constant [6 x i8] c"f10.1\00", align 32
@0 = private constant [2 x %struct.halide_filter_argument_t]
[%struct.halide_filter_argument_t { i8* getelementptr inbounds ([6 x i8], [6 x
i8]* @str, i32 0, i32 0), i32 2, i32 0, i32 0, i32 32,
%struct.halide_scalar_value_t* null, %struct.halide_scalar_value_t* null,
%struct.halide_scalar_value_t* null }, %struct.halide_filter_argument_t { i8*
getelementptr inbounds ([6 x i8], [6 x i8]* @str.2, i32 0, i32 0), i32 2, i32
0, i32 0, i32 32, %struct.halide_scalar_value_t* null,
%struct.halide_scalar_value_t* null, %struct.halide_scalar_value_t* null }]
@str.3 = private constant [35 x i8] c"arm-64-linux-no_asserts-no_runtime\00",
align 32
@str.4 = private constant [4 x i8] c"f10\00", align 32
@f10_metadata = constant %struct.halide_filter_metadata_t { i32 0, i32 2,
%struct.halide_filter_argument_t* getelementptr inbounds ([2 x
%struct.halide_filter_argument_t], [2 x %struct.halide_filter_argument_t]* @0,
i32 0, i32 0), i8* getelementptr inbounds ([35 x i8], [35 x i8]* @str.3, i32 0,
i32 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @str.4, i32 0, i32 0) }

; Function Attrs: norecurse nounwind
define i32 @__f10(%struct.buffer_t* noalias nocapture %f10.0.buffer,
%struct.buffer_t* noalias nocapture %f10.1.buffer) #0 {
entry:
  %f8.0.host59 = alloca [13 x <8 x i32>], align 32
  %f8.1.host60 = alloca [13 x <8 x i32>], align 32
  %buf_host = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 1
  %f10.0.host = load i8*, i8** %buf_host, align 8
  %buf_dev = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 0
  %f10.0.dev = load i64, i64* %buf_dev, align 8
  %0 = icmp eq i64 %f10.0.dev, 0
  %1 = icmp eq i8* %f10.0.host, null
  %f10.0.host_and_dev_are_null = and i1 %1, %0
  %buf_host10 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 1
  %f10.1.host = load i8*, i8** %buf_host10, align 8
  %buf_dev11 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 0
  %f10.1.dev = load i64, i64* %buf_dev11, align 8
  %2 = icmp eq i64 %f10.1.dev, 0
  %3 = icmp eq i8* %f10.1.host, null
  %f10.1.host_and_dev_are_null = and i1 %3, %2
  br i1 %f10.0.host_and_dev_are_null, label %true_bb, label %after_bb

true_bb:                                          ; preds = %entry
  %buf_elem_size27 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 5
  store i32 4, i32* %buf_elem_size27, align 4
  %buf_extent29 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.0.buffer, i64 0, i32 2, i64 0
  %4 = bitcast i32* %buf_extent29 to i8*
  call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 48, i32 4, i1 false)
  br label %after_bb

after_bb:                                         ; preds = %entry, %true_bb
  br i1 %f10.1.host_and_dev_are_null, label %after_bb42.thread, label
%after_bb42

after_bb42.thread:                                ; preds = %after_bb
  %buf_elem_size43 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 5
  store i32 4, i32* %buf_elem_size43, align 4
  %buf_extent45 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t*
%f10.1.buffer, i64 0, i32 2, i64 0
  %5 = bitcast i32* %buf_extent45 to i8*
  call void @llvm.memset.p0i8.i64(i8* %5, i8 0, i64 48, i32 4, i1 false)
  br label %destructor_block

after_bb42:                                       ; preds = %after_bb
  br i1 %f10.0.host_and_dev_are_null, label %destructor_block, label %"for
f8.s0.v0"

"for f8.s0.v0":                                   ; preds = %after_bb42, %"for
f8.s0.v0"
  %indvars.iv61 = phi i64 [ %9, %"for f8.s0.v0" ], [ -1, %after_bb42 ]
  %f8.s0.v0 = phi i32 [ %14, %"for f8.s0.v0" ], [ -1, %after_bb42 ]
  %6 = sub nsw i32 100, %f8.s0.v0
  %7 = sext i32 %6 to i64
  %8 = mul nsw i64 %7, %indvars.iv61
  %9 = add nsw i64 %indvars.iv61, 1
  %10 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.0.host59, i64 0, i64 0, i64 %9
  %11 = trunc i64 %8 to i32
  store i32 %11, i32* %10, align 4, !tbaa !4
  %12 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.1.host60, i64 0, i64 0, i64 %9
  %13 = trunc i64 %indvars.iv61 to i32
  store i32 %13, i32* %12, align 4, !tbaa !6
  %14 = add nsw i32 %f8.s0.v0, 1
  %15 = icmp eq i64 %9, 100
  br i1 %15, label %"for f8.s1.r30.x$r.preheader", label %"for f8.s0.v0"

"for f8.s1.r30.x$r.preheader":                    ; preds = %"for f8.s0.v0"
  %16 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.0.host59, i64 0, i64 0, i64 0
  %17 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.1.host60, i64 0, i64 0, i64 0
  %t24.pre = load i32, i32* %16, align 32, !tbaa !7
  %.pre = load i32, i32* %17, align 32, !tbaa !18
  br label %"for f8.s1.r30.x$r"

"for f8.s1.r30.x$r":                              ; preds = %"for
f8.s1.r30.x$r.preheader", %"for f8.s1.r30.x$r"
  %18 = phi i32 [ %.pre, %"for f8.s1.r30.x$r.preheader" ], [ %f8.1.value, %"for
f8.s1.r30.x$r" ]
  %t24 = phi i32 [ %t24.pre, %"for f8.s1.r30.x$r.preheader" ], [ %f8.0.value,
%"for f8.s1.r30.x$r" ]
  %indvars.iv = phi i64 [ 0, %"for f8.s1.r30.x$r.preheader" ], [ %19, %"for
f8.s1.r30.x$r" ]
  %19 = add nuw nsw i64 %indvars.iv, 1
  %20 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.0.host59, i64 0, i64 0, i64 %19
  %t25 = load i32, i32* %20, align 4, !tbaa !4
  %21 = icmp slt i32 %t24, %t25
  %f8.0.value = select i1 %21, i32 %t25, i32 %t24
  %22 = getelementptr inbounds [13 x <8 x i32>], [13 x <8 x i32>]*
%f8.1.host60, i64 0, i64 0, i64 %19
  %23 = load i32, i32* %22, align 4, !tbaa !6
  %f8.1.value = select i1 %21, i32 %23, i32 %18
  store i32 %f8.0.value, i32* %16, align 32, !tbaa !7
  store i32 %f8.1.value, i32* %17, align 32, !tbaa !18
  %24 = icmp eq i64 %19, 100
  br i1 %24, label %"consume f8", label %"for f8.s1.r30.x$r"

"consume f8":                                     ; preds = %"for
f8.s1.r30.x$r"
  %25 = bitcast i8* %f10.0.host to i32*
  store i32 %f8.0.value, i32* %25, align 4, !tbaa !29
  %26 = bitcast i8* %f10.1.host to i32*
  store i32 %f8.1.value, i32* %26, align 4, !tbaa !41
  br label %destructor_block

destructor_block:                                 ; preds = %after_bb42.thread,
%"consume f8", %after_bb42
  ret i32 0
}

; Function Attrs: norecurse nounwind
define i32 @f10(%struct.buffer_t* noalias nocapture %f10.0.buffer,
%struct.buffer_t* noalias nocapture %f10.1.buffer) #0 {
entry:
  %__f10_result = tail call i32 @__f10(%struct.buffer_t* %f10.0.buffer,
%struct.buffer_t* %f10.1.buffer) #2
  ret i32 0
}

; Function Attrs: norecurse nounwind
define i32 @f10_argv(i8** nocapture readonly) #0 {
entry:
  %1 = bitcast i8** %0 to %struct.buffer_t**
  %2 = load %struct.buffer_t*, %struct.buffer_t** %1, align 8
  %3 = getelementptr i8*, i8** %0, i64 1
  %4 = bitcast i8** %3 to %struct.buffer_t**
  %5 = load %struct.buffer_t*, %struct.buffer_t** %4, align 8
  %6 = tail call i32 @f10(%struct.buffer_t* %2, %struct.buffer_t* %5)
  ret i32 0
}

; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1

attributes #0 = { norecurse nounwind }
attributes #1 = { argmemonly nounwind }
attributes #2 = { nounwind }

!llvm.ident = !{!0, !0}
!llvm.module.flags = !{!1, !2, !3}

!0 = !{!"clang version 3.9.0 (trunk 260979)"}
!1 = !{i32 2, !"halide_use_soft_float_abi", i32 0}
!2 = !{i32 2, !"halide_mcpu", !"generic"}
!3 = !{i32 2, !"halide_mattrs", !""}
!4 = !{!"f8.0", !5}
!5 = !{!"Halide buffer"}
!6 = !{!"f8.1", !5}
!7 = !{!"f8.0.width1.base0", !8}
!8 = !{!"f8.0.width2.base0", !9}
!9 = !{!"f8.0.width4.base0", !10}
!10 = !{!"f8.0.width8.base0", !11}
!11 = !{!"f8.0.width16.base0", !12}
!12 = !{!"f8.0.width32.base0", !13}
!13 = !{!"f8.0.width64.base0", !14}
!14 = !{!"f8.0.width128.base0", !15}
!15 = !{!"f8.0.width256.base0", !16}
!16 = !{!"f8.0.width512.base0", !17}
!17 = !{!"f8.0.width1024.base0", !4}
!18 = !{!"f8.1.width1.base0", !19}
!19 = !{!"f8.1.width2.base0", !20}
!20 = !{!"f8.1.width4.base0", !21}
!21 = !{!"f8.1.width8.base0", !22}
!22 = !{!"f8.1.width16.base0", !23}
!23 = !{!"f8.1.width32.base0", !24}
!24 = !{!"f8.1.width64.base0", !25}
!25 = !{!"f8.1.width128.base0", !26}
!26 = !{!"f8.1.width256.base0", !27}
!27 = !{!"f8.1.width512.base0", !28}
!28 = !{!"f8.1.width1024.base0", !6}
!29 = !{!"f10.0.width1.base0", !30}
!30 = !{!"f10.0.width2.base0", !31}
!31 = !{!"f10.0.width4.base0", !32}
!32 = !{!"f10.0.width8.base0", !33}
!33 = !{!"f10.0.width16.base0", !34}
!34 = !{!"f10.0.width32.base0", !35}
!35 = !{!"f10.0.width64.base0", !36}
!36 = !{!"f10.0.width128.base0", !37}
!37 = !{!"f10.0.width256.base0", !38}
!38 = !{!"f10.0.width512.base0", !39}
!39 = !{!"f10.0.width1024.base0", !40}
!40 = !{!"f10.0", !5}
!41 = !{!"f10.1.width1.base0", !42}
!42 = !{!"f10.1.width2.base0", !43}
!43 = !{!"f10.1.width4.base0", !44}
!44 = !{!"f10.1.width8.base0", !45}
!45 = !{!"f10.1.width16.base0", !46}
!46 = !{!"f10.1.width32.base0", !47}
!47 = !{!"f10.1.width64.base0", !48}
!48 = !{!"f10.1.width128.base0", !49}
!49 = !{!"f10.1.width256.base0", !50}
!50 = !{!"f10.1.width512.base0", !51}
!51 = !{!"f10.1.width1024.base0", !52}
!52 = !{!"f10.1", !5}

    .text
    .file    "halide_module_f10"
    .section    .text.__f10,"ax",@progbits
    .globl    __f10
    .p2align    2
    .type    __f10,@function
__f10:                                  // @__f10
// BB#0:                                // %entry
    ldp        x10, x8, [x0]
    ldp        x11, x9, [x1]
    orr        x12, x8, x10
    orr        x10, x9, x11
    cmp        x12, #0         // =0
    cset     w11, eq
    cmp        x10, #0         // =0
    cset     w10, eq
    cbnz    x12, .LBB0_2
// BB#1:                                // %true_bb
    orr    w12, wzr, #0x4
    stp    xzr, xzr, [x0, #48]
    stp    xzr, xzr, [x0, #32]
    stp    xzr, xzr, [x0, #16]
    str    w12, [x0, #64]
.LBB0_2:                                // %after_bb
    cbz    w10, .LBB0_4
// BB#3:                                // %after_bb42.thread
    orr    w8, wzr, #0x4
    mov     w0, wzr
    stp    xzr, xzr, [x1, #48]
    stp    xzr, xzr, [x1, #32]
    stp    xzr, xzr, [x1, #16]
    str    w8, [x1, #64]
    ret
.LBB0_4:                                // %after_bb42
    movz    w10, #0x65
    tbnz    w11, #0, .LBB0_10
// BB#5:
    str    x28, [sp, #-32]!
    sub    x9, sp, #832            // =832
    stp    x29, x30, [sp, #16]
    add    x29, sp, #16            // =16
    and    sp, x9, #0xffffffffffffffe0
    movn    x13, #0
    mov     x11, sp
    add    x12, sp, #416           // =416
.LBB0_6:                                // %"for f8.s0.v0"
                                        // =>This Inner Loop Header: Depth=1
    mul        w14, w10, w13
    str    w13, [x11], #4
    add    x13, x13, #1            // =1
    sub    x10, x10, #1            // =1
    str    w14, [x12], #4
    cmp        x13, #100       // =100
    b.ne    .LBB0_6
// BB#7:                                // %"for f8.s1.r30.x$r.preheader"
    ldr    w10, [sp, #416]
    ldr        w11, [sp]
    orr    w12, wzr, #0x4
    add    x13, sp, #416           // =416
    mov     x14, sp
.LBB0_8:                                // %"for f8.s1.r30.x$r"
                                        // =>This Inner Loop Header: Depth=1
    ldr        w15, [x13, x12]
    ldr        w16, [x14, x12]
    add    x12, x12, #4            // =4
    cmp        w10, w15
    csel    w10, w15, w10, lt
    csel    w11, w16, w11, lt
    str    w10, [sp, #416]
    str        w11, [sp]
    cmp        x12, #404       // =404
    b.ne    .LBB0_8
// BB#9:                                // %"consume f8"
    str        w10, [x8]
    str        w11, [x9]
    sub    sp, x29, #16            // =16
    ldp    x29, x30, [sp, #16]
    ldr    x28, [sp], #32
.LBB0_10:                               // %destructor_block
    mov     w0, wzr
    ret
.Lfunc_end0:
    .size    __f10, .Lfunc_end0-__f10

    .section    .text.f10,"ax",@progbits
    .globl    f10
    .p2align    2
    .type    f10,@function
f10:                                    // @f10
// BB#0:                                // %entry
    stp    x29, x30, [sp, #-16]!
    mov     x29, sp
    bl    __f10
    mov     w0, wzr
    ldp    x29, x30, [sp], #16
    ret
.Lfunc_end1:
    .size    f10, .Lfunc_end1-f10

    .section    .text.f10_argv,"ax",@progbits
    .globl    f10_argv
    .p2align    2
    .type    f10_argv,@function
f10_argv:                               // @f10_argv
// BB#0:                                // %entry
    stp    x29, x30, [sp, #-16]!
    ldp        x8, x1, [x0]
    mov     x29, sp
    mov     x0, x8
    bl    f10
    mov     w0, wzr
    ldp    x29, x30, [sp], #16
    ret
.Lfunc_end2:
    .size    f10_argv, .Lfunc_end2-f10_argv

    .type    .Lstr,@object              // @str
    .section    .rodata,"a",@progbits
    .p2align    5
.Lstr:
    .asciz    "f10.0"
    .size    .Lstr, 6

    .type    .Lstr.2,@object            // @str.2
    .p2align    5
.Lstr.2:
    .asciz    "f10.1"
    .size    .Lstr.2, 6

    .type    .L__unnamed_1,@object      // @0
    .section    .data.rel.ro,"aw",@progbits
    .p2align    4
.L__unnamed_1:
    .xword    .Lstr
    .word    2                       // 0x2
    .word    0                       // 0x0
    .word    0                       // 0x0
    .word    32                      // 0x20
    .xword    0
    .xword    0
    .xword    0
    .xword    .Lstr.2
    .word    2                       // 0x2
    .word    0                       // 0x0
    .word    0                       // 0x0
    .word    32                      // 0x20
    .xword    0
    .xword    0
    .xword    0
    .size    .L__unnamed_1, 96

    .type    .Lstr.3,@object            // @str.3
    .section    .rodata,"a",@progbits
    .p2align    5
.Lstr.3:
    .asciz    "arm-64-linux-no_asserts-no_runtime"
    .size    .Lstr.3, 35

    .type    .Lstr.4,@object            // @str.4
    .p2align    5
.Lstr.4:
    .asciz    "f10"
    .size    .Lstr.4, 4

    .type    f10_metadata,@object       // @f10_metadata
    .section    .data.rel.ro,"aw",@progbits
    .globl    f10_metadata
    .p2align    4
f10_metadata:
    .word    0                       // 0x0
    .word    2                       // 0x2
    .xword    .L__unnamed_1
    .xword    .Lstr.3
    .xword    .Lstr.4
    .size    f10_metadata, 32

    .ident    "clang version 3.9.0 (trunk 260979)"
    .ident    "clang version 3.9.0 (trunk 260979)"
    .section    ".note.GNU-stack","",@progbits

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160216/7a913361/attachment-0001.html>