[LLVMbugs] [Bug 21273] New: inline asm incorrectly handles output operands sometimes

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Tue Oct 14 06:39:07 PDT 2014


http://llvm.org/bugs/show_bug.cgi?id=21273

            Bug ID: 21273
           Summary: inline asm incorrectly handles output operands
                    sometimes
           Product: new-bugs
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: dimitry at andric.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Depending on the optimization level, clang trunk r219624 seems to sometimes
handle output operands incorrectly.  This was reported to me by FreeBSD kernel
developers, who attempted to compile the following:

int ivy_rng_store(long *buf)
{
  long tmp;
  int retry;

  retry = 10;
  __asm __volatile(
    "1:\n\t"
    "rdrand    %2\n\t"    /* read randomness into tmp */
    "jb        2f\n\t" /* CF is set on success, exit retry loop */
    "dec    %0\n\t" /* otherwise, retry-- */
    "jne    1b\n\t" /* and loop if retries are not exhausted */
    "jmp    3f\n"    /* failure, retry is 0, used as return value */
    "2:\n\t"
    "mov    %2,%1\n\t" /* *buf = tmp */
    "3:"
    : "+q" (retry), "=m" (*buf), "=q" (tmp) : : "cc");

  return (retry);
}

I.e., the intent is that 'tmp' is used only as an output of the inline asm; its
value is not used outside the inline asm.  It is stored to *buf instead.

However, clang -O0 seems to have trouble keeping the two apart, as the
resulting assembly is:

ivy_rng_store:                          # @ivy_rng_store
    .cfi_startproc
# BB#0:                                 # %entry
    pushq    %rbp
.Ltmp0:
    .cfi_def_cfa_offset 16
.Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
.Ltmp2:
    .cfi_def_cfa_register %rbp
    movq    %rdi, -8(%rbp)
    movl    $10, -20(%rbp)
    movl    -20(%rbp), %eax
    movq    -8(%rbp), %rdi
    #APP
.Ltmp3:
    rdrandq    %rdi
    jb    .Ltmp4
    decl    %eax
    jne    .Ltmp3
    jmp    .Ltmp5
.Ltmp4:
    movq    %rdi, (%rdi)
.Ltmp5:
    #NO_APP
    movl    %eax, -20(%rbp)
    movq    %rdi, -16(%rbp)
    movl    -20(%rbp), %eax
    popq    %rbp
    retq

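Clearly, the movq %rdi, (%rdi) is incorrect: rdrandq has already overwritten
%rdi, which held buf, so the store uses the random value as its own address.
This seems to be magically solved by enabling optimization, e.g. at -O1 or
higher: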

ivy_rng_store:                          # @ivy_rng_store
    .cfi_startproc
# BB#0:                                 # %entry
    pushq    %rbp
.Ltmp0:
    .cfi_def_cfa_offset 16
.Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
.Ltmp2:
    .cfi_def_cfa_register %rbp
    movl    $10, %eax
    #APP
.Ltmp3:
    rdrandq    %rcx
    jb    .Ltmp4
    decl    %eax
    jne    .Ltmp3
    jmp    .Ltmp5
.Ltmp4:
    movq    %rcx, (%rdi)
.Ltmp5:
    #NO_APP
    popq    %rbp
    retq

Something similar happens when targeting i386 at -O0:

ivy_rng_store:                          # @ivy_rng_store
# BB#0:                                 # %entry
    pushl    %ebp
    movl    %esp, %ebp
    subl    $12, %esp
    movl    8(%ebp), %eax
    movl    %eax, -4(%ebp)
    movl    $10, -12(%ebp)
    movl    -12(%ebp), %eax
    movl    -4(%ebp), %ecx
    #APP
.Ltmp0:
    rdrandl    %ecx
    jb    .Ltmp1
    decl    %eax
    jne    .Ltmp0
    jmp    .Ltmp2
.Ltmp1:
    movl    %ecx, (%ecx)
.Ltmp2:
    #NO_APP
    movl    %eax, -12(%ebp)
    movl    %ecx, -8(%ebp)
    movl    -12(%ebp), %eax
    addl    $12, %esp
    popl    %ebp
    retl

However, on i386 optimization does not fix it, e.g. at -O1 or higher:

ivy_rng_store:                          # @ivy_rng_store
# BB#0:                                 # %entry
    pushl    %ebp
    movl    %esp, %ebp
    movl    8(%ebp), %ecx
    movl    $10, %eax
    #APP
.Ltmp0:
    rdrandl    %ecx
    jb    .Ltmp1
    decl    %eax
    jne    .Ltmp0
    jmp    .Ltmp2
.Ltmp1:
    movl    %ecx, (%ecx)
.Ltmp2:
    #NO_APP
    popl    %ebp
    retl

Changing the output constraint on 'tmp' to "+q" seems to help on amd64, but on
i386 it still produces incorrect output at -O1 or higher optimization.

I tested the above code with different versions of gcc (4.7 through 5.0), but
the resulting assembly was always as expected, at any optimization level.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20141014/fad7d4eb/attachment.html>


More information about the llvm-bugs mailing list