[llvm-bugs] [Bug 50375] New: [X86][MMX] hoisting behavior of mmx intrinsics and _mm_empty

Mon May 17 08:53:53 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=50375

            Bug ID: 50375
           Summary: [X86][MMX] hoisting behavior of mmx intrinsics and
                    _mm_empty
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: greg.bedwell at sony.com
                CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
                    llvm-dev at redking.me.uk, pengfei.wang at intel.com,
                    spatel+llvm at rotateright.com

I'll preface this by saying that I've only observed this in code generated by
one of our random fuzzers, so as far as I am aware there is no real-world
use-case for this.  Nevertheless I thought it was worthy of at least a question
here.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

$ cat test.cpp
// ----------------------------------------------------------------------------
// From  lib/clang/13.0.0/include/mmintrin.h

// __m64: an 8-byte vector of long long (a single element), declared with
// 8-byte alignment.
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
// __v1di: the same 8-byte long long vector shape, declared without the
// explicit alignment attribute; used as the argument type for the builtin.
typedef long long __v1di __attribute__((__vector_size__(8)));

// Always-inlined wrapper around __builtin_ia32_psrlqi: casts __m to __v1di,
// passes __count through as the shift count, and casts the builtin's result
// back to __m64. In the -O2 output quoted in this report, the call in test86
// shows up as `psrlq $63, %mm0`.
static __inline__ __m64
    __attribute__((__always_inline__, __nodebug__, __target__("mmx"),
                   __min_vector_width__(64)))
    _mm_srli_si64(__m64 __m, int __count) {
  return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
}

// Always-inlined wrapper that invokes __builtin_ia32_emms(); takes no
// arguments and returns nothing. It shows up as the `emms` instruction in the
// -O2 output quoted in this report.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
    _mm_empty(void) {
  __builtin_ia32_emms();
}
// ----------------------------------------------------------------------------

// Short type aliases used by the reproducer below.
using ll = long long;
using uchar = unsigned char;

// Fuzzer-generated reproducer. Every source-level call to _mm_srli_si64 is
// immediately followed by _mm_empty(), and both sit inside the inner loop.
void test86() {
  // 1775742612 is smaller than 2^52, so the shift yields 0 (the quoted
  // assembly stores $0). The variable is volatile, so it is re-read on every
  // evaluation of the outer loop condition.
  volatile ll id18225 = 1775742612LL >> 52;
  // The outer loop body runs only while the volatile value equals 812102100.
  for (uchar id18207_idx = 0; (id18207_idx < 21) && (812102100LL == id18225);
       ++id18207_idx) {
    // Up to 100 inner iterations per outer iteration.
    for (uchar id18210_idx = 0; (id18210_idx < 100); ++id18210_idx) {
      __m64 id18213 = {1};
      // Loop-invariant shift of a constant; the result is written to a
      // volatile local on each iteration.
      volatile __m64 id18212 = _mm_srli_si64(id18213, 63);
      // Paired _mm_empty() after each use of the MMX intrinsic.
      _mm_empty();
    }
  }
}

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In theory this testcase has been written to be safe w.r.t. the mmx state. 
Every time _mm_srli_si64 is executed, we should also, in theory, execute
_mm_empty.

If you look at the clang O2 codegen though, we get this:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

$ bin\clang.exe --version
clang version 13.0.0 (https://github.com/llvm/llvm-project.git
6052a8a53559d667321637f7159353ab724a1141)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: g:\llvm-project\build\bin

$ bin\clang.exe -target x86_64-unknown-unknown -S -O2 test.cpp -o -
        .text
        .file   "test.cpp"
        .globl  _Z6test86v                      # -- Begin function _Z6test86v
        .p2align        4, 0x90
        .type   _Z6test86v,@function
_Z6test86v:                             # @_Z6test86v
        .cfi_startproc
# %bb.0:
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        movq    $0, -16(%rbp)
        xorl    %eax, %eax
        movl    $1, %ecx
        movd    %ecx, %mm0
        psrlq   $63, %mm0
        movq    %mm0, %rcx
        .p2align        4, 0x90
.LBB0_1:                                # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_3 Depth 2
        movq    -16(%rbp), %rdx
        cmpq    $812102100, %rdx                # imm = 0x3067B1D4
        jne     .LBB0_5
# %bb.2:                                #   in Loop: Header=BB0_1 Depth=1
        movb    $100, %dl
        .p2align        4, 0x90
.LBB0_3:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        movq    %rcx, -8(%rbp)
        emms
        movq    %rcx, -8(%rbp)
        emms
        movq    %rcx, -8(%rbp)
        emms
        movq    %rcx, -8(%rbp)
        emms
        movq    %rcx, -8(%rbp)
        emms
        addb    $-5, %dl
        jne     .LBB0_3
# %bb.4:                                #   in Loop: Header=BB0_1 Depth=1
        addb    $1, %al
        cmpb    $21, %al
        jne     .LBB0_1
.LBB0_5:
        popq    %rbp
        .cfi_def_cfa %rsp, 8
        retq
.Lfunc_end0:
        .size   _Z6test86v, .Lfunc_end0-_Z6test86v
        .cfi_endproc
                                        # -- End function
        .ident  "clang version 13.0.0 (https://github.com/llvm/llvm-project.git
6052a8a53559d667321637f7159353ab724a1141)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

$

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The compiler has realised that the _mm_srli_si64 is invariant and has
hoisted the psrlq out of the loop so it is now executed unconditionally.  The
emms instructions have been left inside the inner loop though, and in this
particular case due to the outer loop condition are never executed, so the
MMX state is modified by the psrlq but never cleared. My assumption is that
the emms has not been hoisted due to its hasSideEffects flag. My question is,
should every instruction that might change the mmx technology state also be
marked as hasSideEffects, or is that overkill?  Is there a better approach?
(assuming that anyone actually cares enough about MMX to implement one :) ).

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210517/c06d4635/attachment.html>