[llvm-bugs] [Bug 50375] New: [X86][MMX] hoisting behavior of mmx intrinsics and _mm_empty
via llvm-bugs
llvm-bugs at lists.llvm.org
Mon May 17 08:53:53 PDT 2021
https://bugs.llvm.org/show_bug.cgi?id=50375
Bug ID: 50375
Summary: [X86][MMX] hoisting behavior of mmx intrinsics and
_mm_empty
Product: libraries
Version: trunk
Hardware: PC
OS: Windows NT
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: greg.bedwell at sony.com
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, pengfei.wang at intel.com,
spatel+llvm at rotateright.com
I'll preface this by saying that I've only observed this in code generated by
one of our random fuzzers, so as far as I am aware there is no real-world
use-case for this. Nevertheless I thought it was worthy of at least a question
here.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
$ cat test.cpp
// ----------------------------------------------------------------------------
// From lib/clang/13.0.0/include/mmintrin.h
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));
static __inline__ __m64
__attribute__((__always_inline__, __nodebug__, __target__("mmx"),
__min_vector_width__(64)))
_mm_srli_si64(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
}
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void) {
__builtin_ia32_emms();
}
// ----------------------------------------------------------------------------
using ll = long long;
using uchar = unsigned char;
void test86() {
volatile ll id18225 = 1775742612LL >> 52;
for (uchar id18207_idx = 0; (id18207_idx < 21) && (812102100LL == id18225);
++id18207_idx) {
for (uchar id18210_idx = 0; (id18210_idx < 100); ++id18210_idx) {
__m64 id18213 = {1};
volatile __m64 id18212 = _mm_srli_si64(id18213, 63);
_mm_empty();
}
}
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In theory this testcase has been written to be safe w.r.t. the mmx state.
Every time _mm_srli_si64 is executed, we should also, in theory, execute
_mm_empty.
If you look at the clang O2 codegen though, we get this:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
$ bin\clang.exe --version
clang version 13.0.0 (https://github.com/llvm/llvm-project.git
6052a8a53559d667321637f7159353ab724a1141)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: g:\llvm-project\build\bin
$ bin\clang.exe -target x86_64-unknown-unknown -S -O2 test.cpp -o -
.text
.file "test.cpp"
.globl _Z6test86v # -- Begin function _Z6test86v
.p2align 4, 0x90
.type _Z6test86v, at function
_Z6test86v: # @_Z6test86v
.cfi_startproc
# %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
movq $0, -16(%rbp)
xorl %eax, %eax
movl $1, %ecx
movd %ecx, %mm0
psrlq $63, %mm0
movq %mm0, %rcx
.p2align 4, 0x90
.LBB0_1: # =>This Loop Header: Depth=1
# Child Loop BB0_3 Depth 2
movq -16(%rbp), %rdx
cmpq $812102100, %rdx # imm = 0x3067B1D4
jne .LBB0_5
# %bb.2: # in Loop: Header=BB0_1 Depth=1
movb $100, %dl
.p2align 4, 0x90
.LBB0_3: # Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
movq %rcx, -8(%rbp)
emms
movq %rcx, -8(%rbp)
emms
movq %rcx, -8(%rbp)
emms
movq %rcx, -8(%rbp)
emms
movq %rcx, -8(%rbp)
emms
addb $-5, %dl
jne .LBB0_3
# %bb.4: # in Loop: Header=BB0_1 Depth=1
addb $1, %al
cmpb $21, %al
jne .LBB0_1
.LBB0_5:
popq %rbp
.cfi_def_cfa %rsp, 8
retq
.Lfunc_end0:
.size _Z6test86v, .Lfunc_end0-_Z6test86v
.cfi_endproc
# -- End function
.ident "clang version 13.0.0 (https://github.com/llvm/llvm-project.git
6052a8a53559d667321637f7159353ab724a1141)"
.section ".note.GNU-stack","", at progbits
.addrsig
$
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The compiler has realised that that the _mm_srli_si64 is invariant and has
hoisted the psrlq out of the loop so it is now executed unconditionally. The
emms instructions have been left inside the inner loop though, and in this
particular case due to the outer loop condition are never executed. My
assumption is that it's not been hoisted due to the hassideeffects flag. My
question is, should every instruction that might change the mmx technology
state also be marked as hassideeffects, or is that an overkill? Is there a
better approach? (assuming that anyone actually cares enough about MMX to
implement one :) ).
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210517/c06d4635/attachment.html>
More information about the llvm-bugs
mailing list