<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Double precision min/max vectorization [x86, SSE2, AVX]"
href="https://bugs.llvm.org/show_bug.cgi?id=38705">38705</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Double precision min/max vectorization [x86, SSE2, AVX]
</td>
</tr>
<tr>
<th>Product</th>
<td>clang
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>-New Bugs
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedclangbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>kobalicek.petr@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>Double precision min/max is sometimes vectorized and sometimes it isn't. I have
observed that when I use my own min/max functions clang successfully vectorizes
my sample; however, when I replace them with std::min/max the vectorization
fails for some reason, which prevents vectorization of the surrounding code.
The first `quadBoundingBoxA()` is compiled perfectly, the second
`quadBoundingBoxB()` is compiled to mixed scalar+simd code. The code is
stripped from a project that I work on, so it is a bit larger than what I would
normally post; please forgive me for that.
The main problem is that I don't know whether the std::min/max should vectorize
well or not. I would expect it to work, but the snippet provided shows the
opposite so I decided to report it instead.
// =======================================================
// Sample Code
// =======================================================
#include <algorithm>
#include <cmath>
#include <stdint.h>
// Two-component point [x, y] with double-precision coordinates.
struct Point {
  double x;
  double y;

  // Defaulted; declares no member initializers, so `Point p;` leaves the
  // coordinates indeterminate.
  Point() noexcept = default;

  // Member-wise copy.
  constexpr Point(const Point& other) noexcept = default;

  // Builds a point from explicit coordinates.
  constexpr Point(double xValue, double yValue) noexcept
    : x{xValue},
      y{yValue} {}
};
// Box described by four double-precision coordinates [x0, y0, x1, y1].
struct Box {
  double x0;
  double y0;
  double x1;
  double y1;

  // Overwrites all four coordinates in one step. The parameters shadow the
  // members, so the braced list below is built from the arguments.
  inline void reset(double x0, double y0, double x1, double y1) noexcept {
    *this = Box{x0, y0, x1, y1};
  }
};
// Overloads to make vector processing simpler.
// Unary minus: negates both coordinates.
static constexpr Point operator-(const Point& p) noexcept {
  return Point{-p.x, -p.y};
}
// Point (op) scalar: the scalar on the right is applied to x and to y.
static constexpr Point operator+(const Point& p, double s) noexcept {
  return Point{p.x + s, p.y + s};
}

static constexpr Point operator-(const Point& p, double s) noexcept {
  return Point{p.x - s, p.y - s};
}

static constexpr Point operator*(const Point& p, double s) noexcept {
  return Point{p.x * s, p.y * s};
}

static constexpr Point operator/(const Point& p, double s) noexcept {
  return Point{p.x / s, p.y / s};
}
// Point (op) Point: component-wise, x with x and y with y.
static constexpr Point operator+(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x + rhs.x, lhs.y + rhs.y};
}

static constexpr Point operator-(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x - rhs.x, lhs.y - rhs.y};
}

static constexpr Point operator*(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x * rhs.x, lhs.y * rhs.y};
}

static constexpr Point operator/(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x / rhs.x, lhs.y / rhs.y};
}
// scalar (op) Point: the scalar is the left operand for both coordinates.
static constexpr Point operator+(double s, const Point& p) noexcept {
  return Point{s + p.x, s + p.y};
}

static constexpr Point operator-(double s, const Point& p) noexcept {
  return Point{s - p.x, s - p.y};
}

static constexpr Point operator*(double s, const Point& p) noexcept {
  return Point{s * p.x, s * p.y};
}

static constexpr Point operator/(double s, const Point& p) noexcept {
  return Point{s / p.x, s / p.y};
}
// Min/Max - different semantics compared to std.
// myMin yields `b` only when `b < a` holds and `a` in every other case; the
// result is returned by value (std::min returns a const reference).
template<typename T>
constexpr T myMin(const T& a, const T& b) noexcept {
  if (b < a) {
    return b;
  }
  return a;
}
// myMax yields `b` only when `a < b` holds and `a` in every other case; the
// result is returned by value.
template<typename T>
constexpr T myMax(const T& a, const T& b) noexcept {
  if (a < b) {
    return b;
  }
  return a;
}
// Linear interpolation: weights `a` by (1.0 - t) and `b` by t, then adds them.
// Works for any V and T for which `1.0 - t`, `V * (that result)`, `V * T` and
// the final addition are defined, e.g. Point with a Point-valued `t`.
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept {
  const auto weightOfA = 1.0 - t;
  return a * weightOfA + b * t;
}
// Merges point `p` into `box`: x0/y0 become the smaller of their current value
// and the point's coordinate, x1/y1 the larger.
inline void boxMergePoint(Box& box, const Point& p) noexcept {
  const double px = p.x;
  const double py = p.y;

  // Lower bounds.
  box.x0 = myMin(box.x0, px);
  box.y0 = myMin(box.y0, py);

  // Upper bounds.
  box.x1 = myMax(box.x1, px);
  box.y1 = myMax(box.y1, py);
}
// THIS CODE COMPILES GREAT.
// Writes into `bBox` a box built from the three points `bez[0..2]`
// (presumably the control points of a quadratic Bezier, going by the names).
// The box starts as the bounds of bez[0] and bez[2] and is then extended by
// one more point evaluated at a per-axis parameter t clamped to [0, 1].
// This variant clamps with the by-value myMin/myMax helpers.
void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
// Bounding box of start and end points.
bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y),
myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y));
// Per-axis parameter (p0 - p1) / (p0 - 2*p1 + p2); x and y are computed
// together through the Point operators.
Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);
// Clamp each component of t: first from below at 0.0, then from above at 1.0.
t.x = myMax(t.x, 0.0);
t.y = myMax(t.y, 0.0);
t.x = myMin(t.x, 1.0);
t.y = myMin(t.y, 1.0);
// Interpolate bez[0]->bez[1] and bez[1]->bez[2] at t, interpolate those two
// results at t again, and merge the resulting point into the box.
boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
lerp(bez[1], bez[2], t), t));
}
// THIS CODE DOESN'T AUTOVECTORIZE WELL.
// Same computation as quadBoundingBoxA, except that the clamp of t uses
// std::max/std::min instead of myMax/myMin. The std functions take and return
// `const double&`, so each call yields a reference to one of its two
// arguments (t.x / t.y or the temporary made from the literal) rather than a
// value; the comparison that picks the operand is the same as in
// myMax/myMin.
void quadBoundingBoxB(const Point bez[3], Box& bBox) noexcept {
// Bounding box of start and end points.
bBox.reset(std::min(bez[0].x, bez[2].x), std::min(bez[0].y, bez[2].y),
std::max(bez[0].x, bez[2].x), std::max(bez[0].y, bez[2].y));
// Per-axis parameter (p0 - p1) / (p0 - 2*p1 + p2); x and y are computed
// together through the Point operators.
Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);
// ------- THE MAIN DIFFERENCE -------
// Clamp each component of t: first from below at 0.0, then from above at 1.0.
t.x = std::max(t.x, 0.0);
t.y = std::max(t.y, 0.0);
t.x = std::min(t.x, 1.0);
t.y = std::min(t.y, 1.0);
// ------- THE MAIN DIFFERENCE -------
// Interpolate bez[0]->bez[1] and bez[1]->bez[2] at t, interpolate those two
// results at t again, and merge the resulting point into the box.
boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
lerp(bez[1], bez[2], t), t));
}
// =======================================================
// quadBoundingBoxA [-std=c++17 -O3 -mavx2 -fno-math-errno]
// =======================================================
# This code is perfect, everything vectorized as intended.
# Constant pool: two doubles equal to 1.0, used as the upper clamp bound for
# both lanes of t.
.LCPI0_0:
.quad 4607182418800017408 # double 1
.quad 4607182418800017408 # double 1
quadBoundingBoxA(Point const*, Box&): # @quadBoundingBoxA(Point const*,
Box&)
# rdi is read as bez (three 16-byte points), rsi is written as the output box.
# xmm0 = bez[0], xmm1 = bez[1], xmm2 = bez[2]; each register holds {x, y}.
vmovupd xmm0, xmmword ptr [rdi]
vmovupd xmm1, xmmword ptr [rdi + 16]
vmovupd xmm2, xmmword ptr [rdi + 32]
# Bounds of bez[0] and bez[2]: xmm3 = packed min, xmm4 = packed max.
vminpd xmm3, xmm2, xmm0
vmaxpd xmm4, xmm2, xmm0
# t = (bez[0] - bez[1]) / (bez[0] - 2 * bez[1] + bez[2]), both lanes at once;
# the multiplication by 2.0 is done as bez[1] + bez[1].
vsubpd xmm5, xmm0, xmm1
vaddpd xmm6, xmm1, xmm1
vsubpd xmm6, xmm0, xmm6
vaddpd xmm6, xmm2, xmm6
vdivpd xmm5, xmm5, xmm6
# Clamp t: packed max against zero, then packed min against 1.0.
vxorpd xmm6, xmm6, xmm6
vmaxpd xmm5, xmm6, xmm5
vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 =
[1.000000e+00,1.000000e+00]
vminpd xmm5, xmm6, xmm5
# xmm6 = 1.0 - t; the three lerps follow as packed multiplies and adds.
vsubpd xmm6, xmm6, xmm5
vmulpd xmm0, xmm0, xmm6
vmulpd xmm7, xmm1, xmm5
vaddpd xmm0, xmm7, xmm0
vmulpd xmm1, xmm1, xmm6
vmulpd xmm2, xmm2, xmm5
vaddpd xmm1, xmm2, xmm1
vmulpd xmm0, xmm6, xmm0
vmulpd xmm1, xmm5, xmm1
vaddpd xmm0, xmm0, xmm1
# Merge the interpolated point with the end-point bounds and store the two
# halves of the box: {x0, y0} at [rsi] and {x1, y1} at [rsi + 16].
vminpd xmm1, xmm0, xmm3
vmovupd xmmword ptr [rsi], xmm1
vmaxpd xmm0, xmm0, xmm4
vmovupd xmmword ptr [rsi + 16], xmm0
ret
// =======================================================
// quadBoundingBoxB [-std=c++17 -O3 -mavx2 -fno-math-errno]
// =======================================================
# This code contains mixed scalar and vector code, missed opportunity.
# Constant pool: two doubles equal to 1.0, used as the upper clamp bound for
# both lanes of t.
.LCPI1_0:
.quad 4607182418800017408 # double 1
.quad 4607182418800017408 # double 1
quadBoundingBoxB(Point const*, Box&): # @quadBoundingBoxB(Point const*,
Box&)
# rdi is read as bez (three 16-byte points), rsi is written as the output box.
# xmm3 = bez[0], xmm2 = bez[2]; the end-point bounds are still computed packed
# (xmm9 = min, xmm8 = max).
vmovupd xmm3, xmmword ptr [rdi]
vmovupd xmm2, xmmword ptr [rdi + 32]
vminpd xmm9, xmm2, xmm3
vmaxpd xmm8, xmm2, xmm3
vmovupd xmm4, xmmword ptr [rdi + 16]
# From here t is computed one lane at a time with scalar instructions;
# vpermilpd swaps the lanes so that the y components land in the low lane.
# Result: xmm1 = t.x, xmm0 = t.y.
vsubsd xmm5, xmm3, xmm4
vpermilpd xmm6, xmm4, 1 # xmm6 = xmm4[1,0]
vpermilpd xmm7, xmm3, 1 # xmm7 = xmm3[1,0]
vsubsd xmm0, xmm7, xmm6
vaddsd xmm1, xmm4, xmm4
vaddsd xmm6, xmm6, xmm6
vsubsd xmm1, xmm3, xmm1
vsubsd xmm6, xmm7, xmm6
vaddsd xmm1, xmm2, xmm1
vdivsd xmm1, xmm5, xmm1
vpermilpd xmm5, xmm2, 1 # xmm5 = xmm2[1,0]
vaddsd xmm5, xmm5, xmm6
vdivsd xmm0, xmm0, xmm5
# t.x and t.y are spilled to the stack ([rsp - 16] and [rsp - 24]).
vmovsd qword ptr [rsp - 16], xmm1
vmovsd qword ptr [rsp - 24], xmm0
# Lower clamp, done on addresses instead of values: 0.0 is compared with t.x,
# cmova picks either the address of the t.x slot or the address of the slot at
# [rsp - 8] (which is written with 0), and the result is loaded through the
# chosen pointer. This matches std::max returning a reference to one of its
# arguments. The same sequence is repeated for t.y.
vxorpd xmm5, xmm5, xmm5
vucomisd xmm5, xmm1
lea rax, [rsp - 8]
lea rcx, [rsp - 16]
cmova rcx, rax
mov qword ptr [rsp - 8], 0
mov rcx, qword ptr [rcx]
vucomisd xmm5, xmm0
lea rdx, [rsp - 24]
cmova rdx, rax
mov qword ptr [rsp - 16], rcx
mov qword ptr [rsp - 8], 0
mov rax, qword ptr [rdx]
mov qword ptr [rsp - 24], rax
# The two clamped scalars are packed back into vector registers.
vmovsd xmm0, qword ptr [rsp - 16] # xmm0 = mem[0],zero
vmovq xmm1, rcx
vmovq xmm5, rax
vpunpcklqdq xmm1, xmm1, xmm5 # xmm1 = xmm1[0],xmm5[0]
# Upper clamp, packed: build the mask (1.0 < t) and blend in 1.0 where it is set.
vmovapd xmm5, xmmword ptr [rip + .LCPI1_0] # xmm5 =
[1.000000e+00,1.000000e+00]
vcmpltpd xmm1, xmm5, xmm1
vmovhpd xmm0, xmm0, qword ptr [rsp - 24] # xmm0 = xmm0[0],mem[0]
vblendvpd xmm0, xmm0, xmm5, xmm1
# xmm1 = 1.0 - t; the three lerps follow as packed multiplies and adds.
vsubpd xmm1, xmm5, xmm0
vmulpd xmm3, xmm3, xmm1
vmulpd xmm5, xmm4, xmm0
vaddpd xmm3, xmm5, xmm3
vmulpd xmm4, xmm4, xmm1
vmulpd xmm2, xmm2, xmm0
vaddpd xmm2, xmm2, xmm4
vmulpd xmm1, xmm1, xmm3
vmulpd xmm0, xmm2, xmm0
vaddpd xmm0, xmm1, xmm0
# Merge the interpolated point with the end-point bounds and store the two
# halves of the box: {x0, y0} at [rsi] and {x1, y1} at [rsi + 16].
vminpd xmm1, xmm0, xmm9
vmovupd xmmword ptr [rsi], xmm1
vmaxpd xmm0, xmm0, xmm8
vmovupd xmmword ptr [rsi + 16], xmm0
ret</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>