[llvm-bugs] [Bug 38705] New: Double precision min/max vectorization [x86, SSE2, AVX]
via llvm-bugs
llvm-bugs at lists.llvm.org
Sat Aug 25 15:36:16 PDT 2018
https://bugs.llvm.org/show_bug.cgi?id=38705
Bug ID: 38705
Summary: Double precision min/max vectorization [x86, SSE2, AVX]
Product: clang
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: -New Bugs
Assignee: unassignedclangbugs at nondot.org
Reporter: kobalicek.petr at gmail.com
CC: llvm-bugs at lists.llvm.org
Double-precision min/max is sometimes vectorized and sometimes is not. I have
observed that when I use my own min/max functions, clang vectorizes my sample
successfully; however, when I replace them with std::min/max, the vectorization
fails for some reason, which prevents vectorization of the surrounding code.
The first function, `quadBoundingBoxA()`, is compiled perfectly; the second,
`quadBoundingBoxB()`, is compiled to mixed scalar+SIMD code. The code is
stripped from a project that I work on, so it is a bit larger than what I would
normally post; please forgive me for that.
The main problem is that I don't know whether std::min/max should vectorize
well or not. I would expect it to work, but the snippet provided shows the
opposite, so I decided to report it.
// =======================================================
// Sample Code
// =======================================================
#include <algorithm>
#include <cmath>
#include <stdint.h>
// Point structure [x, y]
// 2D point with double-precision coordinates.
struct Point {
  double x, y;

  // Builds a point from explicit coordinates.
  constexpr Point(double xValue, double yValue) noexcept
    : x{xValue},
      y{yValue} {}

  // Copying is member-wise and usable in constant expressions.
  constexpr Point(const Point&) noexcept = default;

  // Default construction leaves both coordinates uninitialized.
  Point() noexcept = default;
};
// Box structure [x0, y0, x1, y1]
// Axis-aligned box stored as two corners: [x0, y0] and [x1, y1].
struct Box {
  double x0, y0, x1, y1;

  // Overwrites all four coordinates at once.
  // The parameters shadow the members, so the braces below pick up the
  // arguments, not the current member values.
  void reset(double x0, double y0, double x1, double y1) noexcept {
    *this = Box{x0, y0, x1, y1};
  }
};
// Overloads to make vector processing simpler.
// --- Unary negation ---------------------------------------------------------
static constexpr Point operator-(const Point& a) noexcept {
  return Point{-a.x, -a.y};
}

// --- Point (op) scalar: the scalar is applied to both coordinates -----------
static constexpr Point operator+(const Point& a, double b) noexcept {
  return Point{a.x + b, a.y + b};
}
static constexpr Point operator-(const Point& a, double b) noexcept {
  return Point{a.x - b, a.y - b};
}
static constexpr Point operator*(const Point& a, double b) noexcept {
  return Point{a.x * b, a.y * b};
}
static constexpr Point operator/(const Point& a, double b) noexcept {
  return Point{a.x / b, a.y / b};
}

// --- Point (op) Point: component-wise ---------------------------------------
static constexpr Point operator+(const Point& a, const Point& b) noexcept {
  return Point{a.x + b.x, a.y + b.y};
}
static constexpr Point operator-(const Point& a, const Point& b) noexcept {
  return Point{a.x - b.x, a.y - b.y};
}
static constexpr Point operator*(const Point& a, const Point& b) noexcept {
  return Point{a.x * b.x, a.y * b.y};
}
static constexpr Point operator/(const Point& a, const Point& b) noexcept {
  return Point{a.x / b.x, a.y / b.y};
}

// --- scalar (op) Point: the scalar is the left operand for both coordinates -
static constexpr Point operator+(double a, const Point& b) noexcept {
  return Point{a + b.x, a + b.y};
}
static constexpr Point operator-(double a, const Point& b) noexcept {
  return Point{a - b.x, a - b.y};
}
static constexpr Point operator*(double a, const Point& b) noexcept {
  return Point{a * b.x, a * b.y};
}
static constexpr Point operator/(double a, const Point& b) noexcept {
  return Point{a / b.x, a / b.y};
}
// Min/Max - different semantics compared to std.
// Returns a copy of the smaller argument: `b` only when `b < a`, otherwise `a`
// (so `a` wins when the two compare equal or the comparison is false for any
// other reason). The result is returned by value, whereas std::min returns a
// const reference to one of its arguments.
template<typename T> constexpr T myMin(const T& a, const T& b) noexcept {
return b < a ? b : a; }
// Returns a copy of the larger argument: `b` only when `a < b`, otherwise `a`
// (so `a` wins when the two compare equal or the comparison is false for any
// other reason). The result is returned by value, whereas std::max returns a
// const reference to one of its arguments.
template<typename T> constexpr T myMax(const T& a, const T& b) noexcept {
return a < b ? b : a; }
// Linear interpolation, works with points as well.
// Blends `a` and `b` with weight `t`: the result is a * (1 - t) + b * t.
// `t` may be a scalar or any type for which `1.0 - t`, `a * (1.0 - t)` and
// `b * t` are defined (e.g. a Point, giving a separate weight per coordinate).
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept {
  const auto complement = 1.0 - t;
  // Both products and the sum stay in one expression, as in the original.
  return (a * complement) + (b * t);
}
// Merge a point into a box by possibly increasing its bounds.
// Updates `box` in place from point `p`:
//   - x0 / y0 are replaced by p.x / p.y when the point's coordinate is smaller,
//   - x1 / y1 are replaced by p.x / p.y when the point's coordinate is larger.
// Each bound is handled independently through myMin / myMax.
inline void boxMergePoint(Box& box, const Point& p) noexcept {
// Lower bounds: keep the smaller of the current bound and the point.
box.x0 = myMin(box.x0, p.x);
box.y0 = myMin(box.y0, p.y);
// Upper bounds: keep the larger of the current bound and the point.
box.x1 = myMax(box.x1, p.x);
box.y1 = myMax(box.y1, p.y);
}
// THIS CODE COMPILES GREAT.
// Writes into `bBox` a box built from the three points bez[0..2], using the
// by-value myMin / myMax helpers throughout.
//   bez  - three points; presumably the control points of a quadratic Bezier
//          curve (start, control, end) - confirm against the original project.
//   bBox - output; fully overwritten by reset() and then extended by one point.
void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
// Bounding box of start and end points.
bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y),
myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y));
// Per-coordinate parameter: (p0 - p1) / (p0 - 2*p1 + p2). No guard against a
// zero denominator is present here.
Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);
// Clamp each component of t to [0, 1]: lower bound first, then upper bound.
t.x = myMax(t.x, 0.0);
t.y = myMax(t.y, 0.0);
t.x = myMin(t.x, 1.0);
t.y = myMin(t.y, 1.0);
// Nested lerps evaluate the point at parameter t (a separate t per coordinate)
// and that point is merged into the box.
boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
lerp(bez[1], bez[2], t), t));
}
// THIS CODE DOESN'T AUTOVECTORIZE WELL.
// Same computation as quadBoundingBoxA(), except that every min/max call
// written directly in this function (both in the reset() arguments and in the
// clamp of t) uses std::min / std::max instead of myMin / myMax.
// boxMergePoint() is unchanged and still uses myMin / myMax internally.
//   bez  - three points; presumably the control points of a quadratic Bezier
//          curve (start, control, end) - confirm against the original project.
//   bBox - output; fully overwritten by reset() and then extended by one point.
void quadBoundingBoxB(const Point bez[3], Box& bBox) noexcept {
// Bounding box of start and end points.
bBox.reset(std::min(bez[0].x, bez[2].x), std::min(bez[0].y, bez[2].y),
std::max(bez[0].x, bez[2].x), std::max(bez[0].y, bez[2].y));
// Per-coordinate parameter: (p0 - p1) / (p0 - 2*p1 + p2). No guard against a
// zero denominator is present here.
Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);
// ------- THE MAIN DIFFERENCE -------
// std::max / std::min return a const reference to one of their arguments (the
// member of t or the temporary holding the literal); the referenced value is
// copied into t by the assignment. NOTE(review): this reference-returning form
// is presumably what leads to the address selection (lea / cmova + load) seen
// in the listing for this function - confirm with the optimizer remarks.
t.x = std::max(t.x, 0.0);
t.y = std::max(t.y, 0.0);
t.x = std::min(t.x, 1.0);
t.y = std::min(t.y, 1.0);
// ------- THE MAIN DIFFERENCE -------
// Nested lerps evaluate the point at parameter t (a separate t per coordinate)
// and that point is merged into the box.
boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
lerp(bez[1], bez[2], t), t));
}
// =======================================================
// quadBoundingBoxA [-std=c++17 -O3 -mavx2 -fno-math-errno]
// =======================================================
# This code is perfect, everything vectorized as intended.
.LCPI0_0:
.quad 4607182418800017408 # double 1
.quad 4607182418800017408 # double 1
quadBoundingBoxA(Point const*, Box&): # @quadBoundingBoxA(Point const*, Box&)
vmovupd xmm0, xmmword ptr [rdi]
vmovupd xmm1, xmmword ptr [rdi + 16]
vmovupd xmm2, xmmword ptr [rdi + 32]
vminpd xmm3, xmm2, xmm0
vmaxpd xmm4, xmm2, xmm0
vsubpd xmm5, xmm0, xmm1
vaddpd xmm6, xmm1, xmm1
vsubpd xmm6, xmm0, xmm6
vaddpd xmm6, xmm2, xmm6
vdivpd xmm5, xmm5, xmm6
vxorpd xmm6, xmm6, xmm6
vmaxpd xmm5, xmm6, xmm5
vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 = [1.000000e+00,1.000000e+00]
vminpd xmm5, xmm6, xmm5
vsubpd xmm6, xmm6, xmm5
vmulpd xmm0, xmm0, xmm6
vmulpd xmm7, xmm1, xmm5
vaddpd xmm0, xmm7, xmm0
vmulpd xmm1, xmm1, xmm6
vmulpd xmm2, xmm2, xmm5
vaddpd xmm1, xmm2, xmm1
vmulpd xmm0, xmm6, xmm0
vmulpd xmm1, xmm5, xmm1
vaddpd xmm0, xmm0, xmm1
vminpd xmm1, xmm0, xmm3
vmovupd xmmword ptr [rsi], xmm1
vmaxpd xmm0, xmm0, xmm4
vmovupd xmmword ptr [rsi + 16], xmm0
ret
// =======================================================
// quadBoundingBoxB [-std=c++17 -O3 -mavx2 -fno-math-errno]
// =======================================================
# This code contains mixed scalar and vector code, missed opportunity.
.LCPI1_0:
.quad 4607182418800017408 # double 1
.quad 4607182418800017408 # double 1
quadBoundingBoxB(Point const*, Box&): # @quadBoundingBoxB(Point const*, Box&)
vmovupd xmm3, xmmword ptr [rdi]
vmovupd xmm2, xmmword ptr [rdi + 32]
vminpd xmm9, xmm2, xmm3
vmaxpd xmm8, xmm2, xmm3
vmovupd xmm4, xmmword ptr [rdi + 16]
vsubsd xmm5, xmm3, xmm4
vpermilpd xmm6, xmm4, 1 # xmm6 = xmm4[1,0]
vpermilpd xmm7, xmm3, 1 # xmm7 = xmm3[1,0]
vsubsd xmm0, xmm7, xmm6
vaddsd xmm1, xmm4, xmm4
vaddsd xmm6, xmm6, xmm6
vsubsd xmm1, xmm3, xmm1
vsubsd xmm6, xmm7, xmm6
vaddsd xmm1, xmm2, xmm1
vdivsd xmm1, xmm5, xmm1
vpermilpd xmm5, xmm2, 1 # xmm5 = xmm2[1,0]
vaddsd xmm5, xmm5, xmm6
vdivsd xmm0, xmm0, xmm5
vmovsd qword ptr [rsp - 16], xmm1
vmovsd qword ptr [rsp - 24], xmm0
vxorpd xmm5, xmm5, xmm5
vucomisd xmm5, xmm1
lea rax, [rsp - 8]
lea rcx, [rsp - 16]
cmova rcx, rax
mov qword ptr [rsp - 8], 0
mov rcx, qword ptr [rcx]
vucomisd xmm5, xmm0
lea rdx, [rsp - 24]
cmova rdx, rax
mov qword ptr [rsp - 16], rcx
mov qword ptr [rsp - 8], 0
mov rax, qword ptr [rdx]
mov qword ptr [rsp - 24], rax
vmovsd xmm0, qword ptr [rsp - 16] # xmm0 = mem[0],zero
vmovq xmm1, rcx
vmovq xmm5, rax
vpunpcklqdq xmm1, xmm1, xmm5 # xmm1 = xmm1[0],xmm5[0]
vmovapd xmm5, xmmword ptr [rip + .LCPI1_0] # xmm5 = [1.000000e+00,1.000000e+00]
vcmpltpd xmm1, xmm5, xmm1
vmovhpd xmm0, xmm0, qword ptr [rsp - 24] # xmm0 = xmm0[0],mem[0]
vblendvpd xmm0, xmm0, xmm5, xmm1
vsubpd xmm1, xmm5, xmm0
vmulpd xmm3, xmm3, xmm1
vmulpd xmm5, xmm4, xmm0
vaddpd xmm3, xmm5, xmm3
vmulpd xmm4, xmm4, xmm1
vmulpd xmm2, xmm2, xmm0
vaddpd xmm2, xmm2, xmm4
vmulpd xmm1, xmm1, xmm3
vmulpd xmm0, xmm2, xmm0
vaddpd xmm0, xmm1, xmm0
vminpd xmm1, xmm0, xmm9
vmovupd xmmword ptr [rsi], xmm1
vmaxpd xmm0, xmm0, xmm8
vmovupd xmmword ptr [rsi + 16], xmm0
ret
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180825/023e979d/attachment.html>
More information about the llvm-bugs
mailing list