[Libclc-dev] [PATCH v2 1/1] Implement generic mad_sat

Wed Aug 20 07:39:47 PDT 2014

Hi Jan,

I noticed that this isn't in yet (probably because the underlying LLVM
issue is still being worked on).

I found another issue while running a modified piglit run for my final
vload/vstore patch.  I had enabled 3-element vectors in piglit to
fully test the vload3/vstore3 changes, and it turns out that mad_sat
doesn't currently support 3-element vectors.

The generic/lib/integer/mad_sat implementation defines mad_sat with
3-element vector types, but the
generic/include/clc/integer/integer-gentype.inc file doesn't define
int3 or uint3.  This manifests itself as a crash when compiling
kernels that use mad_sat(int3, int3, int3). I suspect mul24 and mad24
also are similarly affected (and I can check that if needed when I
have some time).

Given that 3-element vectors were added in OpenCL 1.1, and the
libclc.llvm.org page lists libclc as supporting CL 1.1, we should
probably just add that definition in to that file while we're here.

--Aaron

On Tue, Aug 5, 2014 at 5:32 PM, Jan Vesely <jan.vesely at rutgers.edu> wrote:
> On Tue, 2014-08-05 at 14:51 -0500, Aaron Watry wrote:
>> On Tue, Aug 5, 2014 at 10:06 AM, Jan Vesely <jan.vesely at rutgers.edu> wrote:
>> > v2: Fix trailing whitespace
>> >     Fix signed long overflow
>> >     improve comment
>> >
>> > Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
>> > ---
>> >
>> > NOTE: Use http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20140804/229312.html to avoid llvm segfault
>> >
>> >  generic/include/clc/clc.h               |  1 +
>> >  generic/include/clc/integer/mad_sat.h   |  3 ++
>> >  generic/include/clc/integer/mad_sat.inc |  1 +
>> >  generic/lib/SOURCES                     |  1 +
>> >  generic/lib/clcmacro.h                  | 22 ++++++++++
>> >  generic/lib/integer/mad_sat.cl          | 72 +++++++++++++++++++++++++++++++++
>> >  6 files changed, 100 insertions(+)
>> >  create mode 100644 generic/include/clc/integer/mad_sat.h
>> >  create mode 100644 generic/include/clc/integer/mad_sat.inc
>> >  create mode 100644 generic/lib/integer/mad_sat.cl
>> >
>> > diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
>> > index 9815c56..aca9b53 100644
>> > --- a/generic/include/clc/clc.h
>> > +++ b/generic/include/clc/clc.h
>> > @@ -82,6 +82,7 @@
>> >  #include <clc/integer/hadd.h>
>> >  #include <clc/integer/mad24.h>
>> >  #include <clc/integer/mad_hi.h>
>> > +#include <clc/integer/mad_sat.h>
>> >  #include <clc/integer/mul24.h>
>> >  #include <clc/integer/mul_hi.h>
>> >  #include <clc/integer/rhadd.h>
>> > diff --git a/generic/include/clc/integer/mad_sat.h b/generic/include/clc/integer/mad_sat.h
>> > new file mode 100644
>> > index 0000000..3e92372
>> > --- /dev/null
>> > +++ b/generic/include/clc/integer/mad_sat.h
>> > @@ -0,0 +1,3 @@
>> > +#define __CLC_BODY <clc/integer/mad_sat.inc>
>> > +#include <clc/integer/gentype.inc>
>> > +#undef __CLC_BODY
>> > diff --git a/generic/include/clc/integer/mad_sat.inc b/generic/include/clc/integer/mad_sat.inc
>> > new file mode 100644
>> > index 0000000..5da2bdf
>> > --- /dev/null
>> > +++ b/generic/include/clc/integer/mad_sat.inc
>> > @@ -0,0 +1 @@
>> > +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad_sat(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z);
>> > diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
>> > index bfdec7b..7d3fa6b 100644
>> > --- a/generic/lib/SOURCES
>> > +++ b/generic/lib/SOURCES
>> > @@ -19,6 +19,7 @@ integer/clz_if.ll
>> >  integer/clz_impl.ll
>> >  integer/hadd.cl
>> >  integer/mad24.cl
>> > +integer/mad_sat.cl
>> >  integer/mul24.cl
>> >  integer/mul_hi.cl
>> >  integer/rhadd.cl
>> > diff --git a/generic/lib/clcmacro.h b/generic/lib/clcmacro.h
>> > index 730073a..ef102ea 100644
>> > --- a/generic/lib/clcmacro.h
>> > +++ b/generic/lib/clcmacro.h
>> > @@ -41,6 +41,28 @@
>> >      return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
>> >    }
>> >
>> > +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE, ARG3_TYPE) \
>> > +  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, ARG3_TYPE##2 z) { \
>> > +    return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
>> > +  } \
>> > +\
>> > +  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, ARG3_TYPE##3 z) { \
>> > +    return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
>> > +                         FUNCTION(x.z, y.z, z.z)); \
>> > +  } \
>> > +\
>> > +  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, ARG3_TYPE##4 z) { \
>> > +    return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
>> > +  } \
>> > +\
>> > +  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, ARG3_TYPE##8 z) { \
>> > +    return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
>> > +  } \
>> > +\
>> > +  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, ARG3_TYPE##16 z) { \
>> > +    return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), FUNCTION(x.hi, y.hi, z.hi)); \
>> > +  }
>> > +
>> >  #define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
>> >  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
>> >    return BUILTIN(x, y); \
>> > diff --git a/generic/lib/integer/mad_sat.cl b/generic/lib/integer/mad_sat.cl
>> > new file mode 100644
>> > index 0000000..7048931
>> > --- /dev/null
>> > +++ b/generic/lib/integer/mad_sat.cl
>> > @@ -0,0 +1,72 @@
>> > +#include <clc/clc.h>
>> > +#include "../clcmacro.h"
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF char mad_sat(char x, char y, char z) {
>> > +  return clamp((short)mad24((short)x, (short)y, (short)z), (short)CHAR_MIN, (short) CHAR_MAX);
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF uchar mad_sat(uchar x, uchar y, uchar z) {
>> > +  return clamp((ushort)mad24((ushort)x, (ushort)y, (ushort)z), (ushort)0, (ushort) UCHAR_MAX);
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF short mad_sat(short x, short y, short z) {
>> > +  return clamp((int)mad24((int)x, (int)y, (int)z), (int)SHRT_MIN, (int) SHRT_MAX);
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF ushort mad_sat(ushort x, ushort y, ushort z) {
>> > +  return clamp((uint)mad24((uint)x, (uint)y, (uint)z), (uint)0, (uint) USHRT_MAX);
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF int mad_sat(int x, int y, int z) {
>> > +  int mhi = mul_hi(x, y);
>> > +  uint mlo = x * y;
>> > +  long m = upsample(mhi, mlo);
>> > +  m += z;
>> > +  if (m > INT_MAX)
>> > +    return INT_MAX;
>> > +  if (m < INT_MIN)
>> > +    return INT_MIN;
>> > +  return m;
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF uint mad_sat(uint x, uint y, uint z) {
>> > +  if (mul_hi(x, y) != 0)
>> > +    return UINT_MAX;
>> > +  return add_sat(x * y, z);
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF long mad_sat(long x, long y, long z) {
>> > +  long hi = mul_hi(x, y);
>> > +  ulong ulo = x * y;
>> > +  long  slo = x * y;
>> > +  /* Big overflow of more than 2 bits, add can't fix this */
>> > +  if (((x < 0) == (y < 0)) && hi != 0)
>> > +    return LONG_MAX;
>> > +  /* Low overflow in mul and z not neg enough to correct it */
>> > +  if (hi == 0 && ulo >= LONG_MAX && (z > 0 || (ulo + z) > LONG_MAX))
>> > +    return LONG_MAX;
>> > +  /* Big overflow of more than 2 bits, add can't fix this */
>> > +  if (((x < 0) != (y < 0)) && hi != -1)
>> > +    return LONG_MIN;
>> > +  /* Low overflow in mul and z not pos enough to correct it */
>> > +  if (hi == -1 && ulo <= ((ulong)LONG_MAX + 1UL) && (z < 0 || z < (LONG_MAX - ulo)))
>> > +    return LONG_MIN;
>> > +  /* We have checked all conditions, any overflow in addtion returns
>>
>> s/addtion/addition/
>
> fixed in v3
>
>>
>> The code returns all test passes on evergreen using the piglit unit
>> tests for mad_sat (all data types) after I applied your LLVM
>> workaround.
>>
>> Otherwise, I'm trying to find out if there's any redundant boolean
>> logic in the above bits. Especially I'm wondering if we need to keep
>> checking if hi is greater than, less than, equal to, or not equal to 0
>> and -1, or if we can just pull that part out to a quick overflow test
>> at the beginning.
>
> I've tried to combine the MIN/MAX branches, but the problem is that not
> all overflows are equal. Some sign bit overflows (hi is 0 or -1), can be
> 'corrected' by the follow up addition, so I decided to make the code
> more readable and keep the cases separate.
>
>>
>> Either way, I've successfully tested this version of the code with
>> your LLVM FlattenCFG.cpp patch and gotten successful unit test passes
>> on CEDAR (Radeon 5400).  I believe that radeonsi will probably still
>> fail due to the ulong instruction selection issue that I noted
>> yesterday (unless the FlattenCFG change also affects this in a
>> slightly different way), but that doesn't seem like an issue with this
>> patch so much as the back-end.
>
>
> thanks for testing,
> jan
>
>>
>> --Aaron
>>
>>
>> > +   * the correct value */
>> > +  return ulo + z;
>> > +}
>> > +
>> > +_CLC_OVERLOAD _CLC_DEF ulong mad_sat(ulong x, ulong y, ulong z) {
>> > +  if (mul_hi(x, y) != 0)
>> > +    return ULONG_MAX;
>> > +  return add_sat(x * y, z);
>> > +}
>> > +
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, mad_sat, char, char, char)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, mad_sat, uchar, uchar, uchar)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, mad_sat, short, short, short)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, mad_sat, ushort, ushort, ushort)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, mad_sat, int, int, int)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, mad_sat, uint, uint, uint)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, mad_sat, long, long, long)
>> > +_CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, mad_sat, ulong, ulong, ulong)
>> > --
>> > 1.9.3
>> >
>
> --
> Jan Vesely <jan.vesely at rutgers.edu>