[Libclc-dev] [PATCH] Add mul_hi implementation
Aaron Watry
awatry at gmail.com
Tue Aug 13 11:02:35 PDT 2013
Just a note on this one:
I've tested the char/uchar/short/ushort types successfully on R600
(Cedar), but the int/uint/long/ulong versions fail. I am 99% sure that
the failures are due to deficiencies in the R600 back-end (i.e. it does
not handle long arithmetic correctly), and not in the code itself
(I've tested compiled C versions of it).
--Aaron
On Tue, Aug 13, 2013 at 12:59 PM, Aaron Watry <awatry at gmail.com> wrote:
> Everything except long/ulong is handled by just casting to the next larger type,
> doing the math and then shifting/casting the result.
>
> For 64-bit types, we split each operand into its high and low 32-bit halves and do
> a FOIL-based (First, Outer, Inner, Last) multiplication. The algorithm originally came
> from StackOverflow and was modified for CL-based purposes and extended to handle ulong.
>
> If we have concerns about the source, we can use the POCL implementation
> instead since I believe that also has a compatible license declaration.
> ---
> generic/include/clc/clc.h | 1 +
> generic/include/clc/integer/mul_hi.h | 2 +
> generic/include/clc/integer/mul_hi.inc | 1 +
> generic/lib/SOURCES | 1 +
> generic/lib/integer/mul_hi.cl | 126 +++++++++++++++++++++++++++++++++
> 5 files changed, 131 insertions(+)
> create mode 100644 generic/include/clc/integer/mul_hi.h
> create mode 100644 generic/include/clc/integer/mul_hi.inc
> create mode 100644 generic/lib/integer/mul_hi.cl
>
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index b906245..bc873c3 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -67,6 +67,7 @@
> #include <clc/integer/clz.h>
> #include <clc/integer/mad24.h>
> #include <clc/integer/mul24.h>
> +#include <clc/integer/mul_hi.h>
> #include <clc/integer/rotate.h>
> #include <clc/integer/sub_sat.h>
> #include <clc/integer/upsample.h>
> diff --git a/generic/include/clc/integer/mul_hi.h b/generic/include/clc/integer/mul_hi.h
> new file mode 100644
> index 0000000..27b95d8
> --- /dev/null
> +++ b/generic/include/clc/integer/mul_hi.h
> @@ -0,0 +1,2 @@
> +#define __CLC_BODY <clc/integer/mul_hi.inc>
> +#include <clc/integer/gentype.inc>
> diff --git a/generic/include/clc/integer/mul_hi.inc b/generic/include/clc/integer/mul_hi.inc
> new file mode 100644
> index 0000000..ce9e5c0
> --- /dev/null
> +++ b/generic/include/clc/integer/mul_hi.inc
> @@ -0,0 +1 @@
> +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul_hi(__CLC_GENTYPE x, __CLC_GENTYPE y);
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index 9ac08bd..b8322f2 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -13,6 +13,7 @@ integer/clz_if.ll
> integer/clz_impl.ll
> integer/mad24.cl
> integer/mul24.cl
> +integer/mul_hi.cl
> integer/rotate.cl
> integer/sub_sat.cl
> integer/sub_sat_if.ll
> diff --git a/generic/lib/integer/mul_hi.cl b/generic/lib/integer/mul_hi.cl
> new file mode 100644
> index 0000000..53386af
> --- /dev/null
> +++ b/generic/lib/integer/mul_hi.cl
> @@ -0,0 +1,126 @@
> +#include <clc/clc.h>
> +
> +//For all types EXCEPT long, which is implemented separately
> +#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
> + _CLC_OVERLOAD _CLC_DECL GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
> + return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
> + } \
> +
> +//FOIL-based long mul_hi
> +//
> +// Original Source: http://stackoverflow.com/a/1546152
> +//
> +// Summary: Treat mul_hi(long x, long y) as the high half of:
> +// (a*2^32 + b) * (c*2^32 + d) where a and c are the high-order 32 bits of x and y
> +// respectively, and b and d are the low-order 32 bits of x and y.
> +// Thinking back to algebra, we use FOIL (First, Outer, Inner, Last) to do the work.
> +#define __CLC_B32 0xffffffffUL
> +
> +inline void __clc_mul_hi_mixed(ulong *result, ulong innerTerm)
> +{
> + // the high part of innerTerm is actually the easy part
> + result[1] += innerTerm >> 32;
> +
> + // the low order a*d might carry out of the low order result
> + ulong was = result[0];
> +
> + result[0] += (innerTerm & __CLC_B32) << 32;
> +
> + if (result[0] < was) // carry!
> + ++result[1];
> +}
> +
> +inline ulong __clc_mul_hi_negate(ulong *result)
> +{
> + ulong t = result[0] = ~result[0];
> + result[1] = ~result[1];
> + if (++result[0] < t)
> + ++result[1];
> + return result[1];
> +}
> +
> +_CLC_OVERLOAD _CLC_DECL long mul_hi(long sx, long sy){
> + ulong x, y, result[2] = { 0 }, a, b, c, d;
> +
> + x = (ulong)abs(sx);
> + y = (ulong)abs(sy);
> +
> + a = x >> 32;
> + b = x & __CLC_B32;
> + c = y >> 32;
> + d = y & __CLC_B32;
> +
> + // the highest and lowest order terms are easy
> + result[1] = a * c;
> + result[0] = b * d;
> +
> + // now have the mixed terms ad + bc to worry about
> + __clc_mul_hi_mixed(result, a * d);
> + __clc_mul_hi_mixed(result, b * c);
> +
> + // now deal with the sign
> + long result_negative = sx < 0 ^ sy < 0;
> + return (long)(result_negative ? __clc_mul_hi_negate(result) : result[1]);
> +}
> +
> +_CLC_OVERLOAD _CLC_DECL ulong mul_hi(ulong sx, ulong sy){
> + ulong x, y, result[2] = { 0, 0 }, a, b, c, d;
> +
> + x = sx;
> + y = sy;
> +
> + a = x >> 32;
> + b = x & __CLC_B32;
> + c = y >> 32;
> + d = y & __CLC_B32;
> +
> + // the highest and lowest order terms are easy
> + result[1] = a * c;
> + result[0] = b * d;
> +
> + // now have the mixed terms ad + bc to worry about
> + __clc_mul_hi_mixed(result, a * d);
> + __clc_mul_hi_mixed(result, b * c);
> +
> + // no sign to deal with, just return the high half
> + return result[1];
> +}
> +
> +#define __CLC_MUL_HI_VEC(GENTYPE) \
> + _CLC_OVERLOAD _CLC_DECL GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
> + return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \
> + } \
> + _CLC_OVERLOAD _CLC_DECL GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
> + return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \
> + } \
> + _CLC_OVERLOAD _CLC_DECL GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
> + return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
> + } \
> + _CLC_OVERLOAD _CLC_DECL GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
> + return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
> + } \
> + _CLC_OVERLOAD _CLC_DECL GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
> + return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
> + } \
> +
> +#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
> + __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
> + __CLC_MUL_HI_VEC(TYPE)
> +
> +#define __CLC_MUL_HI_TYPES() \
> + __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
> + __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
> + __CLC_MUL_HI_DEC_IMPL(int, short, 16) \
> + __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
> + __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
> + __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
> + __CLC_MUL_HI_VEC(long) \
> + __CLC_MUL_HI_VEC(ulong)
> +
> +__CLC_MUL_HI_TYPES()
> +
> +#undef __CLC_MUL_HI_TYPES
> +#undef __CLC_MUL_HI_DEC_IMPL
> +#undef __CLC_MUL_HI_IMPL
> +#undef __CLC_MUL_HI_VEC
> +#undef __CLC_B32
> --
> 1.8.1.2
>
More information about the Libclc-dev
mailing list