[Libclc-dev] [PATCH] Add mul_hi implementation [v2]

Fri Aug 16 16:46:33 PDT 2013

On Thu, Aug 15, 2013 at 04:35:01PM -0500, Aaron Watry wrote:
> Everything except long/ulong is handled by just casting to the next larger type,
> doing the math and then shifting/casting the result.
> 
> For 64-bit types, we break the high/low parts of each operand apart, and do
> a FOIL-based multiplication.
> 

Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

> v2:
>   Discard the implementation adapted from Stack Overflow due to copyright concerns.
>   - The new implementation is still FOIL-based, but shares no code with the previous one.
> ---
>  generic/include/clc/clc.h              |   1 +
>  generic/include/clc/integer/mul_hi.h   |   2 +
>  generic/include/clc/integer/mul_hi.inc |   1 +
>  generic/lib/SOURCES                    |   1 +
>  generic/lib/integer/mul_hi.cl          | 109 +++++++++++++++++++++++++++++++++
>  5 files changed, 114 insertions(+)
>  create mode 100644 generic/include/clc/integer/mul_hi.h
>  create mode 100644 generic/include/clc/integer/mul_hi.inc
>  create mode 100644 generic/lib/integer/mul_hi.cl
> 
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index ae212ed..305f058 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -68,6 +68,7 @@
>  #include <clc/integer/hadd.h>
>  #include <clc/integer/mad24.h>
>  #include <clc/integer/mul24.h>
> +#include <clc/integer/mul_hi.h>
>  #include <clc/integer/rhadd.h>
>  #include <clc/integer/rotate.h>
>  #include <clc/integer/sub_sat.h>
> diff --git a/generic/include/clc/integer/mul_hi.h b/generic/include/clc/integer/mul_hi.h
> new file mode 100644
> index 0000000..27b95d8
> --- /dev/null
> +++ b/generic/include/clc/integer/mul_hi.h
> @@ -0,0 +1,2 @@
> +#define __CLC_BODY <clc/integer/mul_hi.inc>
> +#include <clc/integer/gentype.inc>
> diff --git a/generic/include/clc/integer/mul_hi.inc b/generic/include/clc/integer/mul_hi.inc
> new file mode 100644
> index 0000000..ce9e5c0
> --- /dev/null
> +++ b/generic/include/clc/integer/mul_hi.inc
> @@ -0,0 +1 @@
> +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul_hi(__CLC_GENTYPE x, __CLC_GENTYPE y);
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index 5a68d07..d6e2d8c 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -14,6 +14,7 @@ integer/clz_impl.ll
>  integer/hadd.cl
>  integer/mad24.cl
>  integer/mul24.cl
> +integer/mul_hi.cl
>  integer/rhadd.cl
>  integer/rotate.cl
>  integer/sub_sat.cl
> diff --git a/generic/lib/integer/mul_hi.cl b/generic/lib/integer/mul_hi.cl
> new file mode 100644
> index 0000000..4c02efb
> --- /dev/null
> +++ b/generic/lib/integer/mul_hi.cl
> @@ -0,0 +1,109 @@
> +#include <clc/clc.h>
> +
> +//Scalar mul_hi for every type except long/ulong: widen, multiply, keep the top half
> +#define __CLC_MUL_HI_IMPL(WIDE_T, T, NBITS) \
> +    _CLC_OVERLOAD _CLC_DECL T mul_hi(T x, T y){ \
> +        return (T)(((WIDE_T)x * (WIDE_T)y) >> NBITS); \
> +    }
> +
> +//FOIL-based long mul_hi
> +//
> +// Returns the upper 64 bits of the 128-bit product x * y. With
> +// x = x_hi * 2^32 + x_lo and y = y_hi * 2^32 + y_lo (high halves signed,
> +// low halves in [0, 2^32)), FOIL gives:
> +//   x * y = F * 2^64 + (O + I) * 2^32 + L
> +// where F = x_hi*y_hi, O = x_hi*y_lo, I = x_lo*y_hi and L = x_lo*y_lo.
> +
> +_CLC_OVERLOAD _CLC_DECL long mul_hi(long x, long y){
> +    long f, o, i;
> +    ulong l;
> +
> +    //Move the high/low halves of x/y into the lower 32-bits of variables.
> +    //The high halves keep their sign; the low halves are zero-extended.
> +    long x_hi = x >> 32;
> +    long x_lo = x & UINT_MAX;
> +    long y_hi = y >> 32;
> +    long y_lo = y & UINT_MAX;
> +
> +    //Multiply all of the components according to FOIL method
> +    f = x_hi * y_hi;
> +    o = x_hi * y_lo;
> +    i = x_lo * y_hi;
> +    //x_lo * y_lo can reach (2^32-1)^2 > LONG_MAX, so multiply as ulong:
> +    //a signed multiply would overflow, which is undefined behavior.
> +    l = (ulong)x_lo * (ulong)y_lo;
> +
> +    //Recombine: result = F + ((O + I + (L >> 32)) >> 32).
> +    //hadd(a, b) is (a + b) >> 1 without intermediate overflow, so halving
> +    //the sum first and then shifting by 31 equals the 32-bit shift while
> +    //keeping O + I + carry from wrapping.
> +    //I + (L >> 32) itself cannot overflow: it is at most 2^63 - 2^31.
> +    return (long)(f + (hadd(o, (i + (long)(l >> 32))) >> 31));
> +}
> +
> +_CLC_OVERLOAD _CLC_DECL ulong mul_hi(ulong x, ulong y){
> +    //Unsigned FOIL: with x = xh * 2^32 + xl and y = yh * 2^32 + yl,
> +    //  x * y = first * 2^64 + (outer + inner) * 2^32 + last
> +    //and the function returns the upper 64 bits of that product.
> +
> +    //Each half lives in the low 32 bits of a ulong, so none of the four
> +    //partial products below can wrap.
> +    const ulong xh = x >> 32;
> +    const ulong yh = y >> 32;
> +    const ulong xl = x & UINT_MAX;
> +    const ulong yl = y & UINT_MAX;
> +
> +    const ulong first = xh * yh;
> +    const ulong outer = xh * yl;
> +    const ulong inner = xl * yh;
> +    const ulong last  = xl * yl;
> +
> +    //Middle column: outer + inner + (carry out of last). hadd(a, b) is
> +    //(a + b) >> 1 computed without overflow, so the column sum is kept
> +    //halved to fit in 64 bits.
> +    const ulong carry = last >> 32;
> +    const ulong half_mid = hadd(outer, inner + carry);
> +
> +    //half_mid is already shifted right by one, so 31 more bits complete
> +    //the 32-bit shift that aligns the middle column with first.
> +    return first + (half_mid >> 31);
> +}
> +
> +//Vector mul_hi: 2/3-wide go per element, 4/8/16-wide recurse on the .lo/.hi halves
> +#define __CLC_MUL_HI_VEC(TYPE) \
> +    _CLC_OVERLOAD _CLC_DECL TYPE##2 mul_hi(TYPE##2 x, TYPE##2 y){ \
> +        return (TYPE##2)(mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)); \
> +    } \
> +    _CLC_OVERLOAD _CLC_DECL TYPE##3 mul_hi(TYPE##3 x, TYPE##3 y){ \
> +        return (TYPE##3)(mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)); \
> +    } \
> +    _CLC_OVERLOAD _CLC_DECL TYPE##4 mul_hi(TYPE##4 x, TYPE##4 y){ \
> +        return (TYPE##4)(mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)); \
> +    } \
> +    _CLC_OVERLOAD _CLC_DECL TYPE##8 mul_hi(TYPE##8 x, TYPE##8 y){ \
> +        return (TYPE##8)(mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)); \
> +    } \
> +    _CLC_OVERLOAD _CLC_DECL TYPE##16 mul_hi(TYPE##16 x, TYPE##16 y){ \
> +        return (TYPE##16)(mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)); \
> +    }
> +//Scalar overload plus vector overloads for one type
> +#define __CLC_MUL_HI_DEC_IMPL(WIDE_T, T, NBITS) \
> +    __CLC_MUL_HI_IMPL(WIDE_T, T, NBITS) \
> +    __CLC_MUL_HI_VEC(T)
> +
> +//8/16/32-bit types: (twice-as-wide type, type, width of type in bits)
> +__CLC_MUL_HI_DEC_IMPL(short, char, 8)
> +__CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8)
> +__CLC_MUL_HI_DEC_IMPL(int, short, 16)
> +__CLC_MUL_HI_DEC_IMPL(uint, ushort, 16)
> +__CLC_MUL_HI_DEC_IMPL(long, int, 32)
> +__CLC_MUL_HI_DEC_IMPL(ulong, uint, 32)
> +//long/ulong scalars are written out by hand above; only vectors are generated
> +__CLC_MUL_HI_VEC(long)
> +__CLC_MUL_HI_VEC(ulong)
> +
> +#undef __CLC_MUL_HI_DEC_IMPL
> +#undef __CLC_MUL_HI_IMPL
> +#undef __CLC_MUL_HI_VEC
> +//NOTE(review): __CLC_B32 is not defined in this file; this #undef looks like a leftover
> +#undef __CLC_B32
> -- 
> 1.8.1.2
> 
> 
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev