[Libclc-dev] [PATCH 7/7] Add vstore_half_rte implementation

Wed Jan 31 19:35:35 PST 2018

On Wed, 2018-01-31 at 23:11 +0100, Jeroen Ketema via Libclc-dev wrote:
> The comments on this patch set look a bit funny to me (multiline with
> stars as the left margin, don’t see much of that in LLVM).

These follow other longer comments (such as in erf.cl/erfc.cl). LLVM is
in C++, so it prefers C++-style comments. I agree that the libclc code
style is inconsistent and could use a cleanup, but that's for another day.

> Otherwise, LGTM,

Thanks.

> although I cannot possibly comment on the implementations of the
> rounding functions (I assume these are pretty much standard?).

It's my own implementation. _rtz and _rti round to the nearest
half-representable float in that direction (except for high-order _rti,
which relies on the half conversion to convert to Inf).

_rtn/_rtp then just choose from the above two based on sign.

_rte pretty much replicates most of the work, and the last/grs bits follow
the standard (hence the extra comments). I assume that the compiler is able
to remove most of the duplication.

regards,
Jan

> 
> Jeroen
> 
> > On 29 Jan 2018, at 01:07, Jan Vesely via Libclc-dev <libclc-dev at lists.llvm.org> wrote:
> > 
> > Passes CTS on carrizo
> > Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
> > ---
> > generic/include/clc/shared/vstore.h |  2 ++
> > generic/lib/shared/vstore.cl        | 45 ++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 46 insertions(+), 1 deletion(-)
> > 
> > diff --git a/generic/include/clc/shared/vstore.h b/generic/include/clc/shared/vstore.h
> > index b510e0a..ebad330 100644
> > --- a/generic/include/clc/shared/vstore.h
> > +++ b/generic/include/clc/shared/vstore.h
> > @@ -40,6 +40,7 @@ _CLC_VECTOR_VSTORE_HALF_PRIM1(float,)
> > _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtz)
> > _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtn)
> > _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtp)
> > +_CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rte)
> > 
> > #ifdef cl_khr_fp64
> >   _CLC_VECTOR_VSTORE_PRIM1(double)
> > @@ -47,6 +48,7 @@ _CLC_VECTOR_VSTORE_HALF_PRIM1(float, _rtp)
> >   _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtz)
> >   _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtn)
> >   _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rtp)
> > +  _CLC_VECTOR_VSTORE_HALF_PRIM1(double, _rte)
> > #endif
> > 
> > #ifdef cl_khr_fp16
> > diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
> > index 2bfb369..c035095 100644
> > --- a/generic/lib/shared/vstore.cl
> > +++ b/generic/lib/shared/vstore.cl
> > @@ -147,6 +147,27 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
> > {
> > 	return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
> > }
> > +_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
> > +{
> > +	/* Mantisa + implicit bit */
> > +	const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
> > +	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
> > +	int shift = 13;
> > +	if (exp < -14) {
> > +		/* The default assumes lower 13 bits are rounded,
> > +		 * but it might be more for denormals.
> > +		 * Shifting beyond last == 0b, and qr == 00b is not necessary */
> > +		shift += min(-(exp + 14), 15);
> > +	}
> > +	int mask = (1 << shift) - 1;
> > +	const uint grs = mantissa & mask;
> > +	const uint last = mantissa & (1 << shift);
> > +	/* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
> > +	 * exp > 15 should round to inf. */
> > +	bool roundup = (grs > (1 << (shift - 1))) ||
> > +		(grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
> > +	return roundup ? __clc_rti(x) : __clc_rtz(x);
> > +}
> > 
> > #ifdef cl_khr_fp64
> > _CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
> > @@ -192,13 +213,35 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
> > {
> > 	return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : __clc_rtz(x);
> > }
> > +_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
> > +{
> > +	/* Mantisa + implicit bit */
> > +	const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
> > +	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
> > +	int shift = 42;
> > +	if (exp < -14) {
> > +		/* The default assumes lower 13 bits are rounded,
> > +		 * but it might be more for denormals.
> > +		 * Shifting beyond last == 0b, and qr == 00b is not necessary */
> > +		shift += min(-(exp + 14), 15);
> > +	}
> > +	ulong mask = (1UL << shift) - 1UL;
> > +	const ulong grs = mantissa & mask;
> > +	const ulong last = mantissa & (1UL << shift);
> > +	/* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
> > +	 * exp > 15 should round to inf. */
> > +	bool roundup = (grs > (1UL << (shift - 1UL))) ||
> > +		(grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
> > +	return roundup ? __clc_rti(x) : __clc_rtz(x);
> > +}
> > #endif
> > 
> > #define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
> > 	__FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
> > 	__FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
> > 	__FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
> > -	__FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp)
> > +	__FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
> > +	__FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)
> > 
> > #define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
> > 	__XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)
> > -- 
> > 2.14.3
> > 
> > _______________________________________________
> > Libclc-dev mailing list
> > Libclc-dev at lists.llvm.org
> > http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev
> 
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev

-- 
Jan Vesely <jan.vesely at rutgers.edu>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: This is a digitally signed message part
URL: <http://lists.llvm.org/pipermail/libclc-dev/attachments/20180131/d61a1111/attachment.sig>