[Libclc-dev] [PATCH 2/3] Implement async_work_group_copy builtin

Jeroen Ketema j.ketema at imperial.ac.uk
Sat Aug 9 13:42:44 PDT 2014


On 08 Aug 2014, at 22:40, Tom Stellard <thomas.stellard at amd.com> wrote:

> This is a simple implementation which just copies data synchronously.
> ---
> generic/include/clc/async/async_work_group_copy.h   | 15 +++++++++++++++
> generic/include/clc/async/async_work_group_copy.inc |  5 +++++
> generic/include/clc/clc.h                           |  1 +
> generic/lib/SOURCES                                 |  1 +
> generic/lib/async/async_work_group_copy.cl          | 21 +++++++++++++++++++++
> generic/lib/async/async_work_group_copy.inc         | 16 ++++++++++++++++
> 6 files changed, 59 insertions(+)
> create mode 100644 generic/include/clc/async/async_work_group_copy.h
> create mode 100644 generic/include/clc/async/async_work_group_copy.inc
> create mode 100644 generic/lib/async/async_work_group_copy.cl
> create mode 100644 generic/lib/async/async_work_group_copy.inc
> 
> diff --git a/generic/include/clc/async/async_work_group_copy.h b/generic/include/clc/async/async_work_group_copy.h
> new file mode 100644
> index 0000000..39c637b
> --- /dev/null
> +++ b/generic/include/clc/async/async_work_group_copy.h
> @@ -0,0 +1,15 @@
> +#define __CLC_DST_ADDR_SPACE local
> +#define __CLC_SRC_ADDR_SPACE global
> +#define __CLC_BODY <clc/async/async_work_group_copy.inc>
> +#include <clc/async/gentype.inc>
> +#undef __CLC_DST_ADDR_SPACE
> +#undef __CLC_SRC_ADDR_SPACE
> +#undef __CLC_BODY
> +
> +#define __CLC_DST_ADDR_SPACE global
> +#define __CLC_SRC_ADDR_SPACE local
> +#define __CLC_BODY <clc/async/async_work_group_copy.inc>
> +#include <clc/async/gentype.inc>
> +#undef __CLC_DST_ADDR_SPACE
> +#undef __CLC_SRC_ADDR_SPACE
> +#undef __CLC_BODY
> diff --git a/generic/include/clc/async/async_work_group_copy.inc b/generic/include/clc/async/async_work_group_copy.inc
> new file mode 100644
> index 0000000..d85df6c
> --- /dev/null
> +++ b/generic/include/clc/async/async_work_group_copy.inc
> @@ -0,0 +1,5 @@
> +_CLC_OVERLOAD _CLC_DECL event_t async_work_group_copy(
> +  __CLC_DST_ADDR_SPACE __CLC_GENTYPE *dst,
> +  const __CLC_SRC_ADDR_SPACE __CLC_GENTYPE *src,
> +  size_t num_gentypes,
> +  event_t event);
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index f499e6d..ed741b1 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -125,6 +125,7 @@
> #include <clc/synchronization/barrier.h>
> 
> /* 6.11.10 Async Copy and Prefetch Functions */
> +#include <clc/async/async_work_group_copy.h>
> #include <clc/async/prefetch.h>
> #include <clc/async/wait_group_events.h>
> 
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index 3e847fe..e7dbca5 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -1,3 +1,4 @@
> +async/async_work_group_copy.cl
> async/prefetch.cl
> async/wait_group_events.cl
> atomic/atomic_impl.ll
> diff --git a/generic/lib/async/async_work_group_copy.cl b/generic/lib/async/async_work_group_copy.cl
> new file mode 100644
> index 0000000..31c71d6
> --- /dev/null
> +++ b/generic/lib/async/async_work_group_copy.cl
> @@ -0,0 +1,21 @@
> +#include <clc/clc.h>
> +
> +#ifdef cl_khr_fp64
> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
> +#endif
> +
> +#define __CLC_DST_ADDR_SPACE local
> +#define __CLC_SRC_ADDR_SPACE global
> +#define __CLC_BODY <async_work_group_copy.inc>
> +#include <clc/async/gentype.inc>
> +#undef __CLC_DST_ADDR_SPACE
> +#undef __CLC_SRC_ADDR_SPACE
> +#undef __CLC_BODY
> +
> +#define __CLC_DST_ADDR_SPACE global
> +#define __CLC_SRC_ADDR_SPACE local
> +#define __CLC_BODY <async_work_group_copy.inc>
> +#include <clc/async/gentype.inc>
> +#undef __CLC_DST_ADDR_SPACE
> +#undef __CLC_SRC_ADDR_SPACE
> +#undef __CLC_BODY
> diff --git a/generic/lib/async/async_work_group_copy.inc b/generic/lib/async/async_work_group_copy.inc
> new file mode 100644
> index 0000000..dd3db3f
> --- /dev/null
> +++ b/generic/lib/async/async_work_group_copy.inc
> @@ -0,0 +1,16 @@
> +_CLC_OVERLOAD _CLC_DEF event_t async_work_group_copy(
> +    __CLC_DST_ADDR_SPACE __CLC_GENTYPE *dst,
> +    const __CLC_SRC_ADDR_SPACE __CLC_GENTYPE *src,
> +    size_t num_gentypes,
> +    event_t event) {
> +
> +  // __builtin_memcpy doesn't work with address spaces, so we need to
> +  // implement the copy using a loop.
> +
> +  unsigned i;
> +  for (i = 0; i < num_gentypes; ++i) {
> +    dst[i] = src[i];
> +  }

If I understand this correctly, this lets every thread in the workgroup do the copy.
So this code has data races if executed by more than one thread. OpenCL 1.2/1.1
does not say anything about the behaviour of racy code, so I’m not sure whether
the behaviour of this code is properly defined. If the intention is to eventually support
OpenCL 2.0, then the behaviour of this code is definitely undefined (due to the data races).

Jeroen

> +
> +  return event;
> +}
> -- 
> 1.8.1.5
> 
> 
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev





More information about the Libclc-dev mailing list