[Libclc-dev] [PATCH] libclc: Revert generic vload/vstore to pure CLC and move asm to R600

Tom Stellard tom at stellard.net
Mon Jul 8 14:38:13 PDT 2013


On Mon, Jul 08, 2013 at 03:59:21PM -0500, Aaron Watry wrote:
> Trying again... first response got caught in moderator queue because
> it was > 40Kb....
> 
> Always with the good questions...  :)
> 
> I used the add_sat/sub_sat function implementations as a source
> pattern, which was where I originally got the interface/implementation
> separation from.  Also, being a Java dev by day, it felt natural to
> continue that existing pattern.  My motivation was mostly just
> maintaining the status quo, so you won't find much resistance to
> change here on my part.
> 
> From a functional standpoint, the only thing that it currently gives
> us is a bit more separation between the high-level and low-level
> implementation details.  In the case of load/stores, we define
> separate loads for e.g.:
> int4 = vload4(0, global int* in);
> uint4 = vload4(0, global uint* in);
> 
> which get redirected to:
> define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 addrspace(1)*
> nocapture %addr)
> define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 addrspace(1)*
> nocapture %addr)
> 
> which both get wrapped to:
> define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32
> addrspace(1)* nocapture %addr)
> 
> 
> I guess the only time that this would actually make sense would be if
> you had multiple chips/architectures which used the same assembly
> prototypes and could swap out the implementation details based on the
> chip (treating *_if.ll as headers essentially)... but we'd probably
> just do that in the target/architecture-specific directories of libclc
> anyway.
>

What I think would make sense is to move v{load,store}_impl.ll back
into the generic directory and rename the functions based on llvm address
space, rather than opencl address space (e.g.
__clc_vload4_impl_i32__addr1 instead of __clc_vload4_impl_i32__global)
and then have the r600 version of v{load,store}.cl bypass vload_if.ll
entirely and call the __clc_vload4_impl_i32__addr1 and related function
directly.

This way, the optimized version could be reused by multiple targets,
even if addrspace 1 is global memory on one and constant memory on
another.

What do you think?

-Tom

> If you want, I'll strip out the _if.ll files and re-spin the patch to
> make things simpler.  Given that the functions themselves were marked
> 'alwaysinline', I don't think the generated code should be any
> different.
> 
> On Mon, Jul 8, 2013 at 12:38 PM, Tom Stellard <tom at stellard.net> wrote:
> > On Mon, Jul 01, 2013 at 06:06:39PM -0500, Aaron Watry wrote:
> >> The assembly optimizations were making unsafe assumptions about which address
> >> spaces had which identifiers.
> >>
> >> Also, fix vload/vstore with 64-bit pointers. This was broken previously on
> >> Radeon SI.
> >>
> >> Signed-off-by: Aaron Watry <awatry at gmail.com>
> >> ---
> >>  generic/lib/SOURCES               |   4 --
> >>  generic/lib/shared/vload.cl       |  54 +------------------
> >>  generic/lib/shared/vload_if.ll    |  60 ---------------------
> >>  generic/lib/shared/vload_impl.ll  |  49 -----------------
> >>  generic/lib/shared/vstore.cl      |  58 +-------------------
> >>  generic/lib/shared/vstore_if.ll   |  59 ---------------------
> >>  generic/lib/shared/vstore_impl.ll |  50 ------------------
> >>  r600/lib/SOURCES                  |   6 +++
> >>  r600/lib/shared/vload.cl          |  99 ++++++++++++++++++++++++++++++++++
> >>  r600/lib/shared/vload_if.ll       |  60 +++++++++++++++++++++
> >>  r600/lib/shared/vload_impl.ll     |  44 ++++++++++++++++
> >>  r600/lib/shared/vstore.cl         | 108 ++++++++++++++++++++++++++++++++++++++
> >>  r600/lib/shared/vstore_if.ll      |  59 +++++++++++++++++++++
> >>  r600/lib/shared/vstore_impl.ll    |  45 ++++++++++++++++
> >>  14 files changed, 425 insertions(+), 330 deletions(-)
> >>  delete mode 100644 generic/lib/shared/vload_if.ll
> >>  delete mode 100644 generic/lib/shared/vload_impl.ll
> >>  delete mode 100644 generic/lib/shared/vstore_if.ll
> >>  delete mode 100644 generic/lib/shared/vstore_impl.ll
> >>  create mode 100644 r600/lib/shared/vload.cl
> >>  create mode 100644 r600/lib/shared/vload_if.ll
> >>  create mode 100644 r600/lib/shared/vload_impl.ll
> >>  create mode 100644 r600/lib/shared/vstore.cl
> >>  create mode 100644 r600/lib/shared/vstore_if.ll
> >>  create mode 100644 r600/lib/shared/vstore_impl.ll
> >>
> >> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> >> index 8cda14a..50cc9bd 100644
> >> --- a/generic/lib/SOURCES
> >> +++ b/generic/lib/SOURCES
> >> @@ -24,10 +24,6 @@ shared/clamp.cl
> >>  shared/max.cl
> >>  shared/min.cl
> >>  shared/vload.cl
> >> -shared/vload_if.ll
> >> -shared/vload_impl.ll
> >>  shared/vstore.cl
> >> -shared/vstore_if.ll
> >> -shared/vstore_impl.ll
> >>  workitem/get_global_id.cl
> >>  workitem/get_global_size.cl
> >> diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
> >> index f6ebd37..e8439e7 100644
> >> --- a/generic/lib/shared/vload.cl
> >> +++ b/generic/lib/shared/vload.cl
> >> @@ -27,12 +27,13 @@
> >>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
> >>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
> >>
> >> -//int/uint are special... see below
> >>  #define VLOAD_TYPES() \
> >>      VLOAD_ADDR_SPACES(char) \
> >>      VLOAD_ADDR_SPACES(uchar) \
> >>      VLOAD_ADDR_SPACES(short) \
> >>      VLOAD_ADDR_SPACES(ushort) \
> >> +    VLOAD_ADDR_SPACES(int) \
> >> +    VLOAD_ADDR_SPACES(uint) \
> >>      VLOAD_ADDR_SPACES(long) \
> >>      VLOAD_ADDR_SPACES(ulong) \
> >>      VLOAD_ADDR_SPACES(float) \
> >> @@ -43,54 +44,3 @@ VLOAD_TYPES()
> >>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> >>      VLOAD_ADDR_SPACES(double)
> >>  #endif
> >> -
> >> -VLOAD_VECTORIZE(int, __private)
> >> -VLOAD_VECTORIZE(int, __local)
> >> -VLOAD_VECTORIZE(int, __constant)
> >> -VLOAD_VECTORIZE(uint, __private)
> >> -VLOAD_VECTORIZE(uint, __local)
> >> -VLOAD_VECTORIZE(uint, __constant)
> >> -
> >> -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
> >> -  return (int2)(x[offset] , x[offset+1]);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
> >> -  return (int3)(vload2(offset, x), x[offset+2]);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
> >> -  return (uint2)(x[offset] , x[offset+1]);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
> >> -  return (uint3)(vload2(offset, x), x[offset+2]);
> >> -}
> >> -
> >> -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
> >> - * they aren't actually overridden here
> >> - */
> >> -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
> >> -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
> >> -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
> >> -
> >> -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
> >> -  return __clc_vload4_int__global(offset, x);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
> >> -  return __clc_vload8_int__global(offset, x);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
> >> -  return __clc_vload16_int__global(offset, x);
> >> -}
> >> -
> >> -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
> >> -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
> >> -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
> >> -
> >> -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
> >> -  return __clc_vload4_uint__global(offset, x);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
> >> -  return __clc_vload8_uint__global(offset, x);
> >> -}
> >> -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
> >> -  return __clc_vload16_uint__global(offset, x);
> >> -}
> >> \ No newline at end of file
> >> diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
> >> deleted file mode 100644
> >> index 2634d37..0000000
> >> --- a/generic/lib/shared/vload_if.ll
> >> +++ /dev/null
> >> @@ -1,60 +0,0 @@
> >> -;Start int global vload
> >> -
> >> -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> >> -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> >> -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> >> -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> >> -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> >> -
> >> -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <2 x i32> %call
> >> -}
> >> -
> >> -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <3 x i32> %call
> >> -}
> >> -
> >> -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <4 x i32> %call
> >> -}
> >> -
> >> -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <8 x i32> %call
> >> -}
> >> -
> >> -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <16 x i32> %call
> >> -}
> >> -
> >> -
> >> -;Start uint global vload
> >> -
> >> -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <2 x i32> %call
> >> -}
> >> -
> >> -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <3 x i32> %call
> >> -}
> >> -
> >> -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <4 x i32> %call
> >> -}
> >> -
> >> -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <8 x i32> %call
> >> -}
> >> -
> >> -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> >> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> >> -  ret <16 x i32> %call
> >> -}
> >> diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
> >> deleted file mode 100644
> >> index ae719e0..0000000
> >> --- a/generic/lib/shared/vload_impl.ll
> >> +++ /dev/null
> >> @@ -1,49 +0,0 @@
> >> -; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
> >> -
> >> -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> >> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> >> -  %2 = add i32 %1, %offset
> >> -  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
> >> -  %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
> >> -  ret <2 x i32> %4
> >> -}
> >> -
> >> -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> >> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> >> -  %2 = add i32 %1, %offset
> >> -  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
> >> -  %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
> >> -  ret <3 x i32> %4
> >> -}
> >> -
> >> -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> >> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> >> -  %2 = add i32 %1, %offset
> >> -  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
> >> -  %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
> >> -  ret <4 x i32> %4
> >> -}
> >> -
> >> -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> >> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> >> -  %2 = add i32 %1, %offset
> >> -  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
> >> -  %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
> >> -  ret <8 x i32> %4
> >> -}
> >> -
> >> -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> >> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> >> -  %2 = add i32 %1, %offset
> >> -  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
> >> -  %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
> >> -  ret <16 x i32> %4
> >> -}
> >> -
> >> -!1 = metadata !{metadata !"char", metadata !5}
> >> -!2 = metadata !{metadata !"short", metadata !5}
> >> -!3 = metadata !{metadata !"int", metadata !5}
> >> -!4 = metadata !{metadata !"long", metadata !5}
> >> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
> >> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> >> -
> 
> <snipped the vstore implementation because of message size limits>
> 
> >> 1.8.1.2
> >>
> >>
> >> _______________________________________________
> >> Libclc-dev mailing list
> >> Libclc-dev at pcc.me.uk
> >> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev




More information about the Libclc-dev mailing list