[Libclc-dev] [PATCH] libclc: Revert generic vload/vstore to pure CLC and move asm to R600

Mon Jul 8 13:59:21 PDT 2013

Trying again... first response got caught in moderator queue because
it was > 40Kb....

Always with the good questions...  :)

I used the add_sat/sub_sat function implementations as a source
pattern, which was where I originally got the interface/implementation
separation from.  Also, being a Java dev by day, it felt natural to
continue that existing pattern.  My motivation was mostly just
maintaining the status quo, so you won't find much resistance to
change here on my part.

>From a functional standpoint, the only thing that it currently gives
us is a bit more separation between the high-level and low-level
implementation details.  In the case of load/stores, we define
separate loads for e.g.:
int4 = vload4(0, global int* in);
uint4 = vload4(0, global uint* in);

which get redirected to:
define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 addrspace(1)*
nocapture %addr)
define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 addrspace(1)*
nocapture %addr)

which both get wrapped to:
define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32
addrspace(1)* nocapture %addr)

I guess the only time that this would actually make sense would be if
you had multiple chips/architectures which used the same assembly
prototypes and could swap out the implementation details based on the
chip (treating *_if.ll as headers essentially)... but we'd probably
just do that in the target/architecture-specific directories of libclc
anyway.

If you want, I'll strip out the _if.ll files and re-spin the patch to
make things simpler.  Given that the functions themselves were marked
'alwaysinline', I don't think the generated code should be any
different.

On Mon, Jul 8, 2013 at 12:38 PM, Tom Stellard <tom at stellard.net> wrote:
> On Mon, Jul 01, 2013 at 06:06:39PM -0500, Aaron Watry wrote:
>> The assembly optimizations were making unsafe assumptions about which address
>> spaces had which identifiers.
>>
>> Also, fix vload/vstore with 64-bit pointers. This was broken previously on
>> Radeon SI.
>>
>> Signed-off-by: Aaron Watry <awatry at gmail.com>
>> ---
>>  generic/lib/SOURCES               |   4 --
>>  generic/lib/shared/vload.cl       |  54 +------------------
>>  generic/lib/shared/vload_if.ll    |  60 ---------------------
>>  generic/lib/shared/vload_impl.ll  |  49 -----------------
>>  generic/lib/shared/vstore.cl      |  58 +-------------------
>>  generic/lib/shared/vstore_if.ll   |  59 ---------------------
>>  generic/lib/shared/vstore_impl.ll |  50 ------------------
>>  r600/lib/SOURCES                  |   6 +++
>>  r600/lib/shared/vload.cl          |  99 ++++++++++++++++++++++++++++++++++
>>  r600/lib/shared/vload_if.ll       |  60 +++++++++++++++++++++
>>  r600/lib/shared/vload_impl.ll     |  44 ++++++++++++++++
>>  r600/lib/shared/vstore.cl         | 108 ++++++++++++++++++++++++++++++++++++++
>>  r600/lib/shared/vstore_if.ll      |  59 +++++++++++++++++++++
>>  r600/lib/shared/vstore_impl.ll    |  45 ++++++++++++++++
>>  14 files changed, 425 insertions(+), 330 deletions(-)
>>  delete mode 100644 generic/lib/shared/vload_if.ll
>>  delete mode 100644 generic/lib/shared/vload_impl.ll
>>  delete mode 100644 generic/lib/shared/vstore_if.ll
>>  delete mode 100644 generic/lib/shared/vstore_impl.ll
>>  create mode 100644 r600/lib/shared/vload.cl
>>  create mode 100644 r600/lib/shared/vload_if.ll
>>  create mode 100644 r600/lib/shared/vload_impl.ll
>>  create mode 100644 r600/lib/shared/vstore.cl
>>  create mode 100644 r600/lib/shared/vstore_if.ll
>>  create mode 100644 r600/lib/shared/vstore_impl.ll
>>
>> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
>> index 8cda14a..50cc9bd 100644
>> --- a/generic/lib/SOURCES
>> +++ b/generic/lib/SOURCES
>> @@ -24,10 +24,6 @@ shared/clamp.cl
>>  shared/max.cl
>>  shared/min.cl
>>  shared/vload.cl
>> -shared/vload_if.ll
>> -shared/vload_impl.ll
>>  shared/vstore.cl
>> -shared/vstore_if.ll
>> -shared/vstore_impl.ll
>>  workitem/get_global_id.cl
>>  workitem/get_global_size.cl
>> diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
>> index f6ebd37..e8439e7 100644
>> --- a/generic/lib/shared/vload.cl
>> +++ b/generic/lib/shared/vload.cl
>> @@ -27,12 +27,13 @@
>>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
>>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
>>
>> -//int/uint are special... see below
>>  #define VLOAD_TYPES() \
>>      VLOAD_ADDR_SPACES(char) \
>>      VLOAD_ADDR_SPACES(uchar) \
>>      VLOAD_ADDR_SPACES(short) \
>>      VLOAD_ADDR_SPACES(ushort) \
>> +    VLOAD_ADDR_SPACES(int) \
>> +    VLOAD_ADDR_SPACES(uint) \
>>      VLOAD_ADDR_SPACES(long) \
>>      VLOAD_ADDR_SPACES(ulong) \
>>      VLOAD_ADDR_SPACES(float) \
>> @@ -43,54 +44,3 @@ VLOAD_TYPES()
>>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>>      VLOAD_ADDR_SPACES(double)
>>  #endif
>> -
>> -VLOAD_VECTORIZE(int, __private)
>> -VLOAD_VECTORIZE(int, __local)
>> -VLOAD_VECTORIZE(int, __constant)
>> -VLOAD_VECTORIZE(uint, __private)
>> -VLOAD_VECTORIZE(uint, __local)
>> -VLOAD_VECTORIZE(uint, __constant)
>> -
>> -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
>> -  return (int2)(x[offset] , x[offset+1]);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
>> -  return (int3)(vload2(offset, x), x[offset+2]);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
>> -  return (uint2)(x[offset] , x[offset+1]);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
>> -  return (uint3)(vload2(offset, x), x[offset+2]);
>> -}
>> -
>> -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
>> - * they aren't actually overridden here
>> - */
>> -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
>> -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
>> -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
>> -
>> -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
>> -  return __clc_vload4_int__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
>> -  return __clc_vload8_int__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
>> -  return __clc_vload16_int__global(offset, x);
>> -}
>> -
>> -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
>> -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
>> -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
>> -
>> -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
>> -  return __clc_vload4_uint__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
>> -  return __clc_vload8_uint__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
>> -  return __clc_vload16_uint__global(offset, x);
>> -}
>> \ No newline at end of file
>> diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
>> deleted file mode 100644
>> index 2634d37..0000000
>> --- a/generic/lib/shared/vload_if.ll
>> +++ /dev/null
>> @@ -1,60 +0,0 @@
>> -;Start int global vload
>> -
>> -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
>> -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
>> -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
>> -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
>> -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
>> -
>> -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
>> -  ret <2 x i32> %call
>> -}
>> -
>> -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
>> -  ret <3 x i32> %call
>> -}
>> -
>> -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
>> -  ret <4 x i32> %call
>> -}
>> -
>> -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
>> -  ret <8 x i32> %call
>> -}
>> -
>> -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
>> -  ret <16 x i32> %call
>> -}
>> -
>> -
>> -;Start uint global vload
>> -
>> -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
>> -  ret <2 x i32> %call
>> -}
>> -
>> -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
>> -  ret <3 x i32> %call
>> -}
>> -
>> -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
>> -  ret <4 x i32> %call
>> -}
>> -
>> -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
>> -  ret <8 x i32> %call
>> -}
>> -
>> -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
>> -  ret <16 x i32> %call
>> -}
>> diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
>> deleted file mode 100644
>> index ae719e0..0000000
>> --- a/generic/lib/shared/vload_impl.ll
>> +++ /dev/null
>> @@ -1,49 +0,0 @@
>> -; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
>> -
>> -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
>> -  %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <2 x i32> %4
>> -}
>> -
>> -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
>> -  %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <3 x i32> %4
>> -}
>> -
>> -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
>> -  %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <4 x i32> %4
>> -}
>> -
>> -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
>> -  %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <8 x i32> %4
>> -}
>> -
>> -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
>> -  %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <16 x i32> %4
>> -}
>> -
>> -!1 = metadata !{metadata !"char", metadata !5}
>> -!2 = metadata !{metadata !"short", metadata !5}
>> -!3 = metadata !{metadata !"int", metadata !5}
>> -!4 = metadata !{metadata !"long", metadata !5}
>> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
>> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
>> -

<snipped the vstore implementation because of message size limits>

>> 1.8.1.2
>>
>>
>> _______________________________________________
>> Libclc-dev mailing list
>> Libclc-dev at pcc.me.uk
>> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev