No subject

Tue Jun 18 14:47:27 PDT 2013

us is a bit more separation between the high-level and low-level
implementation details.  In the case of loads/stores, we define
separate loads, e.g.:
int4 = vload4(0, global int* in);
uint4 = vload4(0, global uint* in);

which get redirected to:
define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 addrspace(1)*
nocapture %addr)
define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 addrspace(1)*
nocapture %addr)

which are both thin wrappers around:
define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32
addrspace(1)* nocapture %addr)

I guess the only time this separation would actually make sense is if
you had multiple chips/architectures that used the same assembly
prototypes and could swap out the implementation details based on the
chip (essentially treating the *_if.ll files as headers)... but we'd
probably just do that in the target/architecture-specific directories
of libclc anyway.

If you want, I'll strip out the _if.ll files and re-spin the patch to
make things simpler.  Given that the functions themselves are marked
'alwaysinline', I don't think the generated code would be any
different.

On Mon, Jul 8, 2013 at 12:38 PM, Tom Stellard <tom at stellard.net> wrote:
> On Mon, Jul 01, 2013 at 06:06:39PM -0500, Aaron Watry wrote:
>> The assembly optimizations were making unsafe assumptions about which address
>> spaces had which identifiers.
>>
>> Also, fix vload/vstore with 64-bit pointers. This was broken previously on
>> Radeon SI.
>>
>> Signed-off-by: Aaron Watry <awatry at gmail.com>
>> ---
>>  generic/lib/SOURCES               |   4 --
>>  generic/lib/shared/vload.cl       |  54 +------------------
>>  generic/lib/shared/vload_if.ll    |  60 ---------------------
>>  generic/lib/shared/vload_impl.ll  |  49 -----------------
>>  generic/lib/shared/vstore.cl      |  58 +-------------------
>>  generic/lib/shared/vstore_if.ll   |  59 ---------------------
>>  generic/lib/shared/vstore_impl.ll |  50 ------------------
>>  r600/lib/SOURCES                  |   6 +++
>>  r600/lib/shared/vload.cl          |  99 ++++++++++++++++++++++++++++++++++
>>  r600/lib/shared/vload_if.ll       |  60 +++++++++++++++++++++
>>  r600/lib/shared/vload_impl.ll     |  44 ++++++++++++++++
>>  r600/lib/shared/vstore.cl         | 108 ++++++++++++++++++++++++++++++++++++++
>>  r600/lib/shared/vstore_if.ll      |  59 +++++++++++++++++++++
>>  r600/lib/shared/vstore_impl.ll    |  45 ++++++++++++++++
>>  14 files changed, 425 insertions(+), 330 deletions(-)
>>  delete mode 100644 generic/lib/shared/vload_if.ll
>>  delete mode 100644 generic/lib/shared/vload_impl.ll
>>  delete mode 100644 generic/lib/shared/vstore_if.ll
>>  delete mode 100644 generic/lib/shared/vstore_impl.ll
>>  create mode 100644 r600/lib/shared/vload.cl
>>  create mode 100644 r600/lib/shared/vload_if.ll
>>  create mode 100644 r600/lib/shared/vload_impl.ll
>>  create mode 100644 r600/lib/shared/vstore.cl
>>  create mode 100644 r600/lib/shared/vstore_if.ll
>>  create mode 100644 r600/lib/shared/vstore_impl.ll
>>
>> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
>> index 8cda14a..50cc9bd 100644
>> --- a/generic/lib/SOURCES
>> +++ b/generic/lib/SOURCES
>> @@ -24,10 +24,6 @@ shared/clamp.cl
>>  shared/max.cl
>>  shared/min.cl
>>  shared/vload.cl
>> -shared/vload_if.ll
>> -shared/vload_impl.ll
>>  shared/vstore.cl
>> -shared/vstore_if.ll
>> -shared/vstore_impl.ll
>>  workitem/get_global_id.cl
>>  workitem/get_global_size.cl
>> diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
>> index f6ebd37..e8439e7 100644
>> --- a/generic/lib/shared/vload.cl
>> +++ b/generic/lib/shared/vload.cl
>> @@ -27,12 +27,13 @@
>>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
>>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
>>
>> -//int/uint are special... see below
>>  #define VLOAD_TYPES() \
>>      VLOAD_ADDR_SPACES(char) \
>>      VLOAD_ADDR_SPACES(uchar) \
>>      VLOAD_ADDR_SPACES(short) \
>>      VLOAD_ADDR_SPACES(ushort) \
>> +    VLOAD_ADDR_SPACES(int) \
>> +    VLOAD_ADDR_SPACES(uint) \
>>      VLOAD_ADDR_SPACES(long) \
>>      VLOAD_ADDR_SPACES(ulong) \
>>      VLOAD_ADDR_SPACES(float) \
>> @@ -43,54 +44,3 @@ VLOAD_TYPES()
>>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>>      VLOAD_ADDR_SPACES(double)
>>  #endif
>> -
>> -VLOAD_VECTORIZE(int, __private)
>> -VLOAD_VECTORIZE(int, __local)
>> -VLOAD_VECTORIZE(int, __constant)
>> -VLOAD_VECTORIZE(uint, __private)
>> -VLOAD_VECTORIZE(uint, __local)
>> -VLOAD_VECTORIZE(uint, __constant)
>> -
>> -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
>> -  return (int2)(x[offset] , x[offset+1]);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
>> -  return (int3)(vload2(offset, x), x[offset+2]);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
>> -  return (uint2)(x[offset] , x[offset+1]);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
>> -  return (uint3)(vload2(offset, x), x[offset+2]);
>> -}
>> -
>> -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
>> - * they aren't actually overridden here
>> - */
>> -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
>> -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
>> -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
>> -
>> -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
>> -  return __clc_vload4_int__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
>> -  return __clc_vload8_int__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
>> -  return __clc_vload16_int__global(offset, x);
>> -}
>> -
>> -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
>> -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
>> -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
>> -
>> -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
>> -  return __clc_vload4_uint__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
>> -  return __clc_vload8_uint__global(offset, x);
>> -}
>> -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
>> -  return __clc_vload16_uint__global(offset, x);
>> -}
>> \ No newline at end of file
>> diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
>> deleted file mode 100644
>> index 2634d37..0000000
>> --- a/generic/lib/shared/vload_if.ll
>> +++ /dev/null
>> @@ -1,60 +0,0 @@
>> -;Start int global vload
>> -
>> -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
>> -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
>> -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
>> -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
>> -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
>> -
>> -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
>> -  ret <2 x i32> %call
>> -}
>> -
>> -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
>> -  ret <3 x i32> %call
>> -}
>> -
>> -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
>> -  ret <4 x i32> %call
>> -}
>> -
>> -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
>> -  ret <8 x i32> %call
>> -}
>> -
>> -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
>> -  ret <16 x i32> %call
>> -}
>> -
>> -
>> -;Start uint global vload
>> -
>> -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
>> -  ret <2 x i32> %call
>> -}
>> -
>> -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
>> -  ret <3 x i32> %call
>> -}
>> -
>> -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
>> -  ret <4 x i32> %call
>> -}
>> -
>> -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
>> -  ret <8 x i32> %call
>> -}
>> -
>> -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
>> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
>> -  ret <16 x i32> %call
>> -}
>> diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
>> deleted file mode 100644
>> index ae719e0..0000000
>> --- a/generic/lib/shared/vload_impl.ll
>> +++ /dev/null
>> @@ -1,49 +0,0 @@
>> -; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
>> -
>> -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
>> -  %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <2 x i32> %4
>> -}
>> -
>> -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
>> -  %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <3 x i32> %4
>> -}
>> -
>> -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
>> -  %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <4 x i32> %4
>> -}
>> -
>> -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
>> -  %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <8 x i32> %4
>> -}
>> -
>> -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
>> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
>> -  %2 = add i32 %1, %offset
>> -  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
>> -  %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
>> -  ret <16 x i32> %4
>> -}
>> -
>> -!1 = metadata !{metadata !"char", metadata !5}
>> -!2 = metadata !{metadata !"short", metadata !5}
>> -!3 = metadata !{metadata !"int", metadata !5}
>> -!4 = metadata !{metadata !"long", metadata !5}
>> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
>> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
>> -

<snipped the vstore implementation because of message size limits>

>> 1.8.1.2
>>
>>
>> _______________________________________________
>> Libclc-dev mailing list
>> Libclc-dev at pcc.me.uk
>> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev