[Libclc-dev] Endless loop with git versions of mesa, libclc and llvm 3.9.0

Ricardo Ribalda Delgado via Libclc-dev libclc-dev at lists.llvm.org
Wed Apr 19 03:50:54 PDT 2017


Hi Jan


I have updated my installation to: kernel 4.10, mesa 17, llvm 4.0.1
and master git of llvm

I do not get more gpu hangs, but now cpu hangs, that can be recovered
via ctrl+c.

I do not know if it is the right protocol, but I have updated the
ticket that you previously mention.

Thanks!

On Fri, Sep 16, 2016 at 8:08 PM, Jan Vesely <jan.vesely at rutgers.edu> wrote:
> On Fri, 2016-09-16 at 13:11 -0400, Tom Stellard via Libclc-dev wrote:
>> On Thu, Sep 15, 2016 at 10:25:56AM +0200, Ricardo Ribalda Delgado via
>> Libclc-dev wrote:
>> >
>> > Hi
>> >
>> > I am evaluating libclc as a replacement for fglrx opencl library. I
>> > have a pre souther island board:
>> >
>>
>> Unfortunately, pre-Southern Islands is not very well supported.
>
> Depending on LLVM version (and luck) you can get 98% piglit pass rate
> (not that it means much), but the status on llvm-3.9 is probably
> 'mostly broken'.
>
>>
>> >
>> > 00:01.0 VGA compatible controller: Advanced Micro Devices, Inc.
>> > [AMD/ATI] Wrestler [Radeon HD 6320]
>> >
>> > Using the git version of mesa and libclc in combination with llvm
>> > 3.9.0 I get into an endless loop when I run clpeak.
>> >
>> > Any pointers about how to debug it furhter?
>>
>> Try to reduce the test case as much as you can.  If you have the ISA
>> dumps, I can try to take a look at them.
>
> I think this has been previously reported here: https://bugs.freedeskto
> p.org/show_bug.cgi?id=96296
>
> Jan
>
>>
>> -Tom
>>
>> >
>> > Thanks
>> >
>> > root at qt5022-open:~# CLOVER_DEBUG=clc,llvm,native
>> > CLOVER_DEBUG_FILE=clover.dbg clpeak
>> >
>> > Platform: Clover
>> >   Device: AMD PALM (DRM 2.45.0 / 4.7.0-qtec-standard, LLVM 3.9.0)
>> >     Driver version  : 12.1.0-devel (Linux x64)
>> >     Compute units   : 2
>> >     Clock frequency : 0 MHz
>> >
>> >
>> > root at qt5022-open:~# cat clover.dbg.cl
>> > // Options:  -cl-mad-enable
>> >
>> > #undef FETCH_2
>> > #undef FETCH_8
>> >
>> > #define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum
>> > +=
>> > A[id]; id += jumpBy;
>> > #define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
>> > FETCH_2(sum, id, A, jumpBy);
>> > #define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
>> > FETCH_4(sum, id, A, jumpBy);
>> >
>> >
>> > #define FETCH_PER_WI 16
>> >  __kernel void global_bandwidth_v1_local_offset(__global float *A,
>> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
>> > *
>> > FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = sum; } __kernel void
>> > global_bandwidth_v2_local_offset(__global float2 *A, __global float
>> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
>> > +
>> > get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
>> > global_bandwidth_v4_local_offset(__global float4 *A, __global float
>> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
>> > +
>> > get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
>> > __kernel void global_bandwidth_v8_local_offset(__global float8 *A,
>> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
>> > *
>> > FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id,
>> > A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
>> > (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
>> > global_bandwidth_v16_local_offset(__global float16 *A, __global
>> > float
>> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
>> > +
>> > get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t
>> > =
>> > (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
>> > (sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB)
>> > +
>> > (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t;
>> > }
>> > __kernel void global_bandwidth_v1_global_offset(__global float *A,
>> > __global float *B) { int id = get_global_id(0); float sum = 0;
>> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
>> > get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
>> > global_bandwidth_v2_global_offset(__global float2 *A, __global
>> > float
>> > *B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id,
>> > A,
>> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
>> > global_bandwidth_v4_global_offset(__global float4 *A, __global
>> > float
>> > *B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id,
>> > A,
>> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
>> > __kernel void global_bandwidth_v8_global_offset(__global float8 *A,
>> > __global float *B) { int id = get_global_id(0); float8 sum = 0;
>> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
>> > get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
>> > (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
>> > __kernel void global_bandwidth_v16_global_offset(__global float16
>> > *A,
>> > __global float *B) { int id = get_global_id(0); float16 sum = 0;
>> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
>> > get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
>> > (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8)
>> > +
>> > (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
>> > (sum.SF); B[get_global_id(0)] = t; }
>> > #undef MAD_4
>> > #undef MAD_16
>> > #undef MAD_64
>> >
>> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
>> > x,
>> > y); y = mad(x, y, x);
>> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
>> > MAD_4(x, y);
>> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>> > MAD_16(x, y);
>> >  __kernel void compute_sp_v1(__global float *ptr, float _A) { float
>> > x
>> > = _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
>> > float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
>> > (float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
>> > float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4
>> > y
>> > = (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
>> > __kernel
>> > void compute_sp_v8(__global float *ptr, float _A) { float8 x =
>> > (float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
>> > (_A+7));
>> > float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1)
>> > +
>> > (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel
>> > void
>> > compute_sp_v16(__global float *ptr, float _A) { float16 x =
>> > (float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
>> > (_A+7),
>> > (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14),
>> > (_A+15));
>> > float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) +
>> > (y.SAB) +
>> > (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
>> > #if defined(cl_khr_fp64)
>> >  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>> >  #define DOUBLE_AVAILABLE
>> > #elif defined(cl_amd_fp64)
>> >  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
>> >  #define DOUBLE_AVAILABLE
>> > #endif
>> > #undef MAD_4
>> > #undef MAD_16
>> > #undef MAD_64
>> >
>> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
>> > x,
>> > y); y = mad(x, y, x);
>> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
>> > MAD_4(x, y);
>> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>> > MAD_16(x, y);
>> >
>> >
>> > #ifdef DOUBLE_AVAILABLE
>> >  __kernel void compute_dp_v1(__global double *ptr, double _A) {
>> > double
>> > x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); ptr[get_global_id(0)] = y; } __kernel void
>> > compute_dp_v2(__global
>> > double *ptr, double _A) { double2 x = (double2)(_A, (_A+1));
>> > double2 y
>> > = (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double
>> > *ptr,
>> > double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
>> > double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
>> > (y.S3); } __kernel void compute_dp_v8(__global double *ptr, double
>> > _A)
>> > { double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
>> > (_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) +
>> > (y.S7);
>> > } __kernel void compute_dp_v16(__global double *ptr, double _A) {
>> > double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
>> > (_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
>> > (_A+14), (_A+15)); double16 y = (double16)get_local_id(0);
>> > MAD_64(x,
>> > y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67)
>> > +
>> > (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0
>> > +
>> > t.S1; }
>> >
>> > #endif
>> >
>> > #undef MAD_4
>> > #undef MAD_16
>> > #undef MAD_64
>> >
>> > #define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y
>> > = (x*y) + x;
>> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
>> > MAD_4(x, y);
>> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>> > MAD_16(x, y);
>> >  __kernel void compute_integer_v1(__global int *ptr, int _A) { int
>> > x =
>> > _A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = y; } __kernel void
>> > compute_integer_v2(__global
>> > int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
>> > (int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int
>> > *ptr,
>> > int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
>> > (int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
>> > __kernel
>> > void compute_integer_v8(__global int *ptr, int _A) { int8 x =
>> > (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
>> > int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2)
>> > +
>> > (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
>> > compute_integer_v16(__global int *ptr, int _A) { int16 x =
>> > (int16)(_A,
>> > (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
>> > (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
>> > int16 y
>> > = (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t =
>> > (y.S01)
>> > + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) +
>> > (y.SEF);
>> > ptr[get_global_id(0)] = t.S0 + t.S1; }// Options:  -cl-mad-enable
>> >
>> > #undef FETCH_2
>> > #undef FETCH_8
>> >
>> > #define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum
>> > +=
>> > A[id]; id += jumpBy;
>> > #define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
>> > FETCH_2(sum, id, A, jumpBy);
>> > #define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
>> > FETCH_4(sum, id, A, jumpBy);
>> >
>> >
>> > #define FETCH_PER_WI 16
>> >  __kernel void global_bandwidth_v1_local_offset(__global float *A,
>> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
>> > *
>> > FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = sum; } __kernel void
>> > global_bandwidth_v2_local_offset(__global float2 *A, __global float
>> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
>> > +
>> > get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
>> > global_bandwidth_v4_local_offset(__global float4 *A, __global float
>> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
>> > +
>> > get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
>> > __kernel void global_bandwidth_v8_local_offset(__global float8 *A,
>> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
>> > *
>> > FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id,
>> > A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
>> > (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
>> > global_bandwidth_v16_local_offset(__global float16 *A, __global
>> > float
>> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
>> > +
>> > get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
>> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t
>> > =
>> > (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
>> > (sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB)
>> > +
>> > (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t;
>> > }
>> > __kernel void global_bandwidth_v1_global_offset(__global float *A,
>> > __global float *B) { int id = get_global_id(0); float sum = 0;
>> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
>> > get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
>> > global_bandwidth_v2_global_offset(__global float2 *A, __global
>> > float
>> > *B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id,
>> > A,
>> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
>> > global_bandwidth_v4_global_offset(__global float4 *A, __global
>> > float
>> > *B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id,
>> > A,
>> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
>> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
>> > __kernel void global_bandwidth_v8_global_offset(__global float8 *A,
>> > __global float *B) { int id = get_global_id(0); float8 sum = 0;
>> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
>> > get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
>> > (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
>> > __kernel void global_bandwidth_v16_global_offset(__global float16
>> > *A,
>> > __global float *B) { int id = get_global_id(0); float16 sum = 0;
>> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
>> > get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
>> > (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8)
>> > +
>> > (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
>> > (sum.SF); B[get_global_id(0)] = t; }
>> > #undef MAD_4
>> > #undef MAD_16
>> > #undef MAD_64
>> >
>> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
>> > x,
>> > y); y = mad(x, y, x);
>> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
>> > MAD_4(x, y);
>> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>> > MAD_16(x, y);
>> >  __kernel void compute_sp_v1(__global float *ptr, float _A) { float
>> > x
>> > = _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
>> > float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
>> > (float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
>> > float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4
>> > y
>> > = (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
>> > __kernel
>> > void compute_sp_v8(__global float *ptr, float _A) { float8 x =
>> > (float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
>> > (_A+7));
>> > float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1)
>> > +
>> > (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel
>> > void
>> > compute_sp_v16(__global float *ptr, float _A) { float16 x =
>> > (float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
>> > (_A+7),
>> > (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14),
>> > (_A+15));
>> > float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) +
>> > (y.SAB) +
>> > (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
>> > #if defined(cl_khr_fp64)
>> >  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>> >  #define DOUBLE_AVAILABLE
>> > #elif defined(cl_amd_fp64)
>> >  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
>> >  #define DOUBLE_AVAILABLE
>> > #endif
>> > #undef MAD_4
>> > #undef MAD_16
>> > #undef MAD_64
>> >
>> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
>> > x,
>> > y); y = mad(x, y, x);
>> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
>> > MAD_4(x, y);
>> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>> > MAD_16(x, y);
>> >
>> >
>> > #ifdef DOUBLE_AVAILABLE
>> >  __kernel void compute_dp_v1(__global double *ptr, double _A) {
>> > double
>> > x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); ptr[get_global_id(0)] = y; } __kernel void
>> > compute_dp_v2(__global
>> > double *ptr, double _A) { double2 x = (double2)(_A, (_A+1));
>> > double2 y
>> > = (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double
>> > *ptr,
>> > double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
>> > double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
>> > (y.S3); } __kernel void compute_dp_v8(__global double *ptr, double
>> > _A)
>> > { double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
>> > (_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) +
>> > (y.S7);
>> > } __kernel void compute_dp_v16(__global double *ptr, double _A) {
>> > double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
>> > (_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
>> > (_A+14), (_A+15)); double16 y = (double16)get_local_id(0);
>> > MAD_64(x,
>> > y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67)
>> > +
>> > (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0
>> > +
>> > t.S1; }
>> >
>> > #endif
>> >
>> > #undef MAD_4
>> > #undef MAD_16
>> > #undef MAD_64
>> >
>> > #define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y
>> > = (x*y) + x;
>> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
>> > MAD_4(x, y);
>> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>> > MAD_16(x, y);
>> >  __kernel void compute_integer_v1(__global int *ptr, int _A) { int
>> > x =
>> > _A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = y; } __kernel void
>> > compute_integer_v2(__global
>> > int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
>> > (int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
>> > (y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int
>> > *ptr,
>> > int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
>> > (int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
>> > y);
>> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
>> > __kernel
>> > void compute_integer_v8(__global int *ptr, int _A) { int8 x =
>> > (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
>> > int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
>> > MAD_64(x,
>> > y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2)
>> > +
>> > (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
>> > compute_integer_v16(__global int *ptr, int _A) { int16 x =
>> > (int16)(_A,
>> > (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
>> > (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
>> > int16 y
>> > = (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t =
>> > (y.S01)
>> > + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) +
>> > (y.SEF);
>> > ptr[get_global_id(0)] = t.S0 + t.S1; }
>> >
>> > clover.dbg.ll in:
>> >
>> > http://ge.tt/9cZmpQe2
>> >
>> > Thanks for your help!
>> >
>> >
>> >
>> > --
>> > Ricardo Ribalda
>> > _______________________________________________
>> > Libclc-dev mailing list
>> > Libclc-dev at lists.llvm.org
>> > http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev
>> _______________________________________________
>> Libclc-dev mailing list
>> Libclc-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev



-- 
Ricardo Ribalda


More information about the Libclc-dev mailing list