[Libclc-dev] Endless loop with git versions of mesa, libclc and llvm 3.9.0

Jan Vesely via Libclc-dev libclc-dev at lists.llvm.org
Fri Sep 16 11:08:21 PDT 2016


On Fri, 2016-09-16 at 13:11 -0400, Tom Stellard via Libclc-dev wrote:
> On Thu, Sep 15, 2016 at 10:25:56AM +0200, Ricardo Ribalda Delgado via
> Libclc-dev wrote:
> > 
> > Hi
> > 
> > I am evaluating libclc as a replacement for fglrx opencl library. I
> > have a pre souther island board:
> > 
> 
> Unfortunately, pre-Southern Islands is not very well supported.

Depending on LLVM version (and luck) you can get 98% piglit pass rate
(not that it means much), but the status on llvm-3.9 is probably
'mostly broken'.

> 
> > 
> > 00:01.0 VGA compatible controller: Advanced Micro Devices, Inc.
> > [AMD/ATI] Wrestler [Radeon HD 6320]
> > 
> > Using the git version of mesa and libclc in combination with llvm
> > 3.9.0 I get into an endless loop when I run clpeak.
> > 
> > Any pointers about how to debug it furhter?
> 
> Try to reduce the test case as much as you can.  If you have the ISA
> dumps, I can try to take a look at them.

I think this has been previously reported here: https://bugs.freedeskto
p.org/show_bug.cgi?id=96296

Jan

> 
> -Tom
> 
> > 
> > Thanks
> > 
> > root at qt5022-open:~# CLOVER_DEBUG=clc,llvm,native
> > CLOVER_DEBUG_FILE=clover.dbg clpeak
> > 
> > Platform: Clover
> >   Device: AMD PALM (DRM 2.45.0 / 4.7.0-qtec-standard, LLVM 3.9.0)
> >     Driver version  : 12.1.0-devel (Linux x64)
> >     Compute units   : 2
> >     Clock frequency : 0 MHz
> > 
> > 
> > root at qt5022-open:~# cat clover.dbg.cl
> > // Options:  -cl-mad-enable
> > 
> > #undef FETCH_2
> > #undef FETCH_8
> > 
> > #define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum
> > +=
> > A[id]; id += jumpBy;
> > #define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
> > FETCH_2(sum, id, A, jumpBy);
> > #define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
> > FETCH_4(sum, id, A, jumpBy);
> > 
> > 
> > #define FETCH_PER_WI 16
> >  __kernel void global_bandwidth_v1_local_offset(__global float *A,
> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
> > *
> > FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = sum; } __kernel void
> > global_bandwidth_v2_local_offset(__global float2 *A, __global float
> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
> > +
> > get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> > global_bandwidth_v4_local_offset(__global float4 *A, __global float
> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
> > +
> > get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> > __kernel void global_bandwidth_v8_local_offset(__global float8 *A,
> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
> > *
> > FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id,
> > A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
> > (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
> > global_bandwidth_v16_local_offset(__global float16 *A, __global
> > float
> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
> > +
> > get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t
> > =
> > (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
> > (sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB)
> > +
> > (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t;
> > }
> > __kernel void global_bandwidth_v1_global_offset(__global float *A,
> > __global float *B) { int id = get_global_id(0); float sum = 0;
> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> > get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
> > global_bandwidth_v2_global_offset(__global float2 *A, __global
> > float
> > *B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id,
> > A,
> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> > global_bandwidth_v4_global_offset(__global float4 *A, __global
> > float
> > *B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id,
> > A,
> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> > __kernel void global_bandwidth_v8_global_offset(__global float8 *A,
> > __global float *B) { int id = get_global_id(0); float8 sum = 0;
> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> > get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
> > (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
> > __kernel void global_bandwidth_v16_global_offset(__global float16
> > *A,
> > __global float *B) { int id = get_global_id(0); float16 sum = 0;
> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> > get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
> > (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8)
> > +
> > (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
> > (sum.SF); B[get_global_id(0)] = t; }
> > #undef MAD_4
> > #undef MAD_16
> > #undef MAD_64
> > 
> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
> > x,
> > y); y = mad(x, y, x);
> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> > MAD_4(x, y);
> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> > MAD_16(x, y);
> >  __kernel void compute_sp_v1(__global float *ptr, float _A) { float
> > x
> > = _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
> > float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
> > (float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
> > float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4
> > y
> > = (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
> > __kernel
> > void compute_sp_v8(__global float *ptr, float _A) { float8 x =
> > (float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
> > (_A+7));
> > float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1)
> > +
> > (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel
> > void
> > compute_sp_v16(__global float *ptr, float _A) { float16 x =
> > (float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
> > (_A+7),
> > (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14),
> > (_A+15));
> > float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) +
> > (y.SAB) +
> > (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
> > #if defined(cl_khr_fp64)
> >  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> >  #define DOUBLE_AVAILABLE
> > #elif defined(cl_amd_fp64)
> >  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
> >  #define DOUBLE_AVAILABLE
> > #endif
> > #undef MAD_4
> > #undef MAD_16
> > #undef MAD_64
> > 
> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
> > x,
> > y); y = mad(x, y, x);
> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> > MAD_4(x, y);
> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> > MAD_16(x, y);
> > 
> > 
> > #ifdef DOUBLE_AVAILABLE
> >  __kernel void compute_dp_v1(__global double *ptr, double _A) {
> > double
> > x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); ptr[get_global_id(0)] = y; } __kernel void
> > compute_dp_v2(__global
> > double *ptr, double _A) { double2 x = (double2)(_A, (_A+1));
> > double2 y
> > = (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double
> > *ptr,
> > double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
> > double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
> > (y.S3); } __kernel void compute_dp_v8(__global double *ptr, double
> > _A)
> > { double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> > (_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) +
> > (y.S7);
> > } __kernel void compute_dp_v16(__global double *ptr, double _A) {
> > double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> > (_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
> > (_A+14), (_A+15)); double16 y = (double16)get_local_id(0);
> > MAD_64(x,
> > y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67)
> > +
> > (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0
> > +
> > t.S1; }
> > 
> > #endif
> > 
> > #undef MAD_4
> > #undef MAD_16
> > #undef MAD_64
> > 
> > #define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y
> > = (x*y) + x;
> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> > MAD_4(x, y);
> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> > MAD_16(x, y);
> >  __kernel void compute_integer_v1(__global int *ptr, int _A) { int
> > x =
> > _A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = y; } __kernel void
> > compute_integer_v2(__global
> > int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
> > (int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int
> > *ptr,
> > int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
> > (int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
> > __kernel
> > void compute_integer_v8(__global int *ptr, int _A) { int8 x =
> > (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
> > int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2)
> > +
> > (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
> > compute_integer_v16(__global int *ptr, int _A) { int16 x =
> > (int16)(_A,
> > (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
> > (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
> > int16 y
> > = (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t =
> > (y.S01)
> > + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) +
> > (y.SEF);
> > ptr[get_global_id(0)] = t.S0 + t.S1; }// Options:  -cl-mad-enable
> > 
> > #undef FETCH_2
> > #undef FETCH_8
> > 
> > #define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum
> > +=
> > A[id]; id += jumpBy;
> > #define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
> > FETCH_2(sum, id, A, jumpBy);
> > #define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
> > FETCH_4(sum, id, A, jumpBy);
> > 
> > 
> > #define FETCH_PER_WI 16
> >  __kernel void global_bandwidth_v1_local_offset(__global float *A,
> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
> > *
> > FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = sum; } __kernel void
> > global_bandwidth_v2_local_offset(__global float2 *A, __global float
> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
> > +
> > get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> > global_bandwidth_v4_local_offset(__global float4 *A, __global float
> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
> > +
> > get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> > __kernel void global_bandwidth_v8_local_offset(__global float8 *A,
> > __global float *B) { int id = (get_group_id(0) * get_local_size(0)
> > *
> > FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id,
> > A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
> > (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
> > global_bandwidth_v16_local_offset(__global float16 *A, __global
> > float
> > *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI)
> > +
> > get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
> > get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t
> > =
> > (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
> > (sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB)
> > +
> > (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t;
> > }
> > __kernel void global_bandwidth_v1_global_offset(__global float *A,
> > __global float *B) { int id = get_global_id(0); float sum = 0;
> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> > get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
> > global_bandwidth_v2_global_offset(__global float2 *A, __global
> > float
> > *B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id,
> > A,
> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> > global_bandwidth_v4_global_offset(__global float4 *A, __global
> > float
> > *B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id,
> > A,
> > get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> > B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> > __kernel void global_bandwidth_v8_global_offset(__global float8 *A,
> > __global float *B) { int id = get_global_id(0); float8 sum = 0;
> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> > get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
> > (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
> > __kernel void global_bandwidth_v16_global_offset(__global float16
> > *A,
> > __global float *B) { int id = get_global_id(0); float16 sum = 0;
> > FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> > get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
> > (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8)
> > +
> > (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
> > (sum.SF); B[get_global_id(0)] = t; }
> > #undef MAD_4
> > #undef MAD_16
> > #undef MAD_64
> > 
> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
> > x,
> > y); y = mad(x, y, x);
> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> > MAD_4(x, y);
> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> > MAD_16(x, y);
> >  __kernel void compute_sp_v1(__global float *ptr, float _A) { float
> > x
> > = _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
> > float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
> > (float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
> > float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4
> > y
> > = (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
> > __kernel
> > void compute_sp_v8(__global float *ptr, float _A) { float8 x =
> > (float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
> > (_A+7));
> > float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1)
> > +
> > (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel
> > void
> > compute_sp_v16(__global float *ptr, float _A) { float16 x =
> > (float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6),
> > (_A+7),
> > (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14),
> > (_A+15));
> > float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) +
> > (y.SAB) +
> > (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
> > #if defined(cl_khr_fp64)
> >  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> >  #define DOUBLE_AVAILABLE
> > #elif defined(cl_amd_fp64)
> >  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
> >  #define DOUBLE_AVAILABLE
> > #endif
> > #undef MAD_4
> > #undef MAD_16
> > #undef MAD_64
> > 
> > #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y,
> > x,
> > y); y = mad(x, y, x);
> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> > MAD_4(x, y);
> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> > MAD_16(x, y);
> > 
> > 
> > #ifdef DOUBLE_AVAILABLE
> >  __kernel void compute_dp_v1(__global double *ptr, double _A) {
> > double
> > x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); ptr[get_global_id(0)] = y; } __kernel void
> > compute_dp_v2(__global
> > double *ptr, double _A) { double2 x = (double2)(_A, (_A+1));
> > double2 y
> > = (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double
> > *ptr,
> > double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
> > double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
> > (y.S3); } __kernel void compute_dp_v8(__global double *ptr, double
> > _A)
> > { double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> > (_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) +
> > (y.S7);
> > } __kernel void compute_dp_v16(__global double *ptr, double _A) {
> > double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> > (_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
> > (_A+14), (_A+15)); double16 y = (double16)get_local_id(0);
> > MAD_64(x,
> > y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67)
> > +
> > (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0
> > +
> > t.S1; }
> > 
> > #endif
> > 
> > #undef MAD_4
> > #undef MAD_16
> > #undef MAD_64
> > 
> > #define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y
> > = (x*y) + x;
> > #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> > MAD_4(x, y);
> > #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> > MAD_16(x, y);
> >  __kernel void compute_integer_v1(__global int *ptr, int _A) { int
> > x =
> > _A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = y; } __kernel void
> > compute_integer_v2(__global
> > int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
> > (int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> > (y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int
> > *ptr,
> > int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
> > (int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> > y);
> > ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); }
> > __kernel
> > void compute_integer_v8(__global int *ptr, int _A) { int8 x =
> > (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
> > int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> > MAD_64(x,
> > y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2)
> > +
> > (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
> > compute_integer_v16(__global int *ptr, int _A) { int16 x =
> > (int16)(_A,
> > (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
> > (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
> > int16 y
> > = (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t =
> > (y.S01)
> > + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) +
> > (y.SEF);
> > ptr[get_global_id(0)] = t.S0 + t.S1; }
> > 
> > clover.dbg.ll in:
> > 
> > http://ge.tt/9cZmpQe2
> > 
> > Thanks for your help!
> > 
> > 
> > 
> > -- 
> > Ricardo Ribalda
> > _______________________________________________
> > Libclc-dev mailing list
> > Libclc-dev at lists.llvm.org
> > http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part
URL: <http://lists.llvm.org/pipermail/libclc-dev/attachments/20160916/bba326cd/attachment-0001.sig>


More information about the Libclc-dev mailing list